duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -37,7 +37,9 @@ PhysicalUngroupedAggregate::PhysicalUngroupedAggregate(vector<LogicalType> types
37
37
  //===--------------------------------------------------------------------===//
38
38
  struct AggregateState {
39
39
  explicit AggregateState(const vector<unique_ptr<Expression>> &aggregate_expressions) {
40
- for (auto &aggregate : aggregate_expressions) {
40
+ counts = make_uniq_array<atomic<idx_t>>(aggregate_expressions.size());
41
+ for (idx_t i = 0; i < aggregate_expressions.size(); i++) {
42
+ auto &aggregate = aggregate_expressions[i];
41
43
  D_ASSERT(aggregate->GetExpressionClass() == ExpressionClass::BOUND_AGGREGATE);
42
44
  auto &aggr = aggregate->Cast<BoundAggregateExpression>();
43
45
  auto state = make_unsafe_uniq_array<data_t>(aggr.function.state_size());
@@ -46,7 +48,7 @@ struct AggregateState {
46
48
  bind_data.push_back(aggr.bind_info.get());
47
49
  destructors.push_back(aggr.function.destructor);
48
50
  #ifdef DEBUG
49
- counts.push_back(0);
51
+ counts[i] = 0;
50
52
  #endif
51
53
  }
52
54
  }
@@ -77,12 +79,12 @@ struct AggregateState {
77
79
  //! The destructors
78
80
  vector<aggregate_destructor_t> destructors;
79
81
  //! Counts (used for verification)
80
- vector<idx_t> counts;
82
+ unique_array<atomic<idx_t>> counts;
81
83
  };
82
84
 
83
- class UngroupedAggregateGlobalState : public GlobalSinkState {
85
+ class UngroupedAggregateGlobalSinkState : public GlobalSinkState {
84
86
  public:
85
- UngroupedAggregateGlobalState(const PhysicalUngroupedAggregate &op, ClientContext &client)
87
+ UngroupedAggregateGlobalSinkState(const PhysicalUngroupedAggregate &op, ClientContext &client)
86
88
  : state(op.aggregates), finished(false), allocator(BufferAllocator::Get(client)) {
87
89
  if (op.distinct_data) {
88
90
  distinct_state = make_uniq<DistinctAggregateState>(*op.distinct_data, client);
@@ -101,13 +103,13 @@ public:
101
103
  ArenaAllocator allocator;
102
104
  };
103
105
 
104
- class UngroupedAggregateLocalState : public LocalSinkState {
106
+ class UngroupedAggregateLocalSinkState : public LocalSinkState {
105
107
  public:
106
- UngroupedAggregateLocalState(const PhysicalUngroupedAggregate &op, const vector<LogicalType> &child_types,
107
- GlobalSinkState &gstate_p, ExecutionContext &context)
108
+ UngroupedAggregateLocalSinkState(const PhysicalUngroupedAggregate &op, const vector<LogicalType> &child_types,
109
+ GlobalSinkState &gstate_p, ExecutionContext &context)
108
110
  : allocator(BufferAllocator::Get(context.client)), state(op.aggregates), child_executor(context.client),
109
111
  aggregate_input_chunk(), filter_set() {
110
- auto &gstate = gstate_p.Cast<UngroupedAggregateGlobalState>();
112
+ auto &gstate = gstate_p.Cast<UngroupedAggregateGlobalSinkState>();
111
113
 
112
114
  auto &allocator = BufferAllocator::Get(context.client);
113
115
  InitializeDistinctAggregates(op, gstate, context);
@@ -147,8 +149,8 @@ public:
147
149
  void Reset() {
148
150
  aggregate_input_chunk.Reset();
149
151
  }
150
- void InitializeDistinctAggregates(const PhysicalUngroupedAggregate &op, const UngroupedAggregateGlobalState &gstate,
151
- ExecutionContext &context) {
152
+ void InitializeDistinctAggregates(const PhysicalUngroupedAggregate &op,
153
+ const UngroupedAggregateGlobalSinkState &gstate, ExecutionContext &context) {
152
154
 
153
155
  if (!op.distinct_data) {
154
156
  return;
@@ -185,19 +187,19 @@ bool PhysicalUngroupedAggregate::SinkOrderDependent() const {
185
187
  }
186
188
 
187
189
  unique_ptr<GlobalSinkState> PhysicalUngroupedAggregate::GetGlobalSinkState(ClientContext &context) const {
188
- return make_uniq<UngroupedAggregateGlobalState>(*this, context);
190
+ return make_uniq<UngroupedAggregateGlobalSinkState>(*this, context);
189
191
  }
190
192
 
191
193
  unique_ptr<LocalSinkState> PhysicalUngroupedAggregate::GetLocalSinkState(ExecutionContext &context) const {
192
194
  D_ASSERT(sink_state);
193
195
  auto &gstate = *sink_state;
194
- return make_uniq<UngroupedAggregateLocalState>(*this, children[0]->GetTypes(), gstate, context);
196
+ return make_uniq<UngroupedAggregateLocalSinkState>(*this, children[0]->GetTypes(), gstate, context);
195
197
  }
196
198
 
197
199
  void PhysicalUngroupedAggregate::SinkDistinct(ExecutionContext &context, DataChunk &chunk,
198
200
  OperatorSinkInput &input) const {
199
- auto &sink = input.local_state.Cast<UngroupedAggregateLocalState>();
200
- auto &global_sink = input.global_state.Cast<UngroupedAggregateGlobalState>();
201
+ auto &sink = input.local_state.Cast<UngroupedAggregateLocalSinkState>();
202
+ auto &global_sink = input.global_state.Cast<UngroupedAggregateGlobalSinkState>();
201
203
  D_ASSERT(distinct_data);
202
204
  auto &distinct_state = *global_sink.distinct_state;
203
205
  auto &distinct_info = *distinct_collection_info;
@@ -239,7 +241,7 @@ void PhysicalUngroupedAggregate::SinkDistinct(ExecutionContext &context, DataChu
239
241
 
240
242
  SinkResultType PhysicalUngroupedAggregate::Sink(ExecutionContext &context, DataChunk &chunk,
241
243
  OperatorSinkInput &input) const {
242
- auto &sink = input.local_state.Cast<UngroupedAggregateLocalState>();
244
+ auto &sink = input.local_state.Cast<UngroupedAggregateLocalSinkState>();
243
245
 
244
246
  // perform the aggregation inside the local state
245
247
  sink.Reset();
@@ -296,23 +298,22 @@ SinkResultType PhysicalUngroupedAggregate::Sink(ExecutionContext &context, DataC
296
298
  }
297
299
 
298
300
  //===--------------------------------------------------------------------===//
299
- // Finalize
301
+ // Combine
300
302
  //===--------------------------------------------------------------------===//
301
-
302
303
  void PhysicalUngroupedAggregate::CombineDistinct(ExecutionContext &context, OperatorSinkCombineInput &input) const {
303
- auto &global_sink = input.global_state.Cast<UngroupedAggregateGlobalState>();
304
- auto &source = input.local_state.Cast<UngroupedAggregateLocalState>();
304
+ auto &gstate = input.global_state.Cast<UngroupedAggregateGlobalSinkState>();
305
+ auto &lstate = input.local_state.Cast<UngroupedAggregateLocalSinkState>();
305
306
 
306
307
  if (!distinct_data) {
307
308
  return;
308
309
  }
309
- auto &distinct_state = global_sink.distinct_state;
310
+ auto &distinct_state = gstate.distinct_state;
310
311
  auto table_count = distinct_data->radix_tables.size();
311
312
  for (idx_t table_idx = 0; table_idx < table_count; table_idx++) {
312
313
  D_ASSERT(distinct_data->radix_tables[table_idx]);
313
314
  auto &radix_table = *distinct_data->radix_tables[table_idx];
314
315
  auto &radix_global_sink = *distinct_state->radix_states[table_idx];
315
- auto &radix_local_sink = *source.radix_states[table_idx];
316
+ auto &radix_local_sink = *lstate.radix_states[table_idx];
316
317
 
317
318
  radix_table.Combine(context, radix_global_sink, radix_local_sink);
318
319
  }
@@ -320,18 +321,17 @@ void PhysicalUngroupedAggregate::CombineDistinct(ExecutionContext &context, Oper
320
321
 
321
322
  SinkCombineResultType PhysicalUngroupedAggregate::Combine(ExecutionContext &context,
322
323
  OperatorSinkCombineInput &input) const {
323
- auto &gstate = input.global_state.Cast<UngroupedAggregateGlobalState>();
324
- auto &source = input.local_state.Cast<UngroupedAggregateLocalState>();
324
+ auto &gstate = input.global_state.Cast<UngroupedAggregateGlobalSinkState>();
325
+ auto &lstate = input.local_state.Cast<UngroupedAggregateLocalSinkState>();
325
326
  D_ASSERT(!gstate.finished);
326
327
 
327
328
  // finalize: combine the local state into the global state
328
329
  // all aggregates are combinable: we might be doing a parallel aggregate
329
330
  // use the combine method to combine the partial aggregates
330
- lock_guard<mutex> glock(gstate.lock);
331
-
332
- OperatorSinkCombineInput distinct_input {gstate, source, input.interrupt_state};
331
+ OperatorSinkCombineInput distinct_input {gstate, lstate, input.interrupt_state};
333
332
  CombineDistinct(context, distinct_input);
334
333
 
334
+ lock_guard<mutex> glock(gstate.lock);
335
335
  for (idx_t aggr_idx = 0; aggr_idx < aggregates.size(); aggr_idx++) {
336
336
  auto &aggregate = aggregates[aggr_idx]->Cast<BoundAggregateExpression>();
337
337
 
@@ -339,207 +339,236 @@ SinkCombineResultType PhysicalUngroupedAggregate::Combine(ExecutionContext &cont
339
339
  continue;
340
340
  }
341
341
 
342
- Vector source_state(Value::POINTER(CastPointerToValue(source.state.aggregates[aggr_idx].get())));
342
+ Vector source_state(Value::POINTER(CastPointerToValue(lstate.state.aggregates[aggr_idx].get())));
343
343
  Vector dest_state(Value::POINTER(CastPointerToValue(gstate.state.aggregates[aggr_idx].get())));
344
344
 
345
345
  AggregateInputData aggr_input_data(aggregate.bind_info.get(), gstate.allocator);
346
346
  aggregate.function.combine(source_state, dest_state, aggr_input_data, 1);
347
347
  #ifdef DEBUG
348
- gstate.state.counts[aggr_idx] += source.state.counts[aggr_idx];
348
+ gstate.state.counts[aggr_idx] += lstate.state.counts[aggr_idx];
349
349
  #endif
350
350
  }
351
- source.allocator.Destroy();
351
+ lstate.allocator.Destroy();
352
352
 
353
353
  auto &client_profiler = QueryProfiler::Get(context.client);
354
- context.thread.profiler.Flush(*this, source.child_executor, "child_executor", 0);
354
+ context.thread.profiler.Flush(*this, lstate.child_executor, "child_executor", 0);
355
355
  client_profiler.Flush(context.thread.profiler);
356
356
 
357
357
  return SinkCombineResultType::FINISHED;
358
358
  }
359
359
 
360
- class UngroupedDistinctAggregateFinalizeTask : public ExecutorTask {
360
+ //===--------------------------------------------------------------------===//
361
+ // Finalize
362
+ //===--------------------------------------------------------------------===//
363
+ class UngroupedDistinctAggregateFinalizeEvent : public BasePipelineEvent {
361
364
  public:
362
- UngroupedDistinctAggregateFinalizeTask(Executor &executor, shared_ptr<Event> event_p,
363
- UngroupedAggregateGlobalState &state_p, ClientContext &context,
364
- const PhysicalUngroupedAggregate &op)
365
- : ExecutorTask(executor), event(std::move(event_p)), gstate(state_p), context(context), op(op),
366
- allocator(BufferAllocator::Get(context)) {
365
+ UngroupedDistinctAggregateFinalizeEvent(ClientContext &context, const PhysicalUngroupedAggregate &op_p,
366
+ UngroupedAggregateGlobalSinkState &gstate_p, Pipeline &pipeline_p)
367
+ : BasePipelineEvent(pipeline_p), context(context), op(op_p), gstate(gstate_p), tasks_scheduled(0),
368
+ tasks_done(0) {
367
369
  }
368
370
 
369
- void AggregateDistinct() {
370
- D_ASSERT(gstate.distinct_state);
371
- auto &aggregates = op.aggregates;
372
- auto &distinct_state = *gstate.distinct_state;
373
- auto &distinct_data = *op.distinct_data;
374
-
375
- ThreadContext temp_thread_context(context);
376
- ExecutionContext temp_exec_context(context, temp_thread_context, nullptr);
377
-
378
- idx_t payload_idx = 0;
379
- idx_t next_payload_idx = 0;
371
+ public:
372
+ void Schedule() override;
380
373
 
381
- for (idx_t agg_idx = 0; agg_idx < aggregates.size(); agg_idx++) {
382
- auto &aggregate = aggregates[agg_idx]->Cast<BoundAggregateExpression>();
374
+ private:
375
+ ClientContext &context;
383
376
 
384
- // Forward the payload idx
385
- payload_idx = next_payload_idx;
386
- next_payload_idx = payload_idx + aggregate.children.size();
377
+ const PhysicalUngroupedAggregate &op;
378
+ UngroupedAggregateGlobalSinkState &gstate;
387
379
 
388
- // If aggregate is not distinct, skip it
389
- if (!distinct_data.IsDistinct(agg_idx)) {
390
- continue;
391
- }
380
+ public:
381
+ mutex lock;
382
+ idx_t tasks_scheduled;
383
+ idx_t tasks_done;
392
384
 
393
- DataChunk payload_chunk;
394
-
395
- D_ASSERT(distinct_data.info.table_map.count(agg_idx));
396
- auto table_idx = distinct_data.info.table_map.at(agg_idx);
397
- auto &radix_table_p = distinct_data.radix_tables[table_idx];
398
- auto &output_chunk = *distinct_state.distinct_output_chunks[table_idx];
399
- auto &grouped_aggregate_data = *distinct_data.grouped_aggregate_data[table_idx];
400
-
401
- payload_chunk.InitializeEmpty(grouped_aggregate_data.group_types);
402
- payload_chunk.SetCardinality(0);
403
-
404
- //! Create global and local state for the hashtable
405
- auto global_source_state = radix_table_p->GetGlobalSourceState(context);
406
- auto local_source_state = radix_table_p->GetLocalSourceState(temp_exec_context);
407
-
408
- //! Retrieve the stored data from the hashtable
409
- while (true) {
410
- output_chunk.Reset();
411
-
412
- InterruptState interrupt_state;
413
- OperatorSourceInput source_input {*global_source_state, *local_source_state, interrupt_state};
414
- auto res = radix_table_p->GetData(temp_exec_context, output_chunk,
415
- *distinct_state.radix_states[table_idx], source_input);
416
- if (res == SourceResultType::FINISHED) {
417
- D_ASSERT(output_chunk.size() == 0);
418
- break;
419
- } else if (res == SourceResultType::BLOCKED) {
420
- throw InternalException(
421
- "Unexpected interrupt from radix table GetData in UngroupedDistinctAggregateFinalizeTask");
422
- }
423
-
424
- // We dont need to resolve the filter, we already did this in Sink
425
- idx_t payload_cnt = aggregate.children.size();
426
- for (idx_t i = 0; i < payload_cnt; i++) {
427
- payload_chunk.data[i].Reference(output_chunk.data[i]);
428
- }
429
- payload_chunk.SetCardinality(output_chunk);
430
- #ifdef DEBUG
431
- gstate.state.counts[agg_idx] += payload_chunk.size();
432
- #endif
385
+ vector<unique_ptr<GlobalSourceState>> global_source_states;
386
+ };
433
387
 
434
- auto start_of_input = payload_cnt ? &payload_chunk.data[0] : nullptr;
435
- //! Update the aggregate state
436
- AggregateInputData aggr_input_data(aggregate.bind_info.get(), allocator);
437
- aggregate.function.simple_update(start_of_input, aggr_input_data, payload_cnt,
438
- gstate.state.aggregates[agg_idx].get(), payload_chunk.size());
439
- }
440
- }
441
- D_ASSERT(!gstate.finished);
442
- gstate.finished = true;
388
+ class UngroupedDistinctAggregateFinalizeTask : public ExecutorTask {
389
+ public:
390
+ UngroupedDistinctAggregateFinalizeTask(Executor &executor, shared_ptr<Event> event_p,
391
+ const PhysicalUngroupedAggregate &op,
392
+ UngroupedAggregateGlobalSinkState &state_p)
393
+ : ExecutorTask(executor), event(std::move(event_p)), op(op), gstate(state_p),
394
+ allocator(BufferAllocator::Get(executor.context)) {
443
395
  }
444
396
 
445
- TaskExecutionResult ExecuteTask(TaskExecutionMode mode) override {
446
- AggregateDistinct();
447
- event->FinishTask();
448
- return TaskExecutionResult::TASK_FINISHED;
449
- }
397
+ TaskExecutionResult ExecuteTask(TaskExecutionMode mode) override;
398
+
399
+ private:
400
+ void AggregateDistinct();
450
401
 
451
402
  private:
452
403
  shared_ptr<Event> event;
453
- UngroupedAggregateGlobalState &gstate;
454
- ClientContext &context;
404
+
455
405
  const PhysicalUngroupedAggregate &op;
406
+ UngroupedAggregateGlobalSinkState &gstate;
407
+
456
408
  ArenaAllocator allocator;
457
409
  };
458
410
 
459
- // TODO: Create tasks and run these in parallel instead of doing this all in Schedule, single threaded
460
- class UngroupedDistinctAggregateFinalizeEvent : public BasePipelineEvent {
461
- public:
462
- UngroupedDistinctAggregateFinalizeEvent(const PhysicalUngroupedAggregate &op_p,
463
- UngroupedAggregateGlobalState &gstate_p, Pipeline &pipeline_p,
464
- ClientContext &context)
465
- : BasePipelineEvent(pipeline_p), op(op_p), gstate(gstate_p), context(context) {
466
- }
467
- const PhysicalUngroupedAggregate &op;
468
- UngroupedAggregateGlobalState &gstate;
469
- ClientContext &context;
411
+ void UngroupedDistinctAggregateFinalizeEvent::Schedule() {
412
+ D_ASSERT(gstate.distinct_state);
413
+ auto &aggregates = op.aggregates;
414
+ auto &distinct_data = *op.distinct_data;
470
415
 
471
- public:
472
- void Schedule() override {
473
- vector<shared_ptr<Task>> tasks;
474
- tasks.push_back(make_uniq<UngroupedDistinctAggregateFinalizeTask>(pipeline->executor, shared_from_this(),
475
- gstate, context, op));
476
- D_ASSERT(!tasks.empty());
477
- SetTasks(std::move(tasks));
416
+ idx_t payload_idx = 0;
417
+ idx_t next_payload_idx = 0;
418
+ for (idx_t agg_idx = 0; agg_idx < aggregates.size(); agg_idx++) {
419
+ auto &aggregate = aggregates[agg_idx]->Cast<BoundAggregateExpression>();
420
+
421
+ // Forward the payload idx
422
+ payload_idx = next_payload_idx;
423
+ next_payload_idx = payload_idx + aggregate.children.size();
424
+
425
+ // If aggregate is not distinct, skip it
426
+ if (!distinct_data.IsDistinct(agg_idx)) {
427
+ global_source_states.push_back(nullptr);
428
+ continue;
429
+ }
430
+ D_ASSERT(distinct_data.info.table_map.count(agg_idx));
431
+
432
+ // Create global state for scanning
433
+ auto table_idx = distinct_data.info.table_map.at(agg_idx);
434
+ auto &radix_table_p = *distinct_data.radix_tables[table_idx];
435
+ global_source_states.push_back(radix_table_p.GetGlobalSourceState(context));
478
436
  }
479
- };
480
437
 
481
- class UngroupedDistinctCombineFinalizeEvent : public BasePipelineEvent {
482
- public:
483
- UngroupedDistinctCombineFinalizeEvent(const PhysicalUngroupedAggregate &op_p,
484
- UngroupedAggregateGlobalState &gstate_p, Pipeline &pipeline_p,
485
- ClientContext &client)
486
- : BasePipelineEvent(pipeline_p), op(op_p), gstate(gstate_p), client(client) {
438
+ const idx_t n_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
439
+ vector<shared_ptr<Task>> tasks;
440
+ for (idx_t i = 0; i < n_threads; i++) {
441
+ tasks.push_back(
442
+ make_uniq<UngroupedDistinctAggregateFinalizeTask>(pipeline->executor, shared_from_this(), op, gstate));
443
+ tasks_scheduled++;
487
444
  }
445
+ SetTasks(std::move(tasks));
446
+ }
488
447
 
489
- const PhysicalUngroupedAggregate &op;
490
- UngroupedAggregateGlobalState &gstate;
491
- ClientContext &client;
448
+ TaskExecutionResult UngroupedDistinctAggregateFinalizeTask::ExecuteTask(TaskExecutionMode mode) {
449
+ AggregateDistinct();
450
+ event->FinishTask();
451
+ return TaskExecutionResult::TASK_FINISHED;
452
+ }
492
453
 
493
- public:
494
- void Schedule() override {
495
- auto &distinct_state = *gstate.distinct_state;
496
- auto &distinct_data = *op.distinct_data;
497
- vector<shared_ptr<Task>> tasks;
498
- for (idx_t table_idx = 0; table_idx < distinct_data.radix_tables.size(); table_idx++) {
499
- distinct_data.radix_tables[table_idx]->ScheduleTasks(pipeline->executor, shared_from_this(),
500
- *distinct_state.radix_states[table_idx], tasks);
454
+ void UngroupedDistinctAggregateFinalizeTask::AggregateDistinct() {
455
+ D_ASSERT(gstate.distinct_state);
456
+ auto &distinct_state = *gstate.distinct_state;
457
+ auto &distinct_data = *op.distinct_data;
458
+
459
+ // Create thread-local copy of aggregate state
460
+ auto &aggregates = op.aggregates;
461
+ AggregateState state(aggregates);
462
+
463
+ // Thread-local contexts
464
+ ThreadContext thread_context(executor.context);
465
+ ExecutionContext execution_context(executor.context, thread_context, nullptr);
466
+
467
+ auto &finalize_event = event->Cast<UngroupedDistinctAggregateFinalizeEvent>();
468
+
469
+ // Now loop through the distinct aggregates, scanning the distinct HTs
470
+ idx_t payload_idx = 0;
471
+ idx_t next_payload_idx = 0;
472
+ for (idx_t agg_idx = 0; agg_idx < aggregates.size(); agg_idx++) {
473
+ auto &aggregate = aggregates[agg_idx]->Cast<BoundAggregateExpression>();
474
+
475
+ // Forward the payload idx
476
+ payload_idx = next_payload_idx;
477
+ next_payload_idx = payload_idx + aggregate.children.size();
478
+
479
+ // If aggregate is not distinct, skip it
480
+ if (!distinct_data.IsDistinct(agg_idx)) {
481
+ continue;
482
+ }
483
+
484
+ const auto table_idx = distinct_data.info.table_map.at(agg_idx);
485
+ auto &radix_table = *distinct_data.radix_tables[table_idx];
486
+ auto lstate = radix_table.GetLocalSourceState(execution_context);
487
+
488
+ auto &sink = *distinct_state.radix_states[table_idx];
489
+ InterruptState interrupt_state;
490
+ OperatorSourceInput source_input {*finalize_event.global_source_states[agg_idx], *lstate, interrupt_state};
491
+
492
+ DataChunk output_chunk;
493
+ output_chunk.Initialize(executor.context, distinct_state.distinct_output_chunks[table_idx]->GetTypes());
494
+
495
+ DataChunk payload_chunk;
496
+ payload_chunk.InitializeEmpty(distinct_data.grouped_aggregate_data[table_idx]->group_types);
497
+ payload_chunk.SetCardinality(0);
498
+
499
+ AggregateInputData aggr_input_data(aggregate.bind_info.get(), allocator);
500
+ while (true) {
501
+ output_chunk.Reset();
502
+
503
+ auto res = radix_table.GetData(execution_context, output_chunk, sink, source_input);
504
+ if (res == SourceResultType::FINISHED) {
505
+ D_ASSERT(output_chunk.size() == 0);
506
+ break;
507
+ } else if (res == SourceResultType::BLOCKED) {
508
+ throw InternalException(
509
+ "Unexpected interrupt from radix table GetData in UngroupedDistinctAggregateFinalizeTask");
510
+ }
511
+
512
+ // We dont need to resolve the filter, we already did this in Sink
513
+ idx_t payload_cnt = aggregate.children.size();
514
+ for (idx_t i = 0; i < payload_cnt; i++) {
515
+ payload_chunk.data[i].Reference(output_chunk.data[i]);
516
+ }
517
+ payload_chunk.SetCardinality(output_chunk);
518
+
519
+ #ifdef DEBUG
520
+ gstate.state.counts[agg_idx] += payload_chunk.size();
521
+ #endif
522
+
523
+ // Update the aggregate state
524
+ auto start_of_input = payload_cnt ? &payload_chunk.data[0] : nullptr;
525
+ aggregate.function.simple_update(start_of_input, aggr_input_data, payload_cnt,
526
+ state.aggregates[agg_idx].get(), payload_chunk.size());
501
527
  }
502
- D_ASSERT(!tasks.empty());
503
- SetTasks(std::move(tasks));
504
528
  }
505
529
 
506
- void FinishEvent() override {
507
- //! Now that all tables are combined, it's time to do the distinct aggregations
508
- auto new_event = make_shared<UngroupedDistinctAggregateFinalizeEvent>(op, gstate, *pipeline, client);
509
- this->InsertEvent(std::move(new_event));
530
+ // After scanning the distinct HTs, we can combine the thread-local agg states with the thread-global
531
+ lock_guard<mutex> guard(finalize_event.lock);
532
+ payload_idx = 0;
533
+ next_payload_idx = 0;
534
+ for (idx_t agg_idx = 0; agg_idx < aggregates.size(); agg_idx++) {
535
+ if (!distinct_data.IsDistinct(agg_idx)) {
536
+ continue;
537
+ }
538
+
539
+ auto &aggregate = aggregates[agg_idx]->Cast<BoundAggregateExpression>();
540
+ AggregateInputData aggr_input_data(aggregate.bind_info.get(), allocator);
541
+
542
+ Vector state_vec(Value::POINTER(CastPointerToValue(state.aggregates[agg_idx].get())));
543
+ Vector combined_vec(Value::POINTER(CastPointerToValue(gstate.state.aggregates[agg_idx].get())));
544
+ aggregate.function.combine(state_vec, combined_vec, aggr_input_data, 1);
510
545
  }
511
- };
546
+
547
+ D_ASSERT(!gstate.finished);
548
+ if (++finalize_event.tasks_done == finalize_event.tasks_scheduled) {
549
+ gstate.finished = true;
550
+ }
551
+ }
512
552
 
513
553
  SinkFinalizeType PhysicalUngroupedAggregate::FinalizeDistinct(Pipeline &pipeline, Event &event, ClientContext &context,
514
554
  GlobalSinkState &gstate_p) const {
515
- auto &gstate = gstate_p.Cast<UngroupedAggregateGlobalState>();
555
+ auto &gstate = gstate_p.Cast<UngroupedAggregateGlobalSinkState>();
516
556
  D_ASSERT(distinct_data);
517
557
  auto &distinct_state = *gstate.distinct_state;
518
558
 
519
- bool any_partitioned = false;
520
559
  for (idx_t table_idx = 0; table_idx < distinct_data->radix_tables.size(); table_idx++) {
521
560
  auto &radix_table_p = distinct_data->radix_tables[table_idx];
522
561
  auto &radix_state = *distinct_state.radix_states[table_idx];
523
- bool partitioned = radix_table_p->Finalize(context, radix_state);
524
- if (partitioned) {
525
- any_partitioned = true;
526
- }
527
- }
528
- if (any_partitioned) {
529
- auto new_event = make_shared<UngroupedDistinctCombineFinalizeEvent>(*this, gstate, pipeline, context);
530
- event.InsertEvent(std::move(new_event));
531
- } else {
532
- //! Hashtables aren't partitioned, they dont need to be joined first
533
- //! So we can compute the aggregate already
534
- auto new_event = make_shared<UngroupedDistinctAggregateFinalizeEvent>(*this, gstate, pipeline, context);
535
- event.InsertEvent(std::move(new_event));
562
+ radix_table_p->Finalize(context, radix_state);
536
563
  }
564
+ auto new_event = make_shared<UngroupedDistinctAggregateFinalizeEvent>(context, *this, gstate, pipeline);
565
+ event.InsertEvent(std::move(new_event));
537
566
  return SinkFinalizeType::READY;
538
567
  }
539
568
 
540
569
  SinkFinalizeType PhysicalUngroupedAggregate::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
541
570
  OperatorSinkFinalizeInput &input) const {
542
- auto &gstate = input.global_state.Cast<UngroupedAggregateGlobalState>();
571
+ auto &gstate = input.global_state.Cast<UngroupedAggregateGlobalSinkState>();
543
572
 
544
573
  if (distinct_data) {
545
574
  return FinalizeDistinct(pipeline, event, context, input.global_state);
@@ -569,7 +598,7 @@ void VerifyNullHandling(DataChunk &chunk, AggregateState &state, const vector<un
569
598
 
570
599
  SourceResultType PhysicalUngroupedAggregate::GetData(ExecutionContext &context, DataChunk &chunk,
571
600
  OperatorSourceInput &input) const {
572
- auto &gstate = sink_state->Cast<UngroupedAggregateGlobalState>();
601
+ auto &gstate = sink_state->Cast<UngroupedAggregateGlobalSinkState>();
573
602
  D_ASSERT(gstate.finished);
574
603
 
575
604
  // initialize the result chunk with the aggregate values
@@ -14,7 +14,6 @@
14
14
  #include "duckdb/common/vector_operations/vector_operations.hpp"
15
15
  #include "duckdb/common/windows_undefs.hpp"
16
16
  #include "duckdb/execution/expression_executor.hpp"
17
- #include "duckdb/execution/partitionable_hashtable.hpp"
18
17
  #include "duckdb/execution/window_executor.hpp"
19
18
  #include "duckdb/execution/window_segment_tree.hpp"
20
19
  #include "duckdb/main/client_config.hpp"
@@ -222,6 +221,7 @@ WindowGlobalSourceState::WindowGlobalSourceState(ClientContext &context_p, Windo
222
221
  }
223
222
  } else {
224
223
  built.resize(hash_groups.size());
224
+ idx_t batch_base = 0;
225
225
  for (auto &hash_group : hash_groups) {
226
226
  if (!hash_group) {
227
227
  continue;
@@ -235,6 +235,9 @@ WindowGlobalSourceState::WindowGlobalSourceState(ClientContext &context_p, Windo
235
235
  auto &sb = *global_sort_state.sorted_blocks[0];
236
236
  auto &sd = *sb.payload_data;
237
237
  tasks_remaining += sd.data_blocks.size();
238
+
239
+ hash_group->batch_base = batch_base;
240
+ batch_base += sd.data_blocks.size();
238
241
  }
239
242
  }
240
243
  }
@@ -436,6 +439,7 @@ public:
436
439
  using ReadStates = vector<ReadStatePtr>;
437
440
 
438
441
  explicit WindowLocalSourceState(WindowGlobalSourceState &gsource);
442
+ void UpdateBatchIndex();
439
443
  bool NextPartition();
440
444
  void Scan(DataChunk &chunk);
441
445
 
@@ -443,6 +447,8 @@ public:
443
447
  WindowGlobalSourceState &gsource;
444
448
  //! The current bin being processed
445
449
  idx_t hash_bin;
450
+ //! The current batch index (for output reordering)
451
+ idx_t batch_index;
446
452
  //! The current source being processed
447
453
  optional_ptr<WindowPartitionSourceState> partition_source;
448
454
  //! The read cursor
@@ -456,7 +462,7 @@ public:
456
462
  };
457
463
 
458
464
  WindowLocalSourceState::WindowLocalSourceState(WindowGlobalSourceState &gsource)
459
- : gsource(gsource), hash_bin(gsource.built.size()) {
465
+ : gsource(gsource), hash_bin(gsource.built.size()), batch_index(0) {
460
466
  auto &gsink = *gsource.gsink.global_partition;
461
467
  auto &op = gsource.gsink.op;
462
468
 
@@ -564,6 +570,14 @@ WindowGlobalSourceState::Task WindowGlobalSourceState::NextTask(idx_t hash_bin)
564
570
  return Task();
565
571
  }
566
572
 
573
+ void WindowLocalSourceState::UpdateBatchIndex() {
574
+ D_ASSERT(partition_source);
575
+ D_ASSERT(scanner.get());
576
+
577
+ batch_index = partition_source->hash_group ? partition_source->hash_group->batch_base : 0;
578
+ batch_index += scanner->BlockIndex();
579
+ }
580
+
567
581
  bool WindowLocalSourceState::NextPartition() {
568
582
  // Release old states before the source
569
583
  scanner.reset();
@@ -578,6 +592,7 @@ bool WindowLocalSourceState::NextPartition() {
578
592
  partition_source = task.first;
579
593
  scanner = std::move(task.second);
580
594
  hash_bin = partition_source->hash_bin;
595
+ UpdateBatchIndex();
581
596
  }
582
597
 
583
598
  for (auto &wexec : partition_source->executors) {
@@ -599,6 +614,8 @@ void WindowLocalSourceState::Scan(DataChunk &result) {
599
614
  read_states.clear();
600
615
  return;
601
616
  }
617
+
618
+ UpdateBatchIndex();
602
619
  }
603
620
 
604
621
  const auto position = scanner->Scanned();
@@ -638,6 +655,23 @@ unique_ptr<GlobalSourceState> PhysicalWindow::GetGlobalSourceState(ClientContext
638
655
  return make_uniq<WindowGlobalSourceState>(context, gsink);
639
656
  }
640
657
 
658
+ bool PhysicalWindow::SupportsBatchIndex() const {
659
+ // We can only preserve order for single partitioning
660
+ // or work stealing causes out of order batch numbers
661
+ auto &wexpr = select_list[0]->Cast<BoundWindowExpression>();
662
+ return wexpr.partitions.empty() && !wexpr.orders.empty();
663
+ }
664
+
665
+ OrderPreservationType PhysicalWindow::SourceOrder() const {
666
+ return SupportsBatchIndex() ? OrderPreservationType::FIXED_ORDER : OrderPreservationType::NO_ORDER;
667
+ }
668
+
669
+ idx_t PhysicalWindow::GetBatchIndex(ExecutionContext &context, DataChunk &chunk, GlobalSourceState &gstate_p,
670
+ LocalSourceState &lstate_p) const {
671
+ auto &lstate = lstate_p.Cast<WindowLocalSourceState>();
672
+ return lstate.batch_index;
673
+ }
674
+
641
675
  SourceResultType PhysicalWindow::GetData(ExecutionContext &context, DataChunk &chunk,
642
676
  OperatorSourceInput &input) const {
643
677
  auto &lsource = input.local_state.Cast<WindowLocalSourceState>();