duckdb 0.7.2-dev2366.0 → 0.7.2-dev2430.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/package.json +1 -1
  2. package/src/duckdb/src/common/enums/physical_operator_type.cpp +2 -0
  3. package/src/duckdb/src/common/file_buffer.cpp +8 -0
  4. package/src/duckdb/src/common/radix_partitioning.cpp +34 -0
  5. package/src/duckdb/src/common/sort/partition_state.cpp +44 -124
  6. package/src/duckdb/src/common/sort/sorted_block.cpp +1 -1
  7. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +144 -31
  8. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +698 -0
  9. package/src/duckdb/src/execution/physical_plan/plan_asof_join.cpp +7 -1
  10. package/src/duckdb/src/function/scalar/list/list_sort.cpp +30 -56
  11. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  12. package/src/duckdb/src/include/duckdb/common/enums/debug_initialize.hpp +17 -0
  13. package/src/duckdb/src/include/duckdb/common/enums/order_type.hpp +8 -0
  14. package/src/duckdb/src/include/duckdb/common/enums/physical_operator_type.hpp +1 -0
  15. package/src/duckdb/src/include/duckdb/common/file_buffer.hpp +3 -0
  16. package/src/duckdb/src/include/duckdb/common/radix_partitioning.hpp +3 -0
  17. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +11 -60
  18. package/src/duckdb/src/include/duckdb/execution/operator/join/outer_join_marker.hpp +6 -1
  19. package/src/duckdb/src/include/duckdb/execution/operator/join/physical_asof_join.hpp +93 -0
  20. package/src/duckdb/src/include/duckdb/execution/physical_operator.hpp +1 -1
  21. package/src/duckdb/src/include/duckdb/main/client_config.hpp +2 -0
  22. package/src/duckdb/src/include/duckdb/main/config.hpp +7 -2
  23. package/src/duckdb/src/include/duckdb/main/settings.hpp +13 -3
  24. package/src/duckdb/src/include/duckdb/parser/expression/window_expression.hpp +4 -2
  25. package/src/duckdb/src/include/duckdb/storage/block_manager.hpp +1 -0
  26. package/src/duckdb/src/include/duckdb/storage/in_memory_block_manager.hpp +3 -0
  27. package/src/duckdb/src/include/duckdb/storage/partial_block_manager.hpp +2 -1
  28. package/src/duckdb/src/include/duckdb/storage/single_file_block_manager.hpp +11 -5
  29. package/src/duckdb/src/main/config.cpp +26 -0
  30. package/src/duckdb/src/main/settings/settings.cpp +31 -8
  31. package/src/duckdb/src/planner/binder/expression/bind_aggregate_expression.cpp +2 -5
  32. package/src/duckdb/src/planner/binder/expression/bind_window_expression.cpp +6 -14
  33. package/src/duckdb/src/planner/binder/query_node/bind_select_node.cpp +2 -5
  34. package/src/duckdb/src/storage/buffer/block_manager.cpp +1 -2
  35. package/src/duckdb/src/storage/meta_block_writer.cpp +4 -0
  36. package/src/duckdb/src/storage/partial_block_manager.cpp +11 -4
  37. package/src/duckdb/src/storage/single_file_block_manager.cpp +16 -9
  38. package/src/duckdb/src/storage/standard_buffer_manager.cpp +5 -2
  39. package/src/duckdb/src/storage/storage_manager.cpp +7 -2
  40. package/src/duckdb/src/storage/table/column_checkpoint_state.cpp +21 -1
  41. package/src/duckdb/ub_src_execution_operator_join.cpp +2 -0
@@ -0,0 +1,698 @@
1
+ #include "duckdb/execution/operator/join/physical_asof_join.hpp"
2
+
3
+ #include "duckdb/common/fast_mem.hpp"
4
+ #include "duckdb/common/operator/comparison_operators.hpp"
5
+ #include "duckdb/common/row_operations/row_operations.hpp"
6
+ #include "duckdb/common/sort/comparators.hpp"
7
+ #include "duckdb/common/sort/partition_state.hpp"
8
+ #include "duckdb/common/sort/sort.hpp"
9
+ #include "duckdb/common/vector_operations/vector_operations.hpp"
10
+ #include "duckdb/execution/expression_executor.hpp"
11
+ #include "duckdb/execution/operator/join/outer_join_marker.hpp"
12
+ #include "duckdb/main/client_context.hpp"
13
+ #include "duckdb/parallel/event.hpp"
14
+ #include "duckdb/parallel/thread_context.hpp"
15
+
16
+ namespace duckdb {
17
+
18
+ PhysicalAsOfJoin::PhysicalAsOfJoin(LogicalComparisonJoin &op, unique_ptr<PhysicalOperator> left,
19
+ unique_ptr<PhysicalOperator> right)
20
+ : PhysicalComparisonJoin(op, PhysicalOperatorType::ASOF_JOIN, std::move(op.conditions), op.join_type,
21
+ op.estimated_cardinality) {
22
+
23
+ // Convert the conditions partitions and sorts
24
+ for (auto &cond : conditions) {
25
+ D_ASSERT(cond.left->return_type == cond.right->return_type);
26
+ join_key_types.push_back(cond.left->return_type);
27
+
28
+ auto left = cond.left->Copy();
29
+ auto right = cond.right->Copy();
30
+ switch (cond.comparison) {
31
+ case ExpressionType::COMPARE_GREATERTHANOREQUALTO:
32
+ null_sensitive.emplace_back(lhs_orders.size());
33
+ lhs_orders.emplace_back(OrderType::ASCENDING, OrderByNullType::NULLS_LAST, std::move(left));
34
+ rhs_orders.emplace_back(OrderType::ASCENDING, OrderByNullType::NULLS_LAST, std::move(right));
35
+ break;
36
+ case ExpressionType::COMPARE_EQUAL:
37
+ null_sensitive.emplace_back(lhs_orders.size());
38
+ // Fall through
39
+ case ExpressionType::COMPARE_NOT_DISTINCT_FROM:
40
+ lhs_partitions.emplace_back(std::move(left));
41
+ rhs_partitions.emplace_back(std::move(right));
42
+ break;
43
+ default:
44
+ throw NotImplementedException("Unsupported join condition for ASOF join");
45
+ }
46
+ }
47
+ D_ASSERT(!lhs_orders.empty());
48
+ D_ASSERT(!rhs_orders.empty());
49
+
50
+ children.push_back(std::move(left));
51
+ children.push_back(std::move(right));
52
+
53
+ // Fill out the right projection map.
54
+ right_projection_map = op.right_projection_map;
55
+ if (right_projection_map.empty()) {
56
+ const auto right_count = children[1]->types.size();
57
+ right_projection_map.reserve(right_count);
58
+ for (column_t i = 0; i < right_count; ++i) {
59
+ right_projection_map.emplace_back(i);
60
+ }
61
+ }
62
+ }
63
+
64
+ //===--------------------------------------------------------------------===//
65
+ // Sink
66
+ //===--------------------------------------------------------------------===//
67
+ class AsOfGlobalSinkState : public GlobalSinkState {
68
+ public:
69
+ AsOfGlobalSinkState(ClientContext &context, const PhysicalAsOfJoin &op)
70
+ : global_partition(context, op.rhs_partitions, op.rhs_orders, op.children[1]->types, {},
71
+ op.estimated_cardinality),
72
+ is_outer(IsRightOuterJoin(op.join_type)), has_null(false) {
73
+ }
74
+
75
+ idx_t Count() const {
76
+ return global_partition.count;
77
+ }
78
+
79
+ PartitionGlobalSinkState global_partition;
80
+
81
+ // One per partition
82
+ const bool is_outer;
83
+ vector<OuterJoinMarker> right_outers;
84
+ bool has_null;
85
+ };
86
+
87
+ class AsOfLocalSinkState : public LocalSinkState {
88
+ public:
89
+ explicit AsOfLocalSinkState(ClientContext &context, PartitionGlobalSinkState &gstate_p)
90
+ : local_partition(context, gstate_p) {
91
+ }
92
+
93
+ void Sink(DataChunk &input_chunk) {
94
+ local_partition.Sink(input_chunk);
95
+ }
96
+
97
+ void Combine() {
98
+ local_partition.Combine();
99
+ }
100
+
101
+ PartitionLocalSinkState local_partition;
102
+ };
103
+
104
+ unique_ptr<GlobalSinkState> PhysicalAsOfJoin::GetGlobalSinkState(ClientContext &context) const {
105
+ return make_uniq<AsOfGlobalSinkState>(context, *this);
106
+ }
107
+
108
+ unique_ptr<LocalSinkState> PhysicalAsOfJoin::GetLocalSinkState(ExecutionContext &context) const {
109
+ // We only sink the RHS
110
+ auto &gsink = sink_state->Cast<AsOfGlobalSinkState>();
111
+ return make_uniq<AsOfLocalSinkState>(context.client, gsink.global_partition);
112
+ }
113
+
114
+ SinkResultType PhysicalAsOfJoin::Sink(ExecutionContext &context, GlobalSinkState &gstate_p, LocalSinkState &lstate_p,
115
+ DataChunk &input) const {
116
+ auto &lstate = lstate_p.Cast<AsOfLocalSinkState>();
117
+
118
+ lstate.Sink(input);
119
+
120
+ return SinkResultType::NEED_MORE_INPUT;
121
+ }
122
+
123
+ void PhysicalAsOfJoin::Combine(ExecutionContext &context, GlobalSinkState &gstate_p, LocalSinkState &lstate_p) const {
124
+ auto &lstate = lstate_p.Cast<AsOfLocalSinkState>();
125
+ lstate.Combine();
126
+ }
127
+
128
+ //===--------------------------------------------------------------------===//
129
+ // Finalize
130
+ //===--------------------------------------------------------------------===//
131
+ SinkFinalizeType PhysicalAsOfJoin::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
132
+ GlobalSinkState &gstate_p) const {
133
+ auto &gstate = gstate_p.Cast<AsOfGlobalSinkState>();
134
+
135
+ // Find the first group to sort
136
+ auto &groups = gstate.global_partition.grouping_data->GetPartitions();
137
+ if (groups.empty() && EmptyResultIfRHSIsEmpty()) {
138
+ // Empty input!
139
+ return SinkFinalizeType::NO_OUTPUT_POSSIBLE;
140
+ }
141
+
142
+ // Schedule all the sorts for maximum thread utilisation
143
+ auto new_event = make_shared<PartitionMergeEvent>(gstate.global_partition, pipeline);
144
+ event.InsertEvent(std::move(new_event));
145
+
146
+ return SinkFinalizeType::READY;
147
+ }
148
+
149
+ //===--------------------------------------------------------------------===//
150
+ // Operator
151
+ //===--------------------------------------------------------------------===//
152
+ class AsOfGlobalState : public GlobalOperatorState {
153
+ public:
154
+ explicit AsOfGlobalState(AsOfGlobalSinkState &gsink) {
155
+ // for FULL/RIGHT OUTER JOIN, initialize right_outers to false for every tuple
156
+ auto &global_partition = gsink.global_partition;
157
+ auto &right_outers = gsink.right_outers;
158
+ right_outers.reserve(global_partition.hash_groups.size());
159
+ for (const auto &hash_group : global_partition.hash_groups) {
160
+ right_outers.emplace_back(OuterJoinMarker(gsink.is_outer));
161
+ right_outers.back().Initialize(hash_group->count);
162
+ }
163
+ }
164
+ };
165
+
166
+ unique_ptr<GlobalOperatorState> PhysicalAsOfJoin::GetGlobalOperatorState(ClientContext &context) const {
167
+ auto &gsink = sink_state->Cast<AsOfGlobalSinkState>();
168
+ return make_uniq<AsOfGlobalState>(gsink);
169
+ }
170
+
171
+ class AsOfLocalState : public CachingOperatorState {
172
+ public:
173
+ using Orders = vector<BoundOrderByNode>;
174
+ using Match = std::pair<hash_t, idx_t>;
175
+
176
+ AsOfLocalState(ClientContext &context, const PhysicalAsOfJoin &op, bool force_external);
177
+
178
+ public:
179
+ void ResolveJoin(DataChunk &input, bool *found_matches, Match *matches = nullptr);
180
+
181
+ void ResolveJoinKeys(DataChunk &input);
182
+
183
+ ClientContext &context;
184
+ Allocator &allocator;
185
+ const PhysicalAsOfJoin &op;
186
+ BufferManager &buffer_manager;
187
+ const bool force_external;
188
+ Orders lhs_orders;
189
+
190
+ // LHS sorting
191
+ ExpressionExecutor lhs_executor;
192
+ DataChunk lhs_keys;
193
+ ValidityMask lhs_valid_mask;
194
+ SelectionVector lhs_sel;
195
+ idx_t lhs_valid;
196
+ RowLayout lhs_layout;
197
+ unique_ptr<GlobalSortState> lhs_global_state;
198
+ DataChunk lhs_sorted;
199
+
200
+ // LHS binning
201
+ Vector hash_vector;
202
+ Vector bin_vector;
203
+
204
+ // Output
205
+ idx_t lhs_match_count;
206
+ SelectionVector lhs_matched;
207
+ OuterJoinMarker left_outer;
208
+ bool fetch_next_left;
209
+ DataChunk group_payload;
210
+ DataChunk rhs_payload;
211
+ };
212
+
213
+ AsOfLocalState::AsOfLocalState(ClientContext &context, const PhysicalAsOfJoin &op, bool force_external)
214
+ : context(context), allocator(Allocator::Get(context)), op(op),
215
+ buffer_manager(BufferManager::GetBufferManager(context)), force_external(force_external), lhs_executor(context),
216
+ hash_vector(LogicalType::HASH), bin_vector(LogicalType::HASH), left_outer(IsLeftOuterJoin(op.join_type)),
217
+ fetch_next_left(true) {
218
+ vector<unique_ptr<BaseStatistics>> partition_stats;
219
+ Orders partitions; // Not used.
220
+ PartitionGlobalSinkState::GenerateOrderings(partitions, lhs_orders, op.lhs_partitions, op.lhs_orders,
221
+ partition_stats);
222
+
223
+ // We sort the row numbers of the incoming block, not the rows
224
+ lhs_layout.Initialize({LogicalType::UINTEGER});
225
+ lhs_sorted.Initialize(allocator, lhs_layout.GetTypes());
226
+
227
+ lhs_keys.Initialize(allocator, op.join_key_types);
228
+ for (const auto &cond : op.conditions) {
229
+ lhs_executor.AddExpression(*cond.left);
230
+ }
231
+
232
+ group_payload.Initialize(allocator, op.children[1]->types);
233
+ rhs_payload.Initialize(allocator, op.children[1]->types);
234
+
235
+ lhs_matched.Initialize();
236
+ lhs_sel.Initialize();
237
+ left_outer.Initialize(STANDARD_VECTOR_SIZE);
238
+ }
239
+
240
+ void AsOfLocalState::ResolveJoinKeys(DataChunk &input) {
241
+ // Compute the join keys
242
+ lhs_keys.Reset();
243
+ lhs_executor.Execute(input, lhs_keys);
244
+
245
+ // Extract the NULLs
246
+ const auto count = input.size();
247
+ lhs_valid_mask.Reset();
248
+ for (auto col_idx : op.null_sensitive) {
249
+ auto &col = lhs_keys.data[col_idx];
250
+ UnifiedVectorFormat unified;
251
+ col.ToUnifiedFormat(count, unified);
252
+ lhs_valid_mask.Combine(unified.validity, count);
253
+ }
254
+
255
+ // Convert the mask to a selection vector.
256
+ // We need this anyway for sorting
257
+ lhs_valid = 0;
258
+ const auto entry_count = lhs_valid_mask.EntryCount(count);
259
+ idx_t base_idx = 0;
260
+ for (idx_t entry_idx = 0; entry_idx < entry_count;) {
261
+ const auto validity_entry = lhs_valid_mask.GetValidityEntry(entry_idx++);
262
+ const auto next = MinValue<idx_t>(base_idx + ValidityMask::BITS_PER_VALUE, count);
263
+ if (ValidityMask::AllValid(validity_entry)) {
264
+ for (; base_idx < next; ++base_idx) {
265
+ lhs_sel.set_index(lhs_valid++, base_idx);
266
+ }
267
+ } else if (ValidityMask::NoneValid(validity_entry)) {
268
+ base_idx = next;
269
+ } else {
270
+ const auto start = base_idx;
271
+ for (; base_idx < next; ++base_idx) {
272
+ if (ValidityMask::RowIsValid(validity_entry, base_idx - start)) {
273
+ lhs_sel.set_index(lhs_valid++, base_idx);
274
+ }
275
+ }
276
+ }
277
+ }
278
+
279
+ // Slice the keys to the ones we can match
280
+ if (lhs_valid < count) {
281
+ lhs_keys.Slice(lhs_sel, lhs_valid);
282
+ }
283
+
284
+ // Hash to assign the partitions
285
+ auto &global_partition = op.sink_state->Cast<AsOfGlobalSinkState>().global_partition;
286
+ if (op.lhs_partitions.empty()) {
287
+ // Only one hash group
288
+ bin_vector.Reference(Value::HASH(0));
289
+ } else {
290
+ // Hash to determine the partitions.
291
+ VectorOperations::Hash(lhs_keys.data[0], hash_vector, lhs_sel, lhs_valid);
292
+ for (size_t prt_idx = 1; prt_idx < op.lhs_partitions.size(); ++prt_idx) {
293
+ VectorOperations::CombineHash(hash_vector, lhs_keys.data[prt_idx], lhs_sel, lhs_valid);
294
+ }
295
+
296
+ // Convert hashes to hash groups
297
+ const auto radix_bits = global_partition.grouping_data->GetRadixBits();
298
+ RadixPartitioning::HashesToBins(hash_vector, radix_bits, bin_vector, count);
299
+ }
300
+
301
+ // Sort the selection vector on the valid keys
302
+ lhs_global_state = make_uniq<GlobalSortState>(buffer_manager, lhs_orders, lhs_layout);
303
+ auto &global_state = *lhs_global_state;
304
+ LocalSortState local_sort;
305
+ local_sort.Initialize(*lhs_global_state, buffer_manager);
306
+
307
+ DataChunk payload_chunk;
308
+ payload_chunk.InitializeEmpty({LogicalType::UINTEGER});
309
+ FlatVector::SetData(payload_chunk.data[0], (data_ptr_t)lhs_sel.data());
310
+ payload_chunk.SetCardinality(lhs_valid);
311
+ local_sort.SinkChunk(lhs_keys, payload_chunk);
312
+
313
+ // Set external (can be forced with the PRAGMA)
314
+ global_state.external = force_external;
315
+ global_state.AddLocalState(local_sort);
316
+ global_state.PrepareMergePhase();
317
+ while (global_state.sorted_blocks.size() > 1) {
318
+ MergeSorter merge_sorter(*lhs_global_state, buffer_manager);
319
+ merge_sorter.PerformInMergeRound();
320
+ global_state.CompleteMergeRound();
321
+ }
322
+
323
+ // Scan the sorted selection
324
+ D_ASSERT(global_state.sorted_blocks.size() == 1);
325
+
326
+ auto scanner = make_uniq<PayloadScanner>(*global_state.sorted_blocks[0]->payload_data, global_state, false);
327
+ lhs_sorted.Reset();
328
+ scanner->Scan(lhs_sorted);
329
+ }
330
+
331
+ void AsOfLocalState::ResolveJoin(DataChunk &input, bool *found_match, std::pair<hash_t, idx_t> *matches) {
332
+ // Sort the input into lhs_payload, radix keys in lhs_global_state
333
+ ResolveJoinKeys(input);
334
+
335
+ auto &gsink = op.sink_state->Cast<AsOfGlobalSinkState>();
336
+ auto &global_partition = gsink.global_partition;
337
+
338
+ // The bins are contiguous from sorting, so load them one at a time
339
+ // But they may be constant, so unify.
340
+ UnifiedVectorFormat bin_unified;
341
+ bin_vector.ToUnifiedFormat(lhs_valid, bin_unified);
342
+ const auto bins = (hash_t *)bin_unified.data;
343
+
344
+ hash_t prev_bin = global_partition.bin_groups.size();
345
+ optional_ptr<PartitionGlobalHashGroup> hash_group;
346
+ optional_ptr<OuterJoinMarker> right_outer;
347
+ // Searching for right <= left
348
+ SBIterator left(*lhs_global_state, ExpressionType::COMPARE_LESSTHANOREQUALTO);
349
+ unique_ptr<SBIterator> right;
350
+ lhs_match_count = 0;
351
+ const auto sorted_sel = FlatVector::GetData<sel_t>(lhs_sorted.data[0]);
352
+ for (idx_t i = 0; i < lhs_valid; ++i) {
353
+ // idx is the index in the input; i is the index in the sorted keys
354
+ const auto idx = sorted_sel[i];
355
+ const auto curr_bin = bins[bin_unified.sel->get_index(idx)];
356
+ if (!hash_group || curr_bin != prev_bin) {
357
+ // Grab the next group
358
+ prev_bin = curr_bin;
359
+ const auto group_idx = global_partition.bin_groups[curr_bin];
360
+ if (group_idx >= global_partition.hash_groups.size()) {
361
+ // No matching partition
362
+ hash_group = nullptr;
363
+ right_outer = nullptr;
364
+ right.reset();
365
+ continue;
366
+ }
367
+ hash_group = global_partition.hash_groups[group_idx].get();
368
+ right_outer = gsink.right_outers.data() + group_idx;
369
+ right = make_uniq<SBIterator>(*(hash_group->global_sort), ExpressionType::COMPARE_LESSTHANOREQUALTO);
370
+ }
371
+ left.SetIndex(i);
372
+
373
+ // If right > left, then there is no match
374
+ if (!right->Compare(left)) {
375
+ continue;
376
+ }
377
+
378
+ // Exponential search forward for a non-matching value using radix iterators
379
+ // (We use exponential search to avoid thrashing the block manager on large probes)
380
+ idx_t bound = 1;
381
+ idx_t begin = right->GetIndex();
382
+ right->SetIndex(begin + bound);
383
+ while (right->GetIndex() < hash_group->count) {
384
+ if (right->Compare(left)) {
385
+ // If right <= left, jump ahead
386
+ bound *= 2;
387
+ right->SetIndex(begin + bound);
388
+ } else {
389
+ break;
390
+ }
391
+ }
392
+
393
+ // Binary search for the first non-matching value using radix iterators
394
+ // The previous value (which we know exists) is the match
395
+ auto first = begin + bound / 2;
396
+ auto last = MinValue<idx_t>(begin + bound, hash_group->count);
397
+ while (first < last) {
398
+ const auto mid = first + (last - first) / 2;
399
+ right->SetIndex(mid);
400
+ if (right->Compare(left)) {
401
+ // If right <= left, new lower bound
402
+ first = mid + 1;
403
+ } else {
404
+ last = mid;
405
+ }
406
+ }
407
+ right->SetIndex(--first);
408
+
409
+ // Check partitions for strict equality
410
+ if (!op.lhs_partitions.empty() && hash_group->ComparePartitions(left, *right)) {
411
+ continue;
412
+ }
413
+
414
+ // Emit match data
415
+ right_outer->SetMatch(first);
416
+ left_outer.SetMatch(idx);
417
+ if (found_match) {
418
+ found_match[idx] = true;
419
+ }
420
+ if (matches) {
421
+ matches[idx] = Match(curr_bin, first);
422
+ }
423
+ lhs_matched.set_index(lhs_match_count++, idx);
424
+ }
425
+ }
426
+
427
+ unique_ptr<OperatorState> PhysicalAsOfJoin::GetOperatorState(ExecutionContext &context) const {
428
+ auto &config = ClientConfig::GetConfig(context.client);
429
+ return make_uniq<AsOfLocalState>(context.client, *this, config.force_external);
430
+ }
431
+
432
+ void PhysicalAsOfJoin::ResolveSimpleJoin(ExecutionContext &context, DataChunk &input, DataChunk &chunk,
433
+ OperatorState &lstate_p) const {
434
+ auto &lstate = lstate_p.Cast<AsOfLocalState>();
435
+ auto &gsink = sink_state->Cast<AsOfGlobalSinkState>();
436
+
437
+ // perform the actual join
438
+ bool found_match[STANDARD_VECTOR_SIZE] = {false};
439
+ lstate.ResolveJoin(input, found_match);
440
+
441
+ // now construct the result based on the join result
442
+ switch (join_type) {
443
+ case JoinType::MARK: {
444
+ PhysicalJoin::ConstructMarkJoinResult(lstate.lhs_keys, input, chunk, found_match, gsink.has_null);
445
+ break;
446
+ }
447
+ case JoinType::SEMI:
448
+ PhysicalJoin::ConstructSemiJoinResult(input, chunk, found_match);
449
+ break;
450
+ case JoinType::ANTI:
451
+ PhysicalJoin::ConstructAntiJoinResult(input, chunk, found_match);
452
+ break;
453
+ default:
454
+ throw NotImplementedException("Unimplemented join type for AsOf join");
455
+ }
456
+ }
457
+
458
+ OperatorResultType PhysicalAsOfJoin::ResolveComplexJoin(ExecutionContext &context, DataChunk &input, DataChunk &chunk,
459
+ OperatorState &lstate_p) const {
460
+ auto &lstate = lstate_p.Cast<AsOfLocalState>();
461
+ auto &gsink = sink_state->Cast<AsOfGlobalSinkState>();
462
+
463
+ if (!lstate.fetch_next_left) {
464
+ lstate.fetch_next_left = true;
465
+ if (lstate.left_outer.Enabled()) {
466
+ // left join: before we move to the next chunk, see if we need to output any vectors that didn't
467
+ // have a match found
468
+ lstate.left_outer.ConstructLeftJoinResult(input, chunk);
469
+ lstate.left_outer.Reset();
470
+ }
471
+ return OperatorResultType::NEED_MORE_INPUT;
472
+ }
473
+
474
+ // perform the actual join
475
+ AsOfLocalState::Match matches[STANDARD_VECTOR_SIZE];
476
+ lstate.ResolveJoin(input, nullptr, matches);
477
+ lstate.group_payload.Reset();
478
+ lstate.rhs_payload.Reset();
479
+
480
+ auto &global_partition = gsink.global_partition;
481
+ hash_t scan_bin = global_partition.bin_groups.size();
482
+ optional_ptr<PartitionGlobalHashGroup> hash_group;
483
+ unique_ptr<PayloadScanner> scanner;
484
+ for (idx_t i = 0; i < lstate.lhs_match_count; ++i) {
485
+ const auto idx = lstate.lhs_matched[i];
486
+ const auto match_bin = matches[idx].first;
487
+ const auto match_pos = matches[idx].second;
488
+ if (match_bin != scan_bin) {
489
+ // Grab the next group
490
+ const auto group_idx = global_partition.bin_groups[match_bin];
491
+ hash_group = global_partition.hash_groups[group_idx].get();
492
+ scan_bin = match_bin;
493
+ scanner = make_uniq<PayloadScanner>(*hash_group->global_sort, false);
494
+ lstate.group_payload.Reset();
495
+ }
496
+ // Skip to the range containing the match
497
+ while (match_pos >= scanner->Scanned()) {
498
+ lstate.group_payload.Reset();
499
+ scanner->Scan(lstate.group_payload);
500
+ }
501
+ // Append the individual values
502
+ // TODO: Batch the copies
503
+ const auto source_offset = match_pos - (scanner->Scanned() - lstate.group_payload.size());
504
+ for (idx_t col_idx = 0; col_idx < right_projection_map.size(); ++col_idx) {
505
+ const auto rhs_idx = right_projection_map[col_idx];
506
+ auto &source = lstate.group_payload.data[rhs_idx];
507
+ auto &target = chunk.data[input.ColumnCount() + col_idx];
508
+ VectorOperations::Copy(source, target, source_offset + 1, source_offset, i);
509
+ }
510
+ }
511
+
512
+ // Slice the input into the left side
513
+ chunk.Slice(input, lstate.lhs_matched, lstate.lhs_match_count);
514
+
515
+ // If we are doing a left join, come back for the NULLs
516
+ if (lstate.left_outer.Enabled()) {
517
+ lstate.fetch_next_left = false;
518
+ return OperatorResultType::HAVE_MORE_OUTPUT;
519
+ }
520
+
521
+ return OperatorResultType::NEED_MORE_INPUT;
522
+ }
523
+
524
+ OperatorResultType PhysicalAsOfJoin::ExecuteInternal(ExecutionContext &context, DataChunk &input, DataChunk &chunk,
525
+ GlobalOperatorState &gstate, OperatorState &lstate) const {
526
+ auto &gsink = sink_state->Cast<AsOfGlobalSinkState>();
527
+
528
+ if (gsink.global_partition.count == 0) {
529
+ // empty RHS
530
+ if (!EmptyResultIfRHSIsEmpty()) {
531
+ ConstructEmptyJoinResult(join_type, gsink.has_null, input, chunk);
532
+ return OperatorResultType::NEED_MORE_INPUT;
533
+ } else {
534
+ return OperatorResultType::FINISHED;
535
+ }
536
+ }
537
+
538
+ input.Verify();
539
+ switch (join_type) {
540
+ case JoinType::SEMI:
541
+ case JoinType::ANTI:
542
+ case JoinType::MARK:
543
+ // simple joins can have max STANDARD_VECTOR_SIZE matches per chunk
544
+ ResolveSimpleJoin(context, input, chunk, lstate);
545
+ return OperatorResultType::NEED_MORE_INPUT;
546
+ case JoinType::LEFT:
547
+ case JoinType::INNER:
548
+ case JoinType::RIGHT:
549
+ case JoinType::OUTER:
550
+ return ResolveComplexJoin(context, input, chunk, lstate);
551
+ default:
552
+ throw NotImplementedException("Unimplemented type for as-of join!");
553
+ }
554
+ }
555
+
556
+ //===--------------------------------------------------------------------===//
557
+ // Source
558
+ //===--------------------------------------------------------------------===//
559
+ class AsOfGlobalSourceState : public GlobalSourceState {
560
+ public:
561
+ explicit AsOfGlobalSourceState(PartitionGlobalSinkState &gsink_p) : gsink(gsink_p), next_bin(0) {
562
+ }
563
+
564
+ PartitionGlobalSinkState &gsink;
565
+ //! The output read position.
566
+ atomic<idx_t> next_bin;
567
+
568
+ public:
569
+ idx_t MaxThreads() override {
570
+ // If there is only one partition, we have to process it on one thread.
571
+ if (!gsink.grouping_data) {
572
+ return 1;
573
+ }
574
+
575
+ // If there is not a lot of data, process serially.
576
+ if (gsink.count < STANDARD_ROW_GROUPS_SIZE) {
577
+ return 1;
578
+ }
579
+
580
+ return gsink.hash_groups.size();
581
+ }
582
+ };
583
+
584
+ unique_ptr<GlobalSourceState> PhysicalAsOfJoin::GetGlobalSourceState(ClientContext &context) const {
585
+ auto &gsink = sink_state->Cast<AsOfGlobalSinkState>();
586
+ return make_uniq<AsOfGlobalSourceState>(gsink.global_partition);
587
+ }
588
+
589
+ class AsOfLocalSourceState : public LocalSourceState {
590
+ public:
591
+ using HashGroupPtr = unique_ptr<PartitionGlobalHashGroup>;
592
+
593
+ explicit AsOfLocalSourceState(AsOfGlobalSinkState &gstate_p);
594
+
595
+ idx_t GeneratePartition(const idx_t hash_bin);
596
+
597
+ AsOfGlobalSinkState &gstate;
598
+
599
+ //! The read partition
600
+ idx_t hash_bin;
601
+ HashGroupPtr hash_group;
602
+
603
+ //! The read cursor
604
+ unique_ptr<PayloadScanner> scanner;
605
+ //! Buffer for the inputs
606
+ DataChunk input_chunk;
607
+ //! Pointer to the matches
608
+ const bool *found_match;
609
+ };
610
+
611
+ AsOfLocalSourceState::AsOfLocalSourceState(AsOfGlobalSinkState &gstate_p) : gstate(gstate_p) {
612
+ input_chunk.Initialize(gstate.global_partition.allocator, gstate.global_partition.payload_types);
613
+ }
614
+
615
+ idx_t AsOfLocalSourceState::GeneratePartition(const idx_t hash_bin_p) {
616
+ // Get rid of any stale data
617
+ hash_bin = hash_bin_p;
618
+
619
+ hash_group = std::move(gstate.global_partition.hash_groups[hash_bin]);
620
+ scanner = make_uniq<PayloadScanner>(*hash_group->global_sort);
621
+ found_match = gstate.right_outers[hash_bin].GetMatches();
622
+
623
+ return scanner->Remaining();
624
+ }
625
+
626
+ unique_ptr<LocalSourceState> PhysicalAsOfJoin::GetLocalSourceState(ExecutionContext &context,
627
+ GlobalSourceState &gstate) const {
628
+ auto &gsink = sink_state->Cast<AsOfGlobalSinkState>();
629
+ return make_uniq<AsOfLocalSourceState>(gsink);
630
+ }
631
+
632
+ void PhysicalAsOfJoin::GetData(ExecutionContext &context, DataChunk &result, GlobalSourceState &gstate_p,
633
+ LocalSourceState &lstate_p) const {
634
+ D_ASSERT(IsRightOuterJoin(join_type));
635
+
636
+ auto &gsource = gstate_p.Cast<AsOfGlobalSourceState>();
637
+ auto &lsource = lstate_p.Cast<AsOfLocalSourceState>();
638
+ auto &gsink = gsource.gsink;
639
+
640
+ auto &hash_groups = gsink.hash_groups;
641
+ const auto bin_count = hash_groups.size();
642
+
643
+ DataChunk rhs_chunk;
644
+ rhs_chunk.Initialize(Allocator::Get(context.client), gsink.payload_types);
645
+ SelectionVector rsel(STANDARD_VECTOR_SIZE);
646
+
647
+ while (result.size() == 0) {
648
+ // Move to the next bin if we are done.
649
+ while (!lsource.scanner || !lsource.scanner->Remaining()) {
650
+ lsource.scanner.reset();
651
+ lsource.hash_group.reset();
652
+ auto hash_bin = gsource.next_bin++;
653
+ if (hash_bin >= bin_count) {
654
+ return;
655
+ }
656
+
657
+ for (; hash_bin < hash_groups.size(); hash_bin = gsource.next_bin++) {
658
+ if (hash_groups[hash_bin]) {
659
+ break;
660
+ }
661
+ }
662
+ lsource.GeneratePartition(hash_bin);
663
+ }
664
+ const auto rhs_position = lsource.scanner->Scanned();
665
+ lsource.scanner->Scan(rhs_chunk);
666
+
667
+ const auto count = rhs_chunk.size();
668
+ if (count == 0) {
669
+ return;
670
+ }
671
+
672
+ // figure out which tuples didn't find a match in the RHS
673
+ auto found_match = lsource.found_match;
674
+ idx_t result_count = 0;
675
+ for (idx_t i = 0; i < count; i++) {
676
+ if (!found_match[rhs_position + i]) {
677
+ rsel.set_index(result_count++, i);
678
+ }
679
+ }
680
+
681
+ if (result_count > 0) {
682
+ // if there were any tuples that didn't find a match, output them
683
+ const idx_t left_column_count = children[0]->types.size();
684
+ for (idx_t col_idx = 0; col_idx < left_column_count; ++col_idx) {
685
+ result.data[col_idx].SetVectorType(VectorType::CONSTANT_VECTOR);
686
+ ConstantVector::SetNull(result.data[col_idx], true);
687
+ }
688
+ for (idx_t col_idx = 0; col_idx < right_projection_map.size(); ++col_idx) {
689
+ const auto rhs_idx = right_projection_map[col_idx];
690
+ result.data[left_column_count + col_idx].Slice(rhs_chunk.data[rhs_idx], rsel, result_count);
691
+ }
692
+ result.SetCardinality(result_count);
693
+ return;
694
+ }
695
+ }
696
+ }
697
+
698
+ } // namespace duckdb