duckdb 0.8.2-dev2700.0 → 0.8.2-dev2809.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/icu/icu-makedate.cpp +12 -6
  3. package/src/duckdb/src/common/adbc/adbc.cpp +52 -21
  4. package/src/duckdb/src/common/adbc/driver_manager.cpp +12 -2
  5. package/src/duckdb/src/common/enum_util.cpp +5 -0
  6. package/src/duckdb/src/common/types/row/row_data_collection_scanner.cpp +35 -5
  7. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +283 -91
  8. package/src/duckdb/src/execution/operator/filter/physical_filter.cpp +1 -1
  9. package/src/duckdb/src/execution/operator/join/physical_comparison_join.cpp +1 -2
  10. package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +1 -1
  11. package/src/duckdb/src/execution/physical_plan_generator.cpp +1 -6
  12. package/src/duckdb/src/execution/window_executor.cpp +10 -1
  13. package/src/duckdb/src/function/table/version/pragma_version.cpp +5 -2
  14. package/src/duckdb/src/include/duckdb/common/adbc/adbc.hpp +2 -0
  15. package/src/duckdb/src/include/duckdb/common/enums/pending_execution_result.hpp +1 -1
  16. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -1
  17. package/src/duckdb/src/include/duckdb/execution/physical_operator.hpp +0 -2
  18. package/src/duckdb/src/include/duckdb/main/pending_query_result.hpp +5 -0
  19. package/src/duckdb/src/include/duckdb/optimizer/join_order/cardinality_estimator.hpp +37 -63
  20. package/src/duckdb/src/include/duckdb/optimizer/join_order/cost_model.hpp +37 -0
  21. package/src/duckdb/src/include/duckdb/optimizer/join_order/join_node.hpp +14 -29
  22. package/src/duckdb/src/include/duckdb/optimizer/join_order/join_order_optimizer.hpp +7 -21
  23. package/src/duckdb/src/include/duckdb/optimizer/join_order/join_relation.hpp +0 -11
  24. package/src/duckdb/src/include/duckdb/optimizer/join_order/plan_enumerator.hpp +89 -0
  25. package/src/duckdb/src/include/duckdb/optimizer/join_order/query_graph.hpp +17 -31
  26. package/src/duckdb/src/include/duckdb/optimizer/join_order/query_graph_manager.hpp +113 -0
  27. package/src/duckdb/src/include/duckdb/optimizer/join_order/relation_manager.hpp +73 -0
  28. package/src/duckdb/src/include/duckdb/optimizer/join_order/relation_statistics_helper.hpp +73 -0
  29. package/src/duckdb/src/include/duckdb/parallel/task_scheduler.hpp +4 -1
  30. package/src/duckdb/src/include/duckdb/planner/logical_operator.hpp +0 -2
  31. package/src/duckdb/src/include/duckdb.h +11 -1
  32. package/src/duckdb/src/main/capi/pending-c.cpp +17 -0
  33. package/src/duckdb/src/main/pending_query_result.cpp +9 -1
  34. package/src/duckdb/src/optimizer/join_order/cardinality_estimator.cpp +79 -325
  35. package/src/duckdb/src/optimizer/join_order/cost_model.cpp +19 -0
  36. package/src/duckdb/src/optimizer/join_order/join_node.cpp +5 -37
  37. package/src/duckdb/src/optimizer/join_order/join_order_optimizer.cpp +48 -1078
  38. package/src/duckdb/src/optimizer/join_order/plan_enumerator.cpp +552 -0
  39. package/src/duckdb/src/optimizer/join_order/query_graph.cpp +32 -29
  40. package/src/duckdb/src/optimizer/join_order/query_graph_manager.cpp +409 -0
  41. package/src/duckdb/src/optimizer/join_order/relation_manager.cpp +356 -0
  42. package/src/duckdb/src/optimizer/join_order/relation_statistics_helper.cpp +351 -0
  43. package/src/duckdb/src/parallel/executor.cpp +6 -0
  44. package/src/duckdb/src/parallel/task_scheduler.cpp +7 -0
  45. package/src/duckdb/src/planner/binder/statement/bind_execute.cpp +1 -1
  46. package/src/duckdb/src/planner/operator/logical_get.cpp +4 -0
  47. package/src/duckdb/ub_src_optimizer_join_order.cpp +10 -0
@@ -0,0 +1,552 @@
1
+ #include "duckdb/optimizer/join_order/join_node.hpp"
2
+ #include "duckdb/optimizer/join_order/plan_enumerator.hpp"
3
+ #include "duckdb/optimizer/join_order/query_graph_manager.hpp"
4
+ #include "duckdb/main/client_context.hpp"
5
+
6
+ namespace duckdb {
7
+
8
+ bool PlanEnumerator::NodeInFullPlan(JoinNode &node) {
9
+ return join_nodes_in_full_plan.find(node.set.ToString()) != join_nodes_in_full_plan.end();
10
+ }
11
+
12
+ void PlanEnumerator::UpdateJoinNodesInFullPlan(JoinNode &node) {
13
+ if (node.set.count == query_graph_manager.relation_manager.NumRelations()) {
14
+ join_nodes_in_full_plan.clear();
15
+ }
16
+ if (node.set.count < query_graph_manager.relation_manager.NumRelations()) {
17
+ join_nodes_in_full_plan.insert(node.set.ToString());
18
+ }
19
+ if (node.left) {
20
+ UpdateJoinNodesInFullPlan(*node.left);
21
+ }
22
+ if (node.right) {
23
+ UpdateJoinNodesInFullPlan(*node.right);
24
+ }
25
+ }
26
+
27
+ static vector<unordered_set<idx_t>> AddSuperSets(const vector<unordered_set<idx_t>> &current,
28
+ const vector<idx_t> &all_neighbors) {
29
+ vector<unordered_set<idx_t>> ret;
30
+
31
+ for (const auto &neighbor_set : current) {
32
+ auto max_val = std::max_element(neighbor_set.begin(), neighbor_set.end());
33
+ for (const auto &neighbor : all_neighbors) {
34
+ if (*max_val >= neighbor) {
35
+ continue;
36
+ }
37
+ if (neighbor_set.count(neighbor) == 0) {
38
+ unordered_set<idx_t> new_set;
39
+ for (auto &n : neighbor_set) {
40
+ new_set.insert(n);
41
+ }
42
+ new_set.insert(neighbor);
43
+ ret.push_back(new_set);
44
+ }
45
+ }
46
+ }
47
+
48
+ return ret;
49
+ }
50
+
51
+ //! Update the exclusion set with all entries in the subgraph
52
+ static void UpdateExclusionSet(optional_ptr<JoinRelationSet> node, unordered_set<idx_t> &exclusion_set) {
53
+ for (idx_t i = 0; i < node->count; i++) {
54
+ exclusion_set.insert(node->relations[i]);
55
+ }
56
+ }
57
+
58
+ // works by first creating all sets with cardinality 1
59
+ // then iterates over each previously created group of subsets and will only add a neighbor if the neighbor
60
+ // is greater than all relations in the set.
61
+ static vector<unordered_set<idx_t>> GetAllNeighborSets(vector<idx_t> neighbors) {
62
+ vector<unordered_set<idx_t>> ret;
63
+ sort(neighbors.begin(), neighbors.end());
64
+ vector<unordered_set<idx_t>> added;
65
+ for (auto &neighbor : neighbors) {
66
+ added.push_back(unordered_set<idx_t>({neighbor}));
67
+ ret.push_back(unordered_set<idx_t>({neighbor}));
68
+ }
69
+ do {
70
+ added = AddSuperSets(added, neighbors);
71
+ for (auto &d : added) {
72
+ ret.push_back(d);
73
+ }
74
+ } while (!added.empty());
75
+ #if DEBUG
76
+ // drive by test to make sure we have an accurate amount of
77
+ // subsets, and that each neighbor is in a correct amount
78
+ // of those subsets.
79
+ D_ASSERT(ret.size() == pow(2, neighbors.size()) - 1);
80
+ for (auto &n : neighbors) {
81
+ idx_t count = 0;
82
+ for (auto &set : ret) {
83
+ if (set.count(n) >= 1) {
84
+ count += 1;
85
+ }
86
+ }
87
+ D_ASSERT(count == pow(2, neighbors.size() - 1));
88
+ }
89
+ #endif
90
+ return ret;
91
+ }
92
+
93
+ void PlanEnumerator::GenerateCrossProducts() {
94
+ // generate a set of cross products to combine the currently available plans into a full join plan
95
+ // we create edges between every relation with a high cost
96
+ for (idx_t i = 0; i < query_graph_manager.relation_manager.NumRelations(); i++) {
97
+ auto &left = query_graph_manager.set_manager.GetJoinRelation(i);
98
+ for (idx_t j = 0; j < query_graph_manager.relation_manager.NumRelations(); j++) {
99
+ if (i != j) {
100
+ auto &right = query_graph_manager.set_manager.GetJoinRelation(j);
101
+ query_graph_manager.CreateQueryGraphCrossProduct(left, right);
102
+ }
103
+ }
104
+ }
105
+ // Now that the query graph has new edges, we need to re-initialize our query graph.
106
+ // TODO: do we need to initialize our qyery graph again?
107
+ // query_graph = query_graph_manager.GetQueryGraph();
108
+ }
109
+
110
+ //! Create a new JoinTree node by joining together two previous JoinTree nodes
111
+ unique_ptr<JoinNode> PlanEnumerator::CreateJoinTree(JoinRelationSet &set,
112
+ const vector<reference<NeighborInfo>> &possible_connections,
113
+ JoinNode &left, JoinNode &right) {
114
+ // for the hash join we want the right side (build side) to have the smallest cardinality
115
+ // also just a heuristic but for now...
116
+ // FIXME: we should probably actually benchmark that as well
117
+ // FIXME: should consider different join algorithms, should we pick a join algorithm here as well? (probably)
118
+ optional_ptr<NeighborInfo> best_connection = nullptr;
119
+
120
+ // cross products are techincally still connections, but the filter expression is a null_ptr
121
+ if (!possible_connections.empty()) {
122
+ best_connection = &possible_connections.back().get();
123
+ }
124
+
125
+ auto cost = cost_model.ComputeCost(left, right);
126
+ auto result = make_uniq<JoinNode>(set, best_connection, left, right, cost);
127
+ result->cardinality = cost_model.cardinality_estimator.EstimateCardinalityWithSet<idx_t>(set);
128
+ return result;
129
+ }
130
+
131
+ JoinNode &PlanEnumerator::EmitPair(JoinRelationSet &left, JoinRelationSet &right,
132
+ const vector<reference<NeighborInfo>> &info) {
133
+ // get the left and right join plans
134
+ auto left_plan = plans.find(left);
135
+ auto right_plan = plans.find(right);
136
+ if (left_plan == plans.end() || right_plan == plans.end()) {
137
+ throw InternalException("No left or right plan: internal error in join order optimizer");
138
+ }
139
+ auto &new_set = query_graph_manager.set_manager.Union(left, right);
140
+ // create the join tree based on combining the two plans
141
+ auto new_plan = CreateJoinTree(new_set, info, *left_plan->second, *right_plan->second);
142
+ // check if this plan is the optimal plan we found for this set of relations
143
+ auto entry = plans.find(new_set);
144
+ auto new_cost = new_plan->cost;
145
+ double old_cost = NumericLimits<double>::Maximum();
146
+ if (entry != plans.end()) {
147
+ old_cost = entry->second->cost;
148
+ }
149
+ if (entry == plans.end() || new_cost < old_cost) {
150
+ // the new plan costs less than the old plan. Update our DP tree and cost tree
151
+ auto &result = *new_plan;
152
+
153
+ if (full_plan_found &&
154
+ join_nodes_in_full_plan.find(new_plan->set.ToString()) != join_nodes_in_full_plan.end()) {
155
+ must_update_full_plan = true;
156
+ }
157
+ if (new_set.count == query_graph_manager.relation_manager.NumRelations()) {
158
+ full_plan_found = true;
159
+ // If we find a full plan, we need to keep track of which nodes are in the full plan.
160
+ // It's possible the DP algorithm updates a node in the current full plan, then moves on
161
+ // to the SolveApproximately. SolveApproximately may find a full plan with a higher cost than
162
+ // what SolveExactly found. In this case, we revert to the SolveExactly plan, but it is
163
+ // possible to get use-after-free errors if the SolveApproximately algorithm updated some (but not all)
164
+ // nodes in the SolveExactly plan
165
+ // If we know a node in the full plan is updated, we can prevent ourselves from exiting the
166
+ // DP algorithm until the last plan updated is a full plan
167
+ UpdateJoinNodesInFullPlan(result);
168
+ if (must_update_full_plan) {
169
+ must_update_full_plan = false;
170
+ }
171
+ }
172
+
173
+ D_ASSERT(new_plan);
174
+ plans[new_set] = std::move(new_plan);
175
+ return result;
176
+ }
177
+ return *entry->second;
178
+ }
179
+
180
+ bool PlanEnumerator::TryEmitPair(JoinRelationSet &left, JoinRelationSet &right,
181
+ const vector<reference<NeighborInfo>> &info) {
182
+ pairs++;
183
+ // If a full plan is created, it's possible a node in the plan gets updated. When this happens, make sure you keep
184
+ // emitting pairs until you emit another final plan. Another final plan is guaranteed to be produced because of
185
+ // our symmetry guarantees.
186
+ if (pairs >= 10000 && !must_update_full_plan) {
187
+ // when the amount of pairs gets too large we exit the dynamic programming and resort to a greedy algorithm
188
+ // FIXME: simple heuristic currently
189
+ // at 10K pairs stop searching exactly and switch to heuristic
190
+ return false;
191
+ }
192
+ EmitPair(left, right, info);
193
+ return true;
194
+ }
195
+
196
+ bool PlanEnumerator::EmitCSG(JoinRelationSet &node) {
197
+ if (node.count == query_graph_manager.relation_manager.NumRelations()) {
198
+ return true;
199
+ }
200
+ // create the exclusion set as everything inside the subgraph AND anything with members BELOW it
201
+ unordered_set<idx_t> exclusion_set;
202
+ for (idx_t i = 0; i < node.relations[0]; i++) {
203
+ exclusion_set.insert(i);
204
+ }
205
+ UpdateExclusionSet(&node, exclusion_set);
206
+ // find the neighbors given this exclusion set
207
+ auto neighbors = query_graph.GetNeighbors(node, exclusion_set);
208
+ if (neighbors.empty()) {
209
+ return true;
210
+ }
211
+
212
+ //! Neighbors should be reversed when iterating over them.
213
+ std::sort(neighbors.begin(), neighbors.end(), std::greater_equal<idx_t>());
214
+ for (idx_t i = 0; i < neighbors.size() - 1; i++) {
215
+ D_ASSERT(neighbors[i] > neighbors[i + 1]);
216
+ }
217
+
218
+ // Dphyp paper missiing this.
219
+ // Because we are traversing in reverse order, we need to add neighbors whose number is smaller than the current
220
+ // node to exclusion_set
221
+ // This avoids duplicated enumeration
222
+ unordered_set<idx_t> new_exclusion_set = exclusion_set;
223
+ for (idx_t i = 0; i < neighbors.size(); ++i) {
224
+ D_ASSERT(new_exclusion_set.find(neighbors[i]) == new_exclusion_set.end());
225
+ new_exclusion_set.insert(neighbors[i]);
226
+ }
227
+
228
+ for (auto neighbor : neighbors) {
229
+ // since the GetNeighbors only returns the smallest element in a list, the entry might not be connected to
230
+ // (only!) this neighbor, hence we have to do a connectedness check before we can emit it
231
+ auto &neighbor_relation = query_graph_manager.set_manager.GetJoinRelation(neighbor);
232
+ auto connections = query_graph.GetConnections(node, neighbor_relation);
233
+ if (!connections.empty()) {
234
+ if (!TryEmitPair(node, neighbor_relation, connections)) {
235
+ return false;
236
+ }
237
+ }
238
+
239
+ if (!EnumerateCmpRecursive(node, neighbor_relation, new_exclusion_set)) {
240
+ return false;
241
+ }
242
+
243
+ new_exclusion_set.erase(neighbor);
244
+ }
245
+ return true;
246
+ }
247
+
248
+ bool PlanEnumerator::EnumerateCmpRecursive(JoinRelationSet &left, JoinRelationSet &right,
249
+ unordered_set<idx_t> &exclusion_set) {
250
+ // get the neighbors of the second relation under the exclusion set
251
+ auto neighbors = query_graph.GetNeighbors(right, exclusion_set);
252
+ if (neighbors.empty()) {
253
+ return true;
254
+ }
255
+
256
+ auto all_subset = GetAllNeighborSets(neighbors);
257
+ vector<reference<JoinRelationSet>> union_sets;
258
+ union_sets.reserve(all_subset.size());
259
+ for (const auto &rel_set : all_subset) {
260
+ auto &neighbor = query_graph_manager.set_manager.GetJoinRelation(rel_set);
261
+ // emit the combinations of this node and its neighbors
262
+ auto &combined_set = query_graph_manager.set_manager.Union(right, neighbor);
263
+ // If combined_set.count == right.count, This means we found a neighbor that has been present before
264
+ // This means we didn't set exclusion_set correctly.
265
+ D_ASSERT(combined_set.count > right.count);
266
+ if (plans.find(combined_set) != plans.end()) {
267
+ auto connections = query_graph.GetConnections(left, combined_set);
268
+ if (!connections.empty()) {
269
+ if (!TryEmitPair(left, combined_set, connections)) {
270
+ return false;
271
+ }
272
+ }
273
+ }
274
+ union_sets.push_back(combined_set);
275
+ }
276
+
277
+ unordered_set<idx_t> new_exclusion_set = exclusion_set;
278
+ for (const auto &neighbor : neighbors) {
279
+ new_exclusion_set.insert(neighbor);
280
+ }
281
+
282
+ // recursively enumerate the sets
283
+ for (idx_t i = 0; i < union_sets.size(); i++) {
284
+ // updated the set of excluded entries with this neighbor
285
+ if (!EnumerateCmpRecursive(left, union_sets[i], new_exclusion_set)) {
286
+ return false;
287
+ }
288
+ }
289
+ return true;
290
+ }
291
+
292
+ bool PlanEnumerator::EnumerateCSGRecursive(JoinRelationSet &node, unordered_set<idx_t> &exclusion_set) {
293
+ // find neighbors of S under the exclusion set
294
+ auto neighbors = query_graph.GetNeighbors(node, exclusion_set);
295
+ if (neighbors.empty()) {
296
+ return true;
297
+ }
298
+
299
+ auto all_subset = GetAllNeighborSets(neighbors);
300
+ vector<reference<JoinRelationSet>> union_sets;
301
+ union_sets.reserve(all_subset.size());
302
+ for (const auto &rel_set : all_subset) {
303
+ auto &neighbor = query_graph_manager.set_manager.GetJoinRelation(rel_set);
304
+ // emit the combinations of this node and its neighbors
305
+ auto &new_set = query_graph_manager.set_manager.Union(node, neighbor);
306
+ D_ASSERT(new_set.count > node.count);
307
+ if (plans.find(new_set) != plans.end()) {
308
+ if (!EmitCSG(new_set)) {
309
+ return false;
310
+ }
311
+ }
312
+ union_sets.push_back(new_set);
313
+ }
314
+
315
+ unordered_set<idx_t> new_exclusion_set = exclusion_set;
316
+ for (const auto &neighbor : neighbors) {
317
+ new_exclusion_set.insert(neighbor);
318
+ }
319
+
320
+ // recursively enumerate the sets
321
+ for (idx_t i = 0; i < union_sets.size(); i++) {
322
+ // updated the set of excluded entries with this neighbor
323
+ if (!EnumerateCSGRecursive(union_sets[i], new_exclusion_set)) {
324
+ return false;
325
+ }
326
+ }
327
+ return true;
328
+ }
329
+
330
+ bool PlanEnumerator::SolveJoinOrderExactly() {
331
+ // now we perform the actual dynamic programming to compute the final result
332
+ // we enumerate over all the possible pairs in the neighborhood
333
+ for (idx_t i = query_graph_manager.relation_manager.NumRelations(); i > 0; i--) {
334
+ // for every node in the set, we consider it as the start node once
335
+ auto &start_node = query_graph_manager.set_manager.GetJoinRelation(i - 1);
336
+ // emit the start node
337
+ if (!EmitCSG(start_node)) {
338
+ return false;
339
+ }
340
+ // initialize the set of exclusion_set as all the nodes with a number below this
341
+ unordered_set<idx_t> exclusion_set;
342
+ for (idx_t j = 0; j < i; j++) {
343
+ exclusion_set.insert(j);
344
+ }
345
+ // then we recursively search for neighbors that do not belong to the banned entries
346
+ if (!EnumerateCSGRecursive(start_node, exclusion_set)) {
347
+ return false;
348
+ }
349
+ }
350
+ return true;
351
+ }
352
+
353
+ void PlanEnumerator::UpdateDPTree(JoinNode &new_plan) {
354
+ if (!NodeInFullPlan(new_plan)) {
355
+ // if the new node is not in the full plan, feel free to return
356
+ // because you won't be updating the full plan.
357
+ return;
358
+ }
359
+ auto &new_set = new_plan.set;
360
+ // now update every plan that uses this plan
361
+ unordered_set<idx_t> exclusion_set;
362
+ for (idx_t i = 0; i < new_set.count; i++) {
363
+ exclusion_set.insert(new_set.relations[i]);
364
+ }
365
+ auto neighbors = query_graph.GetNeighbors(new_set, exclusion_set);
366
+ auto all_neighbors = GetAllNeighborSets(neighbors);
367
+ for (const auto &neighbor : all_neighbors) {
368
+ auto &neighbor_relation = query_graph_manager.set_manager.GetJoinRelation(neighbor);
369
+ auto &combined_set = query_graph_manager.set_manager.Union(new_set, neighbor_relation);
370
+
371
+ auto combined_set_plan = plans.find(combined_set);
372
+ if (combined_set_plan == plans.end()) {
373
+ continue;
374
+ }
375
+
376
+ double combined_set_plan_cost = combined_set_plan->second->cost; // combined_set_plan->second->GetCost();
377
+ auto connections = query_graph.GetConnections(new_set, neighbor_relation);
378
+ // recurse and update up the tree if the combined set produces a plan with a lower cost
379
+ // only recurse on neighbor relations that have plans.
380
+ auto right_plan = plans.find(neighbor_relation);
381
+ if (right_plan == plans.end()) {
382
+ continue;
383
+ }
384
+ auto &updated_plan = EmitPair(new_set, neighbor_relation, connections);
385
+ // <= because the child node has already been replaced. You need to
386
+ // replace the parent node as well in this case
387
+ if (updated_plan.cost < combined_set_plan_cost) {
388
+ UpdateDPTree(updated_plan);
389
+ }
390
+ }
391
+ }
392
+
393
+ void PlanEnumerator::SolveJoinOrderApproximately() {
394
+ // at this point, we exited the dynamic programming but did not compute the final join order because it took too
395
+ // long instead, we use a greedy heuristic to obtain a join ordering now we use Greedy Operator Ordering to
396
+ // construct the result tree first we start out with all the base relations (the to-be-joined relations)
397
+ vector<reference<JoinRelationSet>> join_relations; // T in the paper
398
+ for (idx_t i = 0; i < query_graph_manager.relation_manager.NumRelations(); i++) {
399
+ join_relations.push_back(query_graph_manager.set_manager.GetJoinRelation(i));
400
+ }
401
+ while (join_relations.size() > 1) {
402
+ // now in every step of the algorithm, we greedily pick the join between the to-be-joined relations that has the
403
+ // smallest cost. This is O(r^2) per step, and every step will reduce the total amount of relations to-be-joined
404
+ // by 1, so the total cost is O(r^3) in the amount of relations
405
+ idx_t best_left = 0, best_right = 0;
406
+ optional_ptr<JoinNode> best_connection;
407
+ for (idx_t i = 0; i < join_relations.size(); i++) {
408
+ auto left = join_relations[i];
409
+ for (idx_t j = i + 1; j < join_relations.size(); j++) {
410
+ auto right = join_relations[j];
411
+ // check if we can connect these two relations
412
+ auto connection = query_graph.GetConnections(left, right);
413
+ if (!connection.empty()) {
414
+ // we can check the cost of this connection
415
+ auto &node = EmitPair(left, right, connection);
416
+
417
+ // update the DP tree in case a plan created by the DP algorithm uses the node
418
+ // that was potentially just updated by EmitPair. You will get a use-after-free
419
+ // error if future plans rely on the old node that was just replaced.
420
+ // if node in FullPath, then updateDP tree.
421
+ UpdateDPTree(node);
422
+
423
+ if (!best_connection || node.cost < best_connection->cost) {
424
+ // best pair found so far
425
+ best_connection = &node;
426
+ best_left = i;
427
+ best_right = j;
428
+ }
429
+ }
430
+ }
431
+ }
432
+ if (!best_connection) {
433
+ // could not find a connection, but we were not done with finding a completed plan
434
+ // we have to add a cross product; we add it between the two smallest relations
435
+ optional_ptr<JoinNode> smallest_plans[2];
436
+ idx_t smallest_index[2];
437
+ D_ASSERT(join_relations.size() >= 2);
438
+
439
+ // first just add the first two join relations. It doesn't matter the cost as the JOO
440
+ // will swap them on estimated cardinality anyway.
441
+ for (idx_t i = 0; i < 2; i++) {
442
+ auto current_plan = plans[join_relations[i]].get();
443
+ smallest_plans[i] = current_plan;
444
+ smallest_index[i] = i;
445
+ }
446
+
447
+ // if there are any other join relations that don't have connections
448
+ // add them if they have lower estimated cardinality.
449
+ for (idx_t i = 2; i < join_relations.size(); i++) {
450
+ // get the plan for this relation
451
+ auto current_plan = plans[join_relations[i].get()].get();
452
+ // check if the cardinality is smaller than the smallest two found so far
453
+ for (idx_t j = 0; j < 2; j++) {
454
+ if (!smallest_plans[j] || smallest_plans[j]->cost > current_plan->cost) {
455
+ smallest_plans[j] = current_plan;
456
+ smallest_index[j] = i;
457
+ break;
458
+ }
459
+ }
460
+ }
461
+ if (!smallest_plans[0] || !smallest_plans[1]) {
462
+ throw InternalException("Internal error in join order optimizer");
463
+ }
464
+ D_ASSERT(smallest_plans[0] && smallest_plans[1]);
465
+ D_ASSERT(smallest_index[0] != smallest_index[1]);
466
+ auto &left = smallest_plans[0]->set;
467
+ auto &right = smallest_plans[1]->set;
468
+ // create a cross product edge (i.e. edge with empty filter) between these two sets in the query graph
469
+ query_graph_manager.CreateQueryGraphCrossProduct(left, right);
470
+ // now emit the pair and continue with the algorithm
471
+ auto connections = query_graph.GetConnections(left, right);
472
+ D_ASSERT(!connections.empty());
473
+
474
+ best_connection = &EmitPair(left, right, connections);
475
+ best_left = smallest_index[0];
476
+ best_right = smallest_index[1];
477
+
478
+ UpdateDPTree(*best_connection);
479
+ // the code below assumes best_right > best_left
480
+ if (best_left > best_right) {
481
+ std::swap(best_left, best_right);
482
+ }
483
+ }
484
+ // now update the to-be-checked pairs
485
+ // remove left and right, and add the combination
486
+
487
+ // important to erase the biggest element first
488
+ // if we erase the smallest element first the index of the biggest element changes
489
+ D_ASSERT(best_right > best_left);
490
+ join_relations.erase(join_relations.begin() + best_right);
491
+ join_relations.erase(join_relations.begin() + best_left);
492
+ join_relations.push_back(best_connection->set);
493
+ }
494
+ }
495
+
496
+ void PlanEnumerator::InitLeafPlans() {
497
+ // First we initialize each of the single-node plans with themselves and with their cardinalities these are the leaf
498
+ // nodes of the join tree NOTE: we can just use pointers to JoinRelationSet* here because the GetJoinRelation
499
+ // function ensures that a unique combination of relations will have a unique JoinRelationSet object.
500
+ // first initialize equivalent relations based on the filters
501
+ auto relation_stats = query_graph_manager.relation_manager.GetRelationStats();
502
+
503
+ cost_model.cardinality_estimator.InitEquivalentRelations(query_graph_manager.GetFilterBindings());
504
+ cost_model.cardinality_estimator.AddRelationNamesToTdoms(relation_stats);
505
+
506
+ // then update the total domains based on the cardinalities of each relation.
507
+ for (idx_t i = 0; i < relation_stats.size(); i++) {
508
+ auto stats = relation_stats.at(i);
509
+ auto &relation_set = query_graph_manager.set_manager.GetJoinRelation(i);
510
+ auto join_node = make_uniq<JoinNode>(relation_set);
511
+ join_node->cost = 0;
512
+ join_node->cardinality = stats.cardinality;
513
+ plans[relation_set] = std::move(join_node);
514
+ cost_model.cardinality_estimator.InitCardinalityEstimatorProps(&relation_set, stats);
515
+ }
516
+ }
517
+
518
+ // the plan enumeration is a straight implementation of the paper "Dynamic Programming Strikes Back" by Guido
519
+ // Moerkotte and Thomas Neumannn, see that paper for additional info/documentation bonus slides:
520
+ // https://db.in.tum.de/teaching/ws1415/queryopt/chapter3.pdf?lang=de
521
+ unique_ptr<JoinNode> PlanEnumerator::SolveJoinOrder() {
522
+ bool force_no_cross_product = query_graph_manager.context.config.force_no_cross_product;
523
+ // first try to solve the join order exactly
524
+ if (!SolveJoinOrderExactly()) {
525
+ // otherwise, if that times out we resort to a greedy algorithm
526
+ SolveJoinOrderApproximately();
527
+ }
528
+
529
+ // now the optimal join path should have been found
530
+ // get it from the node
531
+ unordered_set<idx_t> bindings;
532
+ for (idx_t i = 0; i < query_graph_manager.relation_manager.NumRelations(); i++) {
533
+ bindings.insert(i);
534
+ }
535
+ auto &total_relation = query_graph_manager.set_manager.GetJoinRelation(bindings);
536
+ auto final_plan = plans.find(total_relation);
537
+ if (final_plan == plans.end()) {
538
+ // could not find the final plan
539
+ // this should only happen in case the sets are actually disjunct
540
+ // in this case we need to generate cross product to connect the disjoint sets
541
+ if (force_no_cross_product) {
542
+ throw InvalidInputException(
543
+ "Query requires a cross-product, but 'force_no_cross_product' PRAGMA is enabled");
544
+ }
545
+ GenerateCrossProducts();
546
+ //! solve the join order again, returning the final plan
547
+ return SolveJoinOrder();
548
+ }
549
+ return std::move(final_plan->second);
550
+ }
551
+
552
+ } // namespace duckdb