PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. postbound/__init__.py +211 -0
  2. postbound/_base.py +6 -0
  3. postbound/_bench.py +1012 -0
  4. postbound/_core.py +1153 -0
  5. postbound/_hints.py +1373 -0
  6. postbound/_jointree.py +1079 -0
  7. postbound/_pipelines.py +1121 -0
  8. postbound/_qep.py +1986 -0
  9. postbound/_stages.py +876 -0
  10. postbound/_validation.py +734 -0
  11. postbound/db/__init__.py +72 -0
  12. postbound/db/_db.py +2348 -0
  13. postbound/db/_duckdb.py +785 -0
  14. postbound/db/mysql.py +1195 -0
  15. postbound/db/postgres.py +4216 -0
  16. postbound/experiments/__init__.py +12 -0
  17. postbound/experiments/analysis.py +674 -0
  18. postbound/experiments/benchmarking.py +54 -0
  19. postbound/experiments/ceb.py +877 -0
  20. postbound/experiments/interactive.py +105 -0
  21. postbound/experiments/querygen.py +334 -0
  22. postbound/experiments/workloads.py +980 -0
  23. postbound/optimizer/__init__.py +92 -0
  24. postbound/optimizer/__init__.pyi +73 -0
  25. postbound/optimizer/_cardinalities.py +369 -0
  26. postbound/optimizer/_joingraph.py +1150 -0
  27. postbound/optimizer/dynprog.py +1825 -0
  28. postbound/optimizer/enumeration.py +432 -0
  29. postbound/optimizer/native.py +539 -0
  30. postbound/optimizer/noopt.py +54 -0
  31. postbound/optimizer/presets.py +147 -0
  32. postbound/optimizer/randomized.py +650 -0
  33. postbound/optimizer/tonic.py +1479 -0
  34. postbound/optimizer/ues.py +1607 -0
  35. postbound/qal/__init__.py +343 -0
  36. postbound/qal/_qal.py +9678 -0
  37. postbound/qal/formatter.py +1089 -0
  38. postbound/qal/parser.py +2344 -0
  39. postbound/qal/relalg.py +4257 -0
  40. postbound/qal/transform.py +2184 -0
  41. postbound/shortcuts.py +70 -0
  42. postbound/util/__init__.py +46 -0
  43. postbound/util/_errors.py +33 -0
  44. postbound/util/collections.py +490 -0
  45. postbound/util/dataframe.py +71 -0
  46. postbound/util/dicts.py +330 -0
  47. postbound/util/jsonize.py +68 -0
  48. postbound/util/logging.py +106 -0
  49. postbound/util/misc.py +168 -0
  50. postbound/util/networkx.py +401 -0
  51. postbound/util/numbers.py +438 -0
  52. postbound/util/proc.py +107 -0
  53. postbound/util/stats.py +37 -0
  54. postbound/util/system.py +48 -0
  55. postbound/util/typing.py +35 -0
  56. postbound/vis/__init__.py +5 -0
  57. postbound/vis/fdl.py +69 -0
  58. postbound/vis/graphs.py +48 -0
  59. postbound/vis/optimizer.py +538 -0
  60. postbound/vis/plots.py +84 -0
  61. postbound/vis/tonic.py +70 -0
  62. postbound/vis/trees.py +105 -0
  63. postbound-0.19.0.dist-info/METADATA +355 -0
  64. postbound-0.19.0.dist-info/RECORD +67 -0
  65. postbound-0.19.0.dist-info/WHEEL +5 -0
  66. postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
  67. postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/optimizer/dynprog.py
@@ -0,0 +1,1825 @@
from __future__ import annotations

import collections
import itertools
import math
import warnings
from collections.abc import Callable, Iterable, Sequence
from dataclasses import dataclass
from typing import Optional

from .. import util
from .._core import (
    Cardinality,
    IntermediateOperator,
    JoinOperator,
    PhysicalOperator,
    ScanOperator,
    TableReference,
)
from .._qep import QueryPlan, SortKey
from .._stages import (
    CardinalityEstimator,
    CostModel,
    OptimizationPreCheck,
    PlanEnumerator,
)
from .._validation import (
    CrossProductPreCheck,
    EquiJoinPreCheck,
    InnerJoinPreCheck,
    SetOperationsPreCheck,
    SubqueryPreCheck,
    VirtualTablesPreCheck,
    merge_checks,
)
from ..db._db import Database, DatabasePool, DatabaseSchema, DatabaseServerError
from ..db.postgres import PostgresInterface, PostgresJoinHints, PostgresScanHints
from ..qal import transform
from ..qal._qal import (
    AbstractPredicate,
    ColumnExpression,
    ColumnReference,
    CompoundOperator,
    CompoundPredicate,
    QueryPredicates,
    SqlQuery,
)
from ..util import LogicError, jsondict
from . import native

DPTable = dict[frozenset[TableReference], QueryPlan]


def _calc_plan_estimates(
    query: SqlQuery,
    plan: QueryPlan,
    *,
    cost_model: CostModel,
    cardinality_estimator: CardinalityEstimator,
) -> QueryPlan:
    """Handler method to update the cost and cardinality estimates of a given plan."""
    card_est = cardinality_estimator.calculate_estimate(query, plan.tables())
    plan = plan.with_estimates(cardinality=card_est)
    cost_est = cost_model.estimate_cost(query, plan)
    return plan.with_estimates(cost=cost_est)


def _collect_used_columns(
    query: SqlQuery, table: TableReference, *, schema: DatabaseSchema
) -> set[ColumnReference]:
    columns = query.columns_of(table)
    for star_expression in query.select_clause.star_expressions():
        if table not in star_expression.tables():
            continue

        columns |= schema.columns(table)
    return columns


class DynamicProgrammingEnumerator(PlanEnumerator):
    """A very basic dynamic programming-based plan enumerator.

    This enumerator is very basic because it does not implement any sophisticated pruning rules or traversal strategies
    and only focuses on a small subset of possible operators. It simply enumerates all possible access paths and join
    paths and picks the cheapest one. It should only serve as a starting point when an actual, decent enumerator
    implementation is lacking (see *Limitations* below). Its purpose is mainly to shield users that are only interested in
    the cost model or the cardinality estimator from having to implement their own enumerator in order to use the
    `TextBookOptimizationPipeline`.
    Notice that for experiments based on PostgreSQL, a much more sophisticated implementation is available with the
    `PostgresDynProg` enumerator (and this enumerator is automatically selected when using the textbook pipeline with a
    Postgres target database).

    Limitations
    -----------

    - Only the cheapest access paths are considered, without taking sort orders into account. This prevents free merge
      join optimizations, i.e. if an access path is more expensive but already sorted, it will be discarded in favor of a
      cheaper alternative, even though a later merge join might become much cheaper due to the sort order.
    - No optimizations of intermediates are considered, i.e. no materialization or memoization of subplans.
    - Only the basic scan and join operators are considered. For scans, this includes sequential scan, index scan,
      index-only scan and bitmap scan. For joins, this includes nested loop join, hash join and sort merge join. These can
      be further restricted through the `supported_scan_ops` and `supported_join_ops` parameters.
    - Only simple SPJ queries are supported. Importantly, the query may not contain any set operations, subqueries, CTEs,
      etc. All joins must be inner equijoins and no cross products are allowed.
    - Aggregations, sorting, etc. are not considered. In this way, the enumerator is comparable to the ``join_search_hook``
      of PostgreSQL. We assume that such "technicalities" are handled when creating appropriate hints for the target
      database, or when executing the query on the target database at the latest.

    Parameters
    ----------
    supported_scan_ops : Optional[set[ScanOperator]], optional
        The set of scan operators that should be considered during the enumeration. This should be a subset of the
        following operators: sequential scan, index scan, index-only scan, bitmap scan. If any other operators are
        included, these are simply never considered. By default, all operators that are available on the `target_db` are
        allowed.
    supported_join_ops : Optional[set[JoinOperator]], optional
        The set of join operators that should be considered during the enumeration. This should be a subset of the
        following operators: nested loop join, hash join, sort merge join. If any other operators are included, these are
        simply never considered. By default, all operators that are available on the `target_db` are allowed.
    target_db : Optional[Database], optional
        The target database system for which the optimization pipeline is intended. If omitted, the database is inferred
        from the `DatabasePool`.
    """

    def __init__(
        self,
        *,
        supported_scan_ops: Optional[set[ScanOperator]] = None,
        supported_join_ops: Optional[set[JoinOperator]] = None,
        target_db: Optional[Database] = None,
    ) -> None:
        target_db = (
            target_db
            if target_db is not None
            else DatabasePool.get_instance().current_database()
        )

        supported_scan_ops = (
            supported_scan_ops if supported_scan_ops is not None else set(ScanOperator)
        )
        supported_join_ops = (
            supported_join_ops if supported_join_ops is not None else set(JoinOperator)
        )

        if target_db is not None:
            supported_scan_ops = {
                op for op in supported_scan_ops if target_db.hinting().supports_hint(op)
            }
            supported_join_ops = {
                op for op in supported_join_ops if target_db.hinting().supports_hint(op)
            }

        self.predicates: QueryPredicates = None

        self._target_db = target_db
        self._scan_ops = supported_scan_ops
        self._join_ops = supported_join_ops

    def generate_execution_plan(
        self, query, *, cost_model, cardinality_estimator
    ) -> QueryPlan:
        self.predicates = query.predicates()
        cost_model.initialize(self._target_db, query)
        cardinality_estimator.initialize(self._target_db, query)

        dp_table = self._determine_base_access_paths(
            query, cost_model=cost_model, cardinality_estimator=cardinality_estimator
        )
        final_plan = self._build_join_paths(
            query,
            dp_table=dp_table,
            cost_model=cost_model,
            cardinality_estimator=cardinality_estimator,
        )

        cost_model.cleanup()
        cardinality_estimator.cleanup()
        self.predicates = None
        return final_plan

    def pre_check(self) -> OptimizationPreCheck:
        return merge_checks(
            CrossProductPreCheck(),
            VirtualTablesPreCheck(),
            EquiJoinPreCheck(),
            InnerJoinPreCheck(),
            SubqueryPreCheck(),
            SetOperationsPreCheck(),
        )

    def describe(self) -> jsondict:
        return {
            "name": "dynamic_programming",
            "flavor": "default",
            "scan_ops": [op.name for op in self._scan_ops],
            "join_ops": [op.name for op in self._join_ops],
            "database_system": self._target_db.describe(),
        }

    def _determine_base_access_paths(
        self,
        query: SqlQuery,
        *,
        cost_model: CostModel,
        cardinality_estimator: CardinalityEstimator,
    ) -> DPTable:
        """Initializes a new dynamic programming table which includes the cheapest access paths for each base table.

        The base tables are directly inferred from the query.
        """
        dp_table: DPTable = {}

        for table in query.tables():
            # We determine access paths in two phases: initially, we just gather all possible access paths to a specific
            # table. Afterwards, we evaluate these candidates according to our cost model and select the cheapest one.
            candidate_plans: list[QueryPlan] = []
            filter_condition = self.predicates.filters_for(table)

            if ScanOperator.SequentialScan in self._scan_ops:
                candidate_plans.append(
                    QueryPlan(
                        ScanOperator.SequentialScan,
                        base_table=table,
                        filter_predicate=filter_condition,
                    )
                )
            candidate_plans += self._determine_index_paths(query, table)

            candidate_plans = [
                _calc_plan_estimates(
                    query,
                    candidate,
                    cost_model=cost_model,
                    cardinality_estimator=cardinality_estimator,
                )
                for candidate in candidate_plans
            ]

            cheapest_plan = min(candidate_plans, key=lambda plan: plan.estimated_cost)
            dp_table[frozenset([table])] = cheapest_plan

        return dp_table

    def _determine_index_paths(
        self, query: SqlQuery, table: TableReference
    ) -> Iterable[QueryPlan]:
        """Gathers all possible index access paths for a specific table.

        The access paths do not contain cost or cardinality estimates yet. This information must be added by the caller.
        """
        filter_condition = query.predicates().filters_for(table)
        required_columns = _collect_used_columns(
            query, table, schema=self._target_db.schema()
        )
        can_idx_only_scan = (
            len(required_columns) <= 1
        )  # check for <= 1 to include cross products with select star
        candidate_indexes = {
            column: self._target_db.schema().indexes_on(column)
            for column in required_columns
        }

        if not candidate_indexes:
            return []

        candidate_plans: list[QueryPlan] = []
        for column, available_indexes in candidate_indexes.items():
            if not available_indexes:
                continue
            sorting = [SortKey.of(column)]

            for index in available_indexes:
                if ScanOperator.IndexScan in self._scan_ops:
                    candidate_plans.append(
                        QueryPlan(
                            ScanOperator.IndexScan,
                            base_table=table,
                            index=index,
                            sort_keys=sorting,
                            filter_predicate=filter_condition,
                        )
                    )
                if can_idx_only_scan and ScanOperator.IndexOnlyScan in self._scan_ops:
                    candidate_plans.append(
                        QueryPlan(
                            ScanOperator.IndexOnlyScan,
                            base_table=table,
                            index=index,
                            sort_keys=sorting,
                            filter_predicate=filter_condition,
                        )
                    )

        if ScanOperator.BitmapScan in self._scan_ops:
            # The target DB/cost model is responsible for figuring out good bitmap index hierarchies.
            # Since bitmap scans combine multiple indexes, we do not consider bitmap scans in the above loop.
            # Furthermore, bitmap scans are partial sequential scans and thus do not provide a sort key.
            candidate_plans.append(
                QueryPlan(
                    ScanOperator.BitmapScan,
                    base_table=table,
                    indexes=candidate_indexes,
                    filter_predicate=filter_condition,
                )
            )

        return candidate_plans

    def _build_join_paths(
        self,
        query: SqlQuery,
        *,
        dp_table: DPTable,
        cost_model: CostModel,
        cardinality_estimator: CardinalityEstimator,
    ) -> QueryPlan:
        """Main optimization loop for the dynamic programmer.

        In this loop we construct increasingly large join paths by combining the optimal access paths of their input
        relations. At the end of the loop we have just constructed the cheapest join path for the entire query.

        All access paths are stored in the `dp_table`. This method assumes that the `dp_table` already contains the
        cheapest access paths for all base relations.

        Returns
        -------
        QueryPlan
            The final query plan that represents the cheapest join path for the given query.
        """

        predicates = query.predicates()
        candidate_tables = query.tables()

        for current_level in range(2, len(candidate_tables) + 1):
            # The current level describes how large the intermediate join paths that we are considering next are going to
            # be. For each potential intermediate that matches the current level, we determine the cheapest access path.
            # This path is going to re-use the cheapest access paths that we determined as part of an earlier iteration.

            current_intermediates = itertools.combinations(
                candidate_tables, current_level
            )
            access_paths = {
                frozenset(join): self._determine_cheapest_path(
                    query,
                    join,
                    dp_table=dp_table,
                    cost_model=cost_model,
                    cardinality_estimator=cardinality_estimator,
                )
                for join in current_intermediates
                if predicates.joins_tables(join)  # we do not consider cross products
            }
            dp_table.update(access_paths)

        return dp_table[frozenset(candidate_tables)]

    def _determine_cheapest_path(
        self,
        query: SqlQuery,
        intermediate: Iterable[TableReference],
        *,
        dp_table: DPTable,
        cost_model: CostModel,
        cardinality_estimator: CardinalityEstimator,
    ) -> QueryPlan:
        """DP subroutine that selects the cheapest access path for a specific intermediate."""
        intermediate = frozenset(intermediate)
        candidate_plans: list[QueryPlan] = []

        # We determine the cheapest access path to our intermediate by checking all potential join partners that could
        # possibly be used to construct this intermediate. This works by splitting the intermediate into an outer relation
        # and an inner one. To guarantee that we test each possible split, we generate the entire power set of the
        # intermediate.
        # Basing our algorithm on the power set solves two important problems: first, we can easily generate bushy plans
        # (every split where the outer plan has at least two tables and leaves more than one table for the inner plan
        # results in a bushy plan). Second, we can also generate plans that invert the roles of inner and outer relation
        # just as easily. This is because the power set will eventually visit the set of all tables in the (former) inner
        # relation, which will then become the outer relation for the current iteration.
        #
        # Once again, we first gather all possible join paths and then evaluate the costs for each of them in order to
        # select the cheapest one.

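        # Worked example (illustrative): for intermediate = {A, B, C}, the power set
        # yields the outer candidates {A}, {B}, {C}, {A, B}, {A, C}, {B, C} (the
        # empty and full sets are skipped below), so every join direction is tried,
        # e.g. both {A} as outer against {B, C} and {B, C} as outer against {A}.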
        for outer in util.collections.powerset(intermediate):
            if not outer or len(outer) == len(intermediate):
                # Skip the empty set and the full set because we would lack a join partner.
                continue
            outer = frozenset(outer)

            # All tables of our intermediate that are not part of the outer relation have to become part of the inner
            # relation.
            inner = intermediate - outer

            outer_plan, inner_plan = dp_table.get(outer), dp_table.get(inner)
            if not outer_plan or not inner_plan:
                # If we do not find the access paths for one of our inputs, it means that this input is constructed using
                # a cross product. Since we do not consider cross products, we can skip this split.
                continue

            join_condition = query.predicates().joins_between(outer, inner)

            if JoinOperator.NestedLoopJoin in self._join_ops:
                candidate_plans.append(
                    QueryPlan(
                        JoinOperator.NestedLoopJoin,
                        children=[outer_plan, inner_plan],
                        join_condition=join_condition,
                    )
                )

            if JoinOperator.HashJoin in self._join_ops:
                candidate_plans.append(
                    QueryPlan(
                        JoinOperator.HashJoin,
                        children=[outer_plan, inner_plan],
                        join_condition=join_condition,
                    )
                )

            if JoinOperator.SortMergeJoin in self._join_ops:
                # The target DB is ultimately responsible for figuring out whether it needs explicit sorts or whether it
                # can just merge directly.
                candidate_plans.append(
                    QueryPlan(
                        JoinOperator.SortMergeJoin,
                        children=[outer_plan, inner_plan],
                        join_condition=join_condition,
                    )
                )

        candidate_plans = [
            _calc_plan_estimates(
                query,
                candidate,
                cost_model=cost_model,
                cardinality_estimator=cardinality_estimator,
            )
            for candidate in candidate_plans
        ]

        return min(candidate_plans, key=lambda plan: plan.estimated_cost)


@dataclass
class RelOptInfo:
    """Simplified model of the RelOptInfo from the Postgres planner.

    We only specify the fields that we truly care about (and that are not covered by other parts of PostBOUND); the rest
    is omitted.

    For example, we don't need to worry about equivalence classes, because the Postgres enumerator is responsible for
    expanding the query with all EQ predicates. Afterwards, we can use the query abstraction to determine available joins.
    """

    intermediate: frozenset[TableReference]
    """The relation that is represented by this RelOptInfo.

    This is simply the set of all tables that are part of the relation.
    """

    pathlist: list[QueryPlan]
    """All access paths that can be used to compute this relation (that we know of and care about).

    In contrast to the original PG implementation, we don't care about sorting this list. Retaining the sort order is
    mainly an implementation detail and optimization of PG.
    """

    partial_paths: list[QueryPlan]
    """All access paths that can be used to compute this relation with parallel workers. Otherwise the same as `pathlist`."""

    cheapest_path: Optional[QueryPlan]
    """The cheapest access path that we have found.

    Notice that this is only set after all paths for the RelOpt have been collected.
    """

    cheapest_partial_path: Optional[QueryPlan]
    """The cheapest access path that we have found for parallel execution.

    Notice that this is only set after all paths for the RelOpt have been collected.
    """

    cardinality: Cardinality
    """The estimated number of rows that are produced by this relation."""

    def __contains__(self, item: object) -> bool:
        if isinstance(item, RelOptInfo):
            item = item.intermediate
        if isinstance(item, TableReference):
            item = {item}

        return item < self.intermediate

    def __hash__(self) -> int:
        return hash(self.intermediate)

    def __eq__(self, other: object) -> bool:
        return isinstance(other, RelOptInfo) and self.intermediate == other.intermediate

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        tables = ", ".join(sorted(t.identifier() for t in self.intermediate))
        return f"{{{tables}}}"


Sorting = Sequence[SortKey]
"""A specific sort order for some relation."""

Level = int
"""The current level in the dynamic programming table."""

JoinRelLevel = dict[Level, list[RelOptInfo]]
"""Alias for our dynamic programming table."""

AddPathHook = Callable[["PostgresDynProg", RelOptInfo, QueryPlan, bool], None]
"""Hook method for users to get control in Postgres' *add_path()* method.

The method is responsible for storing a new candidate path in its `RelOptInfo`. It can decide whether the path is actually
worth storing or not. Furthermore, the method can also prune existing paths from the `pathlist` that are dominated by the
new path.

All of these actions should be performed in-place by modifying the `RelOptInfo` object. No return value is expected.

The last boolean parameter indicates whether the path is a partial path (i.e. a path for parallel execution) or not.
If it is, the path should be stored in the `partial_paths` list instead of the standard `pathlist`. Likewise, all checks
should be performed against the `partial_paths` list. If the hook should not handle partial paths, it can simply delegate
to the standard implementation on the enumerator.

If need be, the method can also access the current state of the dynamic programmer. Specifically, the enumerator provides
access to the query and database, the selected cost model and cardinality estimator, as well as to the current
`JoinRelLevel`. Finally, the method is allowed to invoke the default path addition logic by calling the
`standard_add_path()` method on the enumerator.
"""


class DPWarning(UserWarning):
    pass


class PostgresDynProg(PlanEnumerator):
    """Dynamic programming-based plan enumeration strategy that mimics the behavior of the Postgres query optimizer.

    Postgres-style dynamic programming means two things: first, we use the Postgres pruning rules to reduce the search
    space. Second, we apply the same opinionated traversal rules. Most importantly, this concerns when we consider
    materialization or memoization of subplans. If some of the related operators are not allowed, the traversal rules are
    adjusted accordingly.

    The implementation is based on a translation of the actual Postgres source code.

    Parameters
    ----------
    supported_scan_ops : Optional[set[ScanOperator]], optional
        The scan operators that the enumerator is allowed to use. If omitted, all scan operators that are supported by the
        target database are used.
    supported_join_ops : Optional[set[JoinOperator]], optional
        The join operators that the enumerator is allowed to use. If omitted, all join operators supported by the target
        database are used.
    enable_materialize : bool, optional
        Whether the optimizer is allowed to insert materialization operators into the query plan. This is enabled by
        default.
    enable_memoize : bool, optional
        Whether the optimizer is allowed to insert memoization operators into the query plan. This is enabled by default.
    enable_sort : bool, optional
        Whether the optimizer is allowed to perform explicit sorts in the query plan. Notice that setting this to *False*
        only prevents optional sorts. For example, if the query contains an *ORDER BY* clause, the optimizer will still
        perform the required sorting. However, it will not perform any merge joins that require a different kind of
        sorting. This is enabled by default.
    max_parallel_workers : Optional[int], optional
        The maximum number of parallel workers to assume for partial (parallel) paths. If omitted, this defaults to 0 and
        no partial paths are generated.
    add_path_hook : Optional[AddPathHook], optional
        Optional function to implement custom path addition logic. See the documentation on `AddPathHook` for more
        details.
    target_db : Optional[PostgresInterface], optional
        The database on which the plans should be executed. This has to be a Postgres instance. If omitted, the database
        is inferred from the `DatabasePool`.
    verbose : bool, optional
        Whether the enumerator should issue warnings if it encounters unexpected situations, e.g. if it rejects a path
        because it is illegal. This includes cases where the cost of some operators cannot be estimated by the target
        database system.
    """

    def __init__(
        self,
        *,
        supported_scan_ops: Optional[set[ScanOperator]] = None,
        supported_join_ops: Optional[set[JoinOperator]] = None,
        enable_materialize: bool = True,
        enable_memoize: bool = True,
        enable_sort: bool = True,
        max_parallel_workers: Optional[int] = None,
        add_path_hook: Optional[AddPathHook] = None,
        target_db: Optional[PostgresInterface] = None,
        verbose: bool = False,
    ) -> None:
        target_db = (
            target_db
            if target_db is not None
            else DatabasePool.get_instance().current_database()
        )
        if not isinstance(target_db, PostgresInterface):
            raise LogicError(
                "The PostgresDynProg enumerator can only be used with a Postgres database "
                "(but you can execute the resulting plans on any database that supports the required hints)."
            )

        supported_scan_ops = (
            supported_scan_ops if supported_scan_ops is not None else PostgresScanHints
        )
        supported_join_ops = (
            supported_join_ops if supported_join_ops is not None else PostgresJoinHints
        )

        if target_db is not None:
            supported_scan_ops = {
                op for op in supported_scan_ops if target_db.hinting().supports_hint(op)
            }
            supported_join_ops = {
                op for op in supported_join_ops if target_db.hinting().supports_hint(op)
            }

        self.query: SqlQuery = None
        self.predicates: QueryPredicates = None
        self.cost_model: CostModel = None
        self.cardinality_estimator: CardinalityEstimator = None
        self.join_rel_level: JoinRelLevel = None
        self.target_db = target_db

        self._scan_ops = supported_scan_ops
        self._join_ops = supported_join_ops
        self._enable_materialize = enable_materialize
        self._enable_memoize = enable_memoize
        self._enable_sort = enable_sort
        self._max_workers = max_parallel_workers if max_parallel_workers else 0
        self._add_path_hook = add_path_hook

        self._verbose = verbose

    def infer_settings(self) -> None:
        """Sets all allowed operators according to the configuration of the target database.

        For example, if the target database has index scans disabled, the enumerator will disable them as well.
        """
        allowed_scan_ops: list[ScanOperator] = []
        if self.target_db.config["enable_seqscan"] == "on":
            allowed_scan_ops.append(ScanOperator.SequentialScan)
        if self.target_db.config["enable_indexscan"] == "on":
            allowed_scan_ops.append(ScanOperator.IndexScan)
        if self.target_db.config["enable_indexonlyscan"] == "on":
            allowed_scan_ops.append(ScanOperator.IndexOnlyScan)
        if self.target_db.config["enable_bitmapscan"] == "on":
            allowed_scan_ops.append(ScanOperator.BitmapScan)
        self._scan_ops = set(allowed_scan_ops)

        allowed_join_ops: list[JoinOperator] = []
        if self.target_db.config["enable_nestloop"] == "on":
            allowed_join_ops.append(JoinOperator.NestedLoopJoin)
        if self.target_db.config["enable_hashjoin"] == "on":
            allowed_join_ops.append(JoinOperator.HashJoin)
        if self.target_db.config["enable_mergejoin"] == "on":
            allowed_join_ops.append(JoinOperator.SortMergeJoin)
        self._join_ops = set(allowed_join_ops)

        self._enable_materialize = self.target_db.config["enable_material"] == "on"
        self._enable_memoize = self.target_db.config["enable_memoize"] == "on"
        self._enable_sort = self.target_db.config["enable_sort"] == "on"

        self._max_workers = int(
            self.target_db.config["max_parallel_workers_per_gather"]
        )

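    # Usage sketch (illustrative, not part of the original source): mirror the live
    # configuration of the target Postgres instance before optimizing, where `pg`
    # stands in for a connected PostgresInterface:
    #
    #     enumerator = PostgresDynProg(target_db=pg)
    #     enumerator.infer_settings()
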
    def generate_execution_plan(
        self, query, *, cost_model, cardinality_estimator
    ) -> QueryPlan:
        self.query = transform.add_ec_predicates(query)
        self.predicates = self.query.predicates()

        cardinality_estimator.initialize(self.target_db, query)
        cost_model.initialize(self.target_db, query)
        self.cardinality_estimator = cardinality_estimator
        self.cost_model = cost_model

        base_rels = self._init_base_rels()
        self._set_base_rel_pathlists(base_rels)

        final_rel = self._standard_join_search(initial_rels=base_rels)
        assert final_rel.cheapest_path is not None, (
            "No valid plan found for the given query."
        )

        if self._is_pg_cost(cost_model):
            # This seems weird at first, so let's explain what is going on here:
            # If we use the actual PG cost model, we can make better decisions about what the optimal plan is.
            # The reason is that PG might parallelize a portion of the upper operators (anything that happens after the
            # final join, such as aggregations). With these upper rel parallelizations, a partial plan might become
            # cheaper than its counterpart that executes the last join in parallel (and which we compute). Consequently,
            # that very path might become cheaper than the cheapest path that we have computed so far.
            #
            # Therefore, we need to explicitly consider all partial paths as well as the sequential ones and retrieve the
            # final cost estimates for them.
            # But this only works if we know we are using the PG execution engine and cost model.

            plans = final_rel.pathlist + self._generate_pseudo_gather_paths(final_rel)
            plan_costs = {plan: self._pg_cost_estimate(plan) for plan in plans}
            cheapest_plan = util.argmin(plan_costs)
        else:
            cheapest_plan = final_rel.cheapest_path

        cost_model.cleanup()
        cardinality_estimator.cleanup()
        self.query = None
        self.predicates = None
        self.cost_model = None
        self.cardinality_estimator = None
        return cheapest_plan

    def describe(self) -> jsondict:
        return {
            "name": "dynamic_programming",
            "flavor": "postgres",
            "scan_ops": [op.name for op in self._scan_ops],
            "join_ops": [op.name for op in self._join_ops],
            "database_system": self.target_db.describe(),
        }

    def pre_check(self) -> OptimizationPreCheck:
        return merge_checks(
            CrossProductPreCheck(),
            EquiJoinPreCheck(),
            InnerJoinPreCheck(),
            VirtualTablesPreCheck(),
            SetOperationsPreCheck(),
        )

    def standard_add_path(
        self, rel: RelOptInfo, path: QueryPlan, *, is_partial: bool = False
    ) -> None:
        """Checks whether a specific path is worthy of further consideration. If it is, the path is stored in the pathlist.

        This method's naming is exceptionally bad, but this is the way it is named in the PG source code, so we stick
        with it.

        On an abstract level, this method implements the following logic:

        For each existing path in the pathlist, we check whether the new path dominates the existing one.
        If it does, we evict the existing path. If the existing path is better, we keep it and discard the new path.

        To determine whether one path dominates another, we compare the paths' costs and sort orders. For one path to
        dominate the other, it must be cheaper and at least as well sorted.
        If the paths are sorted differently, we keep them both.

        This basic logic is executed for both regular paths and partial paths. Those are paths that can be executed in
        parallel and will eventually be merged with the main paths (once the parallel portion is finished).
        While "normal" paths (completely sequential ones or parallel paths that have finished their parallel portion) are
        stored in the `pathlist`, partial paths are stored in the `partial_paths` list.
        """
        current_paths = rel.partial_paths if is_partial else rel.pathlist
        if not current_paths:
            current_paths.append(path)
            return

        result_paths: list[QueryPlan] = []
        keep_new = True  # we assume that we want to keep the new path to handle new sort orders correctly
        new_cost = path.estimated_cost

        for i, old_path in enumerate(current_paths):
            if not self._sorting_subsumes(
                path.sort_keys, other=old_path.params.sort_keys
            ):
                result_paths.append(old_path)
                continue

            # Postgres uses a fuzzy cost comparison (compare_path_costs_fuzzily() from pathnode.c) and evicts old paths
            # even if their cost is slightly better than the new path's, as long as the new path is better sorted.
            old_cost = old_path.estimated_cost
            new_dominates = (
                new_cost < old_cost
                if self._same_sorting(path.sort_keys, other=old_path.sort_keys)
                else new_cost <= 1.01 * old_cost
            )

            if new_dominates:
                # The new path is better (or at least equally well) sorted and cheaper, so we can evict the old path.
                keep_new = True  # strictly speaking, this is not necessary, but it makes our intention clearer
                continue  # don't break here, we need to check the remaining paths
            else:
                # The existing path is better (or at least equally well) sorted and cheaper, so we don't need the new
                # path. This also means that we can stop checking the remaining paths. The new one won't get added, and
                # the old ones successfully competed against the path that just beat our new one. We can keep them all.
                result_paths.extend(current_paths[i:])
                keep_new = False
                break

        if keep_new:
            result_paths.append(path)

        if is_partial:
            rel.partial_paths = result_paths
        else:
            rel.pathlist = result_paths

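    # Worked example for standard_add_path() (illustrative): suppose the pathlist
    # holds P1 (cost 100, sorted on a.id) and we add P2 (cost 90, unsorted). P2's
    # sort order does not subsume P1's, so P1 survives and P2 is appended as an
    # alternative: both are kept. If P2 were also sorted on a.id, it would dominate
    # P1 and evict it.
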
    def _init_base_rels(self) -> list[RelOptInfo]:
        """Creates and initializes the RelOptInfos for all tables in the query, without computing any access paths."""

        # Combines logic from make_one_rel() and set_base_rel_sizes()
        initial_rels: list[RelOptInfo] = []

        for base_rel in self.query.tables():
            intermediate = frozenset([base_rel])
            cardinality = self.cardinality_estimator.calculate_estimate(
                self.query, intermediate
            )

            initial_rels.append(
                RelOptInfo(
                    intermediate=intermediate,
                    pathlist=[],
                    partial_paths=[],
                    cheapest_path=None,
                    cardinality=cardinality,
                    cheapest_partial_path=None,
                )
            )

        return initial_rels

    def _set_base_rel_pathlists(self, initial_rels: list[RelOptInfo]) -> None:
        """Adds access paths to the base relations.

        The specific paths depend on the available operators and the current schema.
        """
        # This function leads to a larger chain of function calls which in the end culminates in set_plain_rel_pathlist().
        # We implement the behavior of that function here.

        for rel in initial_rels:
            self._create_sequential_paths(rel)
            self._create_index_paths(rel)
            self._create_bitmap_path(rel)

            self._generate_gather_paths(rel)
            self._set_cheapest(rel)

    def _standard_join_search(self, initial_rels: list[RelOptInfo]) -> RelOptInfo:
        """Main entry point into the dynamic programming join search.

        The implementation assumes that the `initial_rels` have already been initialized. Therefore, the dynamic
        programmer is only concerned with building join rels.
        """
        levels_needed = len(initial_rels)
        self.join_rel_level: JoinRelLevel = collections.defaultdict(list)
        self.join_rel_level[1] = initial_rels

        for level in range(2, levels_needed + 1):
            self._join_search_one_level(level)

            for rel in self.join_rel_level[level]:
                self._generate_gather_paths(rel)
                self._set_cheapest(rel)

        assert len(self.join_rel_level[levels_needed]) == 1, (
            "Final join rel level should only contain one relation."
        )
        final_rel = self.join_rel_level[levels_needed][0]
        self.join_rel_level = None
        return final_rel

    def _join_search_one_level(self, level: Level) -> None:
        """Handler method to construct all intermediates of the current level for the DP join search.

        Parameters
        ----------
        level : int
            The number of base tables that should be contained in each intermediate relation that we will construct.
        """
        # First, consider left-deep plans
        for rel1 in self.join_rel_level[level - 1]:
            # the body of this loop implements the logic of make_join_rel() (which is called by make_rels_by_clause_joins())

            for rel2 in self.join_rel_level[1]:
                if len(rel1.intermediate & rel2.intermediate) > 0:
                    # don't join anything that we have already joined
                    continue
                if not self.predicates.joins_between(
                    rel1.intermediate, rel2.intermediate
                ):
                    # don't consider cross products
                    continue

                # functionality of build_join_rel()
                intermediate = rel1.intermediate | rel2.intermediate
                join_rel = self._build_join_rel(intermediate)

                # functionality of populate_joinrel_with_paths()
                self._add_paths_to_joinrel(join_rel, outer_rel=rel1, inner_rel=rel2)
                self._add_paths_to_joinrel(join_rel, outer_rel=rel2, inner_rel=rel1)

        for outer_size in range(2, level // 2 + 1):
            inner_size = level - outer_size
            if inner_size < 2:
                continue

            for rel1 in self.join_rel_level[outer_size]:
                for rel2 in self.join_rel_level[inner_size]:
                    if len(rel1.intermediate & rel2.intermediate) > 0:
                        # don't join anything that we have already joined
                        continue
                    if not self.predicates.joins_between(
                        rel1.intermediate, rel2.intermediate
                    ):
                        # don't consider cross products
                        continue

                    # functionality of build_join_rel()
                    intermediate = rel1.intermediate | rel2.intermediate
                    join_rel = self._build_join_rel(intermediate)

                    # functionality of populate_joinrel_with_paths()
                    self._add_paths_to_joinrel(join_rel, outer_rel=rel1, inner_rel=rel2)
                    self._add_paths_to_joinrel(join_rel, outer_rel=rel2, inner_rel=rel1)

    def _build_join_rel(self, intermediate: frozenset[TableReference]) -> RelOptInfo:
        """Constructs and initializes a new RelOptInfo for a specific intermediate. No access paths are added yet."""

        # This function integrates the logic of find_join_rel() and build_join_rel()
        level = len(intermediate)
        for rel in self.join_rel_level[level]:
            if rel.intermediate == intermediate:
                return rel

        cardinality = self.cardinality_estimator.calculate_estimate(
            self.query, intermediate
        )
        join_rel = RelOptInfo(
            intermediate=intermediate,
            pathlist=[],
            partial_paths=[],
            cheapest_path=None,
            cardinality=cardinality,
            cheapest_partial_path=None,
        )

        self.join_rel_level[level].append(join_rel)
        return join_rel

    def _add_paths_to_joinrel(
        self, join_rel: RelOptInfo, *, outer_rel: RelOptInfo, inner_rel: RelOptInfo
    ) -> None:
        """Builds all possible access paths for a specific join relation.

        The build process adheres to the assignment of join directions from the parameters, i.e. the `outer_rel` will
        always be the outer relation and the `inner_rel` will always be the inner relation. If it does not matter what the
        specific assignment is, this method has to be called twice with inverted parameters.
        """
        if JoinOperator.NestedLoopJoin in self._join_ops:
            self._match_unsorted_outer(
                join_rel, outer_rel=outer_rel, inner_rel=inner_rel
            )
        if JoinOperator.SortMergeJoin in self._join_ops:
            self._sort_inner_outer(join_rel, outer_rel=outer_rel, inner_rel=inner_rel)
        if JoinOperator.HashJoin in self._join_ops:
            self._hash_inner_outer(join_rel, outer_rel=outer_rel, inner_rel=inner_rel)

    def _sort_inner_outer(
        self, join_rel: RelOptInfo, *, outer_rel: RelOptInfo, inner_rel: RelOptInfo
    ) -> None:
        """Constructs all potential merge join paths for a specific intermediate.

        This method assumes that merge joins are actually enabled.
        """

        # The implementation of this function is loosely based on sort_inner_outer() of the PG source code.
        # However, since the original function is tightly coupled with Postgres' internal query and planner
        # representation, we deviate a bit more than usual from the original implementation.
        #
        # Specifically, our implementation performs the following high-level algorithm:
        # For each potential join key between the input relations, we check whether they are already sorted based on the
        # join key. If they are not, we introduce an explicit sort operator for the path. Afterwards, we create a merge
        # join based on the candidate paths.
        #
        # As a consequence, this function essentially also implements the merge-join specific behavior of
        # match_unsorted_outer(). In our implementation, that function only handles nested-loop joins.
        #
        # Notice that Postgres does not consider materialization or memoization of subpaths for merge joins, so neither
        # do we.

        join_keys = self._determine_join_keys(outer_rel=outer_rel, inner_rel=inner_rel)

        # How often can we nest? Quite often! But this should still be fairly readable.
        # We simply loop over all join keys. For each join key, we try each combination of inner and outer relations and
        # see if we end up with a decent merge join path.
        for join_key in join_keys:
            outer_col, inner_col = self._extract_join_columns(
                join_key, outer_rel=outer_rel, inner_rel=inner_rel
            )
            if not outer_col or not inner_col:
                continue

            for outer_path in outer_rel.pathlist:
                if (
                    not self._is_sorted_by(outer_path, outer_col)
                    and not self._enable_sort
                ):
                    # If the path is not already sorted and we are not allowed to sort it ourselves, there is no point in
                    # merge joining. Just skip the path.
                    continue

                outer_path = (
                    outer_path
                    if self._is_sorted_by(outer_path, outer_col)
                    else self._create_sort_path(outer_path, sort_key=outer_col)
                )

                for inner_path in inner_rel.pathlist:
                    if (
                        not self._is_sorted_by(inner_path, inner_col)
                        and not self._enable_sort
                    ):
                        # If the path is not already sorted and we are not allowed to sort it ourselves, there is no point
                        # in merge joining. Just skip the path.
                        continue

                    inner_path = (
                        inner_path
                        if self._is_sorted_by(inner_path, inner_col)
                        else self._create_sort_path(inner_path, sort_key=inner_col)
                    )

                    merge_path = self._create_mergejoin_path(
                        join_rel, outer_path=outer_path, inner_path=inner_path
                    )
                    self._add_path(join_rel, merge_path)

            for outer_partial in outer_rel.partial_paths:
                # same as above, just for partial paths - we try each combination of partial outer with regular inner
                if (
                    not self._is_sorted_by(outer_partial, outer_col)
                    and not self._enable_sort
                ):
                    continue

                outer_path = (
                    outer_partial
                    if self._is_sorted_by(outer_partial, outer_col)
                    else self._create_sort_path(outer_partial, sort_key=outer_col)
                )

                for inner_path in inner_rel.pathlist:
                    if (
                        not self._is_sorted_by(inner_path, inner_col)
                        and not self._enable_sort
                    ):
                        continue

                    inner_path = (
                        inner_path
                        if self._is_sorted_by(inner_path, inner_col)
                        else self._create_sort_path(inner_path, sort_key=inner_col)
                    )

                    merge_path = self._create_mergejoin_path(
                        join_rel, outer_path=outer_path, inner_path=inner_path
                    )
                    self._add_path(join_rel, merge_path, is_partial=True)

    def _match_unsorted_outer(
        self, join_rel: RelOptInfo, *, outer_rel: RelOptInfo, inner_rel: RelOptInfo
    ) -> None:
        """Constructs all potential nested loop-join paths for a specific intermediate.

        This also includes adding paths with memoization or materialization if they are allowed and appear useful.

        This method assumes that nested loop joins are actually enabled.
        """

        # As outlined in _sort_inner_outer(), we only handle nested-loop joins here.
        # Nested-loop joins are inherently unsorted, so we only care about the cheapest access paths to the input
        # relations here.

        outer_path, inner_path = outer_rel.cheapest_path, inner_rel.cheapest_path
        if not outer_path or not inner_path:
            raise LogicError("No cheapest paths set")

        # Try the plain NLJ first, variations (memoization/materialization) afterwards
        nlj_path = self._create_nestloop_path(
            join_rel, outer_path=outer_path, inner_path=inner_path
        )
        self._add_path(join_rel, nlj_path)

        if self._enable_memoize:
            # For memoization, we attempt to cache each potential join key for the inner relation. Since there might be
            # multiple such keys, especially for larger intermediates, we need to check multiple candidates.

            join_predicate = self.query.predicates().joins_between(
                outer_rel.intermediate, inner_rel.intermediate
            )
            if join_predicate is None:
                raise LogicError(
                    "Cross product detected. This should never happen so deep down in the optimization process. "
                    f"Intermediates {outer_rel} and {inner_rel}"
                )

            for first_col, second_col in join_predicate.join_partners():
                cache_key = (
                    first_col
                    if first_col.table in inner_rel.intermediate
                    else second_col
                )
                if cache_key.table not in inner_rel.intermediate:
                    raise LogicError(
                        "Cache key must be part of the inner relation.",
                        f"Key was {cache_key}, relation was {inner_rel}",
                    )

                memo_inner = self._create_memoize_path(inner_path, cache_key=cache_key)
                memo_nlj = self._create_nestloop_path(
                    join_rel, outer_path=outer_path, inner_path=memo_inner
                )
                self._add_path(join_rel, memo_nlj)

        if self._enable_materialize:
            mat_path = self._create_materialize_path(inner_path)
            mat_nlj = self._create_nestloop_path(
                join_rel, outer_path=outer_path, inner_path=mat_path
            )
            self._add_path(join_rel, mat_nlj)

        outer_partial = outer_rel.cheapest_partial_path
        if not outer_partial:
            return
        par_nlj = self._create_nestloop_path(
            join_rel, outer_path=outer_partial, inner_path=inner_path
        )
        self._add_path(join_rel, par_nlj, is_partial=True)

        if self._enable_memoize:
            # same as above
            join_predicate = self.query.predicates().joins_between(
                outer_rel.intermediate, inner_rel.intermediate
            )
            if join_predicate is None:
                raise LogicError(
                    "Cross product detected. This should never happen so deep down in the optimization process. "
                    f"Intermediates {outer_rel} and {inner_rel}"
                )

            for first_col, second_col in join_predicate.join_partners():
                cache_key = (
                    first_col
                    if first_col.table in inner_rel.intermediate
                    else second_col
                )
                if cache_key.table not in inner_rel.intermediate:
                    raise LogicError(
                        "Cache key must be part of the inner relation.",
                        f"Key was {cache_key}, relation was {inner_rel}",
                    )

                memo_inner = self._create_memoize_path(inner_path, cache_key=cache_key)
                par_nlj = self._create_nestloop_path(
                    join_rel, outer_path=outer_partial, inner_path=memo_inner
                )
                self._add_path(join_rel, par_nlj, is_partial=True)

    def _hash_inner_outer(
        self, join_rel: RelOptInfo, *, outer_rel: RelOptInfo, inner_rel: RelOptInfo
    ) -> None:
        """Constructs the hash join path for a specific intermediate.

        In contrast to merge joins and nested loop joins, there is really only one way to perform a hash join.

        This method assumes that hash joins are actually enabled.
        """

        # Hash joins are inherently unsorted, so we only care about the cheapest access paths to the input relations here.
        # Notice that Postgres does not consider materialization or memoization of subpaths for hash joins, so neither
        # do we.

        outer_path, inner_path = outer_rel.cheapest_path, inner_rel.cheapest_path
        if not outer_path or not inner_path:
            raise LogicError("No cheapest paths set")
        hash_path = self._create_hashjoin_path(
            join_rel, outer_path=outer_path, inner_path=inner_path
        )
        self._add_path(join_rel, hash_path)

        outer_partial = outer_rel.cheapest_partial_path
        if not outer_partial:
            return
        par_hash = self._create_hashjoin_path(
            join_rel, outer_path=outer_partial, inner_path=inner_path
        )
        self._add_path(join_rel, par_hash, is_partial=True)

    def _add_path(
        self, rel: RelOptInfo, path: QueryPlan, *, is_partial: bool = False
    ) -> None:
        """Checks whether a specific path is worthy of further consideration. If it is, the path is stored in the pathlist.

        This method's naming is exceptionally bad, but this is the way it is named in the PG source code, so we stick
        with it.

        If an `_add_path_hook` has been specified, this hook takes control after checking for illegal paths. The normal
        path adding logic is skipped in this case. Otherwise, we call the standard path adding logic from
        `standard_add_path()`.

        The method handles both regular paths and partial paths. Which kind of path is being added is controlled by the
        `is_partial` parameter.
        """

        if math.isinf(path.estimated_cost):
            # The cost model returns infinite costs for illegal query plans.
            self._warn(f"Rejecting illegal path {path}")
            return

        if self._add_path_hook:
            self._add_path_hook(self, rel, path, is_partial)
        else:
            self.standard_add_path(rel, path, is_partial=is_partial)

    def _set_cheapest(self, rel: RelOptInfo) -> None:
        """Determines the cheapest path in terms of costs from the pathlist and partial pathlist."""
        if rel.pathlist:
            rel.pathlist.sort(key=lambda path: path.estimated_cost)
            cheapest_path = rel.pathlist[0]
            rel.cheapest_path = cheapest_path
        else:
            raise LogicError("No valid paths for relation found.")

        if rel.partial_paths:
            rel.partial_paths.sort(key=lambda path: path.estimated_cost)
            cheapest_partial = rel.partial_paths[0]
            rel.cheapest_partial_path = cheapest_partial
        else:
            # Empty partial pathlist is allowed
            pass

    def _generate_gather_paths(self, rel: RelOptInfo) -> None:
        """Scans a RelOpt for partial paths that might be gathered into regular paths."""
        for partial_path in rel.partial_paths:
            n_workers = partial_path.get("estimated_workers", None)
            if n_workers is None:
                raise LogicError(
                    f"Partial path does not have estimated workers: {partial_path}"
                )
            gather_path = partial_path.parallelize(n_workers)
            query_fragment = transform.extract_query_fragment(
                self.query, gather_path.tables()
            )
            cost_estimate = self.cost_model.estimate_cost(query_fragment, gather_path)
            gather_path = gather_path.with_estimates(cost=cost_estimate)

            # It is important to not delete the partial path after gathering it. The path might still be useful in an
            # upper-level partial path!
            self._add_path(rel, gather_path)

    def _generate_pseudo_gather_paths(self, rel: RelOptInfo) -> list[QueryPlan]:
        """Creates partial paths that execute the entire plan in parallel, rather than all joins."""
        gather_paths: list[QueryPlan] = []

        for partial_path in rel.partial_paths:
            n_workers = partial_path.get("estimated_workers", None)
            if n_workers is None:
                raise LogicError(
                    f"Partial path does not have estimated workers: {partial_path}"
                )

            pseudo_node = QueryPlan(
                node_type="Gather", children=partial_path, parallel_workers=n_workers
            )
            gather_paths.append(pseudo_node)

        return gather_paths

    def _create_seqscan_path(self, rel: RelOptInfo) -> QueryPlan:
        """Constructs and initializes a sequential scan path for a specific relation.

        This method assumes that sequential scans are actually enabled.
        """
        baserel = util.simplify(rel.intermediate)
        filter_condition = self.predicates.filters_for(baserel)
        workers = self._estimate_workers(baserel)
        path = QueryPlan(
            ScanOperator.SequentialScan,
            children=[],
            base_table=baserel,
            estimated_cardinality=rel.cardinality,
            filter_predicate=filter_condition,
            estimated_workers=workers,
        )

        # Instead of running the cost estimation on the full query, we transform the query into a SELECT * query.
        # This prevents the query optimizer from inserting additional operators that would artificially increase the cost
        # of the scan.
        # For example, consider the following JOB query: SELECT min(k.keyword), ... FROM keyword k JOIN ... WHERE ...
        # If we run the cost estimation without the star transformation, the query fragment for k would end up being
        # SELECT min(k.keyword) FROM keyword k WHERE ...
        # Executing this query would require an additional aggregation step which should not be performed at this point
        # yet. By transforming the query into a star query, we get the plain result set without any additional operators.
        star_query = transform.as_star_query(self.query)
        cost = self.cost_model.estimate_cost(star_query, path)
        path = path.with_estimates(cost=cost)
        return path

1291
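# A minimal sketch of the star transformation discussed above (editorial, not
# part of the module; the query text is hypothetical and the parser entry point
# is assumed from the qal package):
#
#   from postbound.qal import parser, transform
#
#   query = parser.parse_query(
#       "SELECT MIN(k.keyword) FROM keyword k WHERE k.keyword LIKE '%fight%'"
#   )
#   star_query = transform.as_star_query(query)
#   # star_query now reads: SELECT * FROM keyword k WHERE k.keyword LIKE '%fight%'
#   # Costing this fragment prices just the scan, without the final aggregate.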
+    def _create_sequential_paths(self, rel: RelOptInfo) -> None:
+        """Builds all sequential paths for a specific relation.
+
+        This includes the regular sequential scan as well as the partial version.
+        """
+        if ScanOperator.SequentialScan not in self._scan_ops:
+            return
+        seq_path = self._create_seqscan_path(rel)
+        self._add_path(rel, seq_path)
+
+        if seq_path["estimated_workers"]:
+            self._add_path(rel, seq_path, is_partial=True)
+
+    def _create_index_paths(self, rel: RelOptInfo) -> None:
+        """Builds all index scan paths for a specific relation.
+
+        This method considers each index on the relation that spans columns from the query. If the query needs at most
+        one column from the relation, an index-only scan is created in addition to the plain index scan.
+
+        If both kinds of index scans are disabled, this method does nothing.
+        """
+        if (
+            ScanOperator.IndexScan not in self._scan_ops
+            and ScanOperator.IndexOnlyScan not in self._scan_ops
+        ):
+            return
+
+        base_table = util.simplify(rel.intermediate)
+        filter_condition = self.predicates.filters_for(base_table)
+        required_columns = self.query.columns_of(base_table)
+        idx_only_scan = (
+            ScanOperator.IndexOnlyScan in self._scan_ops and len(required_columns) <= 1
+        )
+        candidate_indexes = {
+            column: self.target_db.schema().indexes_on(column)
+            for column in required_columns
+        }
+
+        index_paths: list[QueryPlan] = []
+        for column, available_indexes in candidate_indexes.items():
+            if not available_indexes:
+                continue
+            sorting = [SortKey.of(column)]
+
+            for index in available_indexes:
+                workers = self._estimate_workers(index)
+                if ScanOperator.IndexScan in self._scan_ops:
+                    idx_path = QueryPlan(
+                        ScanOperator.IndexScan,
+                        base_table=base_table,
+                        index=index,
+                        sort_keys=sorting,
+                        filter_predicate=filter_condition,
+                        estimated_workers=workers,
+                    )
+                    index_paths.append(idx_path)
+                if idx_only_scan:
+                    idx_path = QueryPlan(
+                        ScanOperator.IndexOnlyScan,
+                        base_table=base_table,
+                        index=index,
+                        sort_keys=sorting,
+                        filter_predicate=filter_condition,
+                        estimated_workers=workers,
+                    )
+                    index_paths.append(idx_path)
+
+        # See the comment in _create_seqscan_path() for an explanation of the star query. We do the same here.
+        star_query = transform.as_star_query(self.query)
+        for path in index_paths:
+            cost_estimate = self.cost_model.estimate_cost(star_query, path)
+            path = path.with_estimates(cost=cost_estimate)
+            self._add_path(rel, path)
+
+            if path["estimated_workers"]:
+                self._add_path(rel, path, is_partial=True)
+
+    def _create_bitmap_path(self, rel: RelOptInfo) -> None:
+        """Constructs and initializes a bitmap scan path for a specific relation.
+
+        Since we don't model bitmap index scans, bitmap ANDs, etc. explicitly, we only need to create a single bitmap
+        path. Afterwards, we let the hinting backend figure out how to perform the scan precisely.
+
+        If bitmap scans are disabled, this method does nothing.
+        """
+        if ScanOperator.BitmapScan not in self._scan_ops:
+            return
+
+        # We deviate from the vanilla PG implementation and only consider the cheapest bitmap path. Since bitmap paths
+        # are all unsorted anyway (due to the final sequential scan of the matching pages), this should be fine.
+        #
+        # Notice that the hinting backend is responsible for selecting the appropriate bitmap index hierarchies.
+
+        base_table = util.simplify(rel.intermediate)
+        required_columns = self.query.columns_of(base_table)
+        candidate_indexes = {
+            column: self.target_db.schema().indexes_on(column)
+            for column in required_columns
+        }
+        if not any(candidate_indexes.values()):
+            # We only bail out if there are no candidate indexes at all and explicitly accept the case of a single
+            # candidate index. This is because we can still perform an index lookup followed by a sequential scan of
+            # the matching pages, which might be better than a full sequential scan or a full random-I/O index scan.
+            return
+
+        filter_condition = self.predicates.filters_for(base_table)
+
+        # Bitmap scans parallelize the scan portion, not the index portion.
+        workers = self._estimate_workers(base_table)
+
+        bitmap_path = QueryPlan(
+            ScanOperator.BitmapScan,
+            base_table=base_table,
+            indexes=candidate_indexes,
+            filter_predicate=filter_condition,
+            estimated_workers=workers,
+        )
+
+        # See the comment in _create_seqscan_path() for an explanation of the star query. We do the same here.
+        star_query = transform.as_star_query(self.query)
+        cost_estimate = self.cost_model.estimate_cost(star_query, bitmap_path)
+        bitmap_path = bitmap_path.with_estimates(cost=cost_estimate)
+
+        self._add_path(rel, bitmap_path)
+        if workers:
+            self._add_path(rel, bitmap_path, is_partial=True)
+
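# For orientation (editorial sketch, not part of the module): on the PostgreSQL
# side, the single BitmapScan path created above may later be expanded by the
# hinting backend into a full bitmap hierarchy such as (index names hypothetical):
#
#   Bitmap Heap Scan on t
#     -> BitmapAnd
#          -> Bitmap Index Scan on t_idx_a
#          -> Bitmap Index Scan on t_idx_b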
+    def _create_memoize_path(
+        self, path: QueryPlan, *, cache_key: ColumnReference
+    ) -> QueryPlan:
+        """Constructs and initializes a memoize path on top of an existing path.
+
+        The `cache_key` is the column that identifies different entries in the memo table.
+
+        This method assumes that memoization is actually enabled.
+        """
+        workers = path.get("estimated_workers", None)
+
+        memo_path = QueryPlan(
+            IntermediateOperator.Memoize,
+            children=path,
+            lookup_key=ColumnExpression(cache_key),
+            estimated_cardinality=path.estimated_cardinality,
+            estimated_workers=workers,
+        )
+
+        if self._is_pg_cost(self.cost_model):
+            # Cost estimation happens as part of the parent join, so we just use a dummy cost here.
+            # See the comment in generate_execution_plan() for details; the same reasoning applies here.
+            return memo_path
+
+        cost_estimate = self.cost_model.estimate_cost(self.query, memo_path)
+        memo_path = memo_path.with_estimates(cost=cost_estimate)
+        return memo_path
+
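# Illustrative plan shape (editorial sketch, not part of the module): a Memoize
# path only makes sense as the inner side of a nested-loop join, where it caches
# inner results keyed on the current outer tuple's join column (tables are
# hypothetical):
#
#   Nested Loop (a.id = b.a_id)
#     -> Seq Scan on a                    (outer)
#     -> Memoize (lookup_key = b.a_id)    (inner; built by _create_memoize_path)
#          -> Index Scan on b
#
# Repeated outer join values then hit the cache instead of rescanning b.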
+    def _create_materialize_path(self, path: QueryPlan) -> QueryPlan:
+        """Constructs and initializes a materialize path on top of an existing path.
+
+        This method assumes that materialization is actually enabled.
+        """
+        workers = path.get("estimated_workers", None)
+
+        mat_path = QueryPlan(
+            IntermediateOperator.Materialize,
+            children=path,
+            estimated_cardinality=path.estimated_cardinality,
+            estimated_workers=workers,
+        )
+
+        if self._is_pg_cost(self.cost_model):
+            # Cost estimation happens as part of the parent join, so we just use a dummy cost here.
+            # See the comment in generate_execution_plan() for details; the same reasoning applies here.
+            return mat_path
+
+        cost_estimate = self.cost_model.estimate_cost(self.query, mat_path)
+        mat_path = mat_path.with_estimates(cost=cost_estimate)
+        return mat_path
+
+    def _create_sort_path(
+        self, path: QueryPlan, *, sort_key: ColumnReference
+    ) -> QueryPlan:
+        """Constructs and initializes a sort path on top of an existing path for a specific column.
+
+        The column to sort by is specified by `sort_key`. Notice that the sort path is always created, even if the
+        input path is already sorted by the key.
+
+        This method assumes that sorting is actually enabled.
+        """
+        workers = path.get("estimated_workers", None)
+
+        sort_path = QueryPlan(
+            IntermediateOperator.Sort,
+            children=path,
+            sort_keys=[SortKey.of(sort_key)],
+            estimated_cardinality=path.estimated_cardinality,
+            estimated_workers=workers,
+        )
+
+        cost_estimate = self.cost_model.estimate_cost(self.query, sort_path)
+        sort_path = sort_path.with_estimates(cost=cost_estimate)
+        return sort_path
+
+    def _create_nestloop_path(
+        self, join_rel: RelOptInfo, *, outer_path: QueryPlan, inner_path: QueryPlan
+    ) -> QueryPlan:
+        """Constructs and initializes a nested-loop join path for a specific intermediate.
+
+        This method assumes that nested-loop joins are actually enabled.
+
+        Parameters
+        ----------
+        join_rel : RelOptInfo
+            The RelOptInfo of the join to construct
+        outer_path : QueryPlan
+            The access path for the outer relation of the join
+        inner_path : QueryPlan
+            The access path for the inner relation of the join
+        """
+        join_condition = self.predicates.joins_between(
+            outer_path.tables(), inner_path.tables()
+        )
+        workers = outer_path.get("estimated_workers", None)
+
+        nlj_path = QueryPlan(
+            JoinOperator.NestedLoopJoin,
+            children=[outer_path, inner_path],
+            estimated_cardinality=join_rel.cardinality,
+            filter_predicate=join_condition,
+            estimated_workers=workers,
+        )
+
+        # We need to be very hacky here. Let's explain why:
+        # PG is rather picky about which operators can be used for which queries. For example, it is generally not
+        # possible to perform an index scan for an arbitrary SELECT * FROM relation query. Instead, PG requires the
+        # existence of a filter condition on the indexed column. Likewise, Memoize and Materialize can never be the
+        # final operator in a plan and must be part of a nested-loop join.
+        # Therefore, it might not be possible to correctly estimate the cost of these operators on their own, but only
+        # in conjunction with a nested-loop join.
+        # To account for this, we check whether we have any of these special/"weird" operators on the inner side of
+        # our NLJ. If we do, and our target cost model is actually PG itself, we ask PG to estimate the cost for the
+        # entire NLJ plan and just extract the costs that we require.
+
+        weird_ops: set[PhysicalOperator] = {
+            ScanOperator.IndexScan,
+            ScanOperator.IndexOnlyScan,
+            IntermediateOperator.Memoize,
+            IntermediateOperator.Materialize,
+        }
+        if self._is_pg_cost(self.cost_model) and inner_path.operator in weird_ops:
+            native_plan = self._pg_plan(nlj_path)
+
+            if native_plan:
+                nlj_node = native_plan.find_first_node(
+                    lambda n: n.operator == JoinOperator.NestedLoopJoin
+                )
+                weird_node = native_plan.find_first_node(
+                    lambda n: n.operator == inner_path.operator, direction="inner"
+                )
+                inner_cost = weird_node.estimated_cost if weird_node else math.inf
+                nlj_cost = (
+                    nlj_node.estimated_cost
+                    if nlj_node and not math.isinf(inner_cost)
+                    else math.inf
+                )
+
+                updated_inner = inner_path.with_estimates(cost=inner_cost)
+                nlj_path = QueryPlan(
+                    JoinOperator.NestedLoopJoin,
+                    children=[outer_path, updated_inner],
+                    estimated_cardinality=join_rel.cardinality,
+                    estimated_cost=nlj_cost,
+                    filter_predicate=join_condition,
+                    estimated_workers=workers,
+                )
+            else:
+                cost_estimate = self.cost_model.estimate_cost(self.query, nlj_path)
+                nlj_path = nlj_path.with_estimates(cost=cost_estimate)
+
+        else:
+            cost_estimate = self.cost_model.estimate_cost(self.query, nlj_path)
+            nlj_path = nlj_path.with_estimates(cost=cost_estimate)
+
+        return nlj_path
+
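# Illustrative view of the cost extraction above (editorial sketch; all node
# names and numbers are hypothetical):
#
#   native_plan:  Nested Loop                cost=412.5  -> copied onto nlj_path
#                   -> Seq Scan on a
#                   -> Memoize               cost=37.2   -> copied onto inner_path
#                        -> Index Scan on b
#
# If PG cannot produce the requested plan shape at all, both costs degrade to
# math.inf, which effectively evicts the path once cheaper alternatives exist.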
+    def _create_mergejoin_path(
+        self, join_rel: RelOptInfo, *, outer_path: QueryPlan, inner_path: QueryPlan
+    ) -> QueryPlan:
+        """Constructs and initializes a merge join path for a specific intermediate.
+
+        This method assumes that merge joins are actually enabled and that the input paths are already sorted by the
+        join key. However, we take a conservative approach and only assume sorting by the first sort key of each path.
+
+        Parameters
+        ----------
+        join_rel : RelOptInfo
+            The RelOptInfo of the join to construct
+        outer_path : QueryPlan
+            The access path for the outer relation of the join
+        inner_path : QueryPlan
+            The access path for the inner relation of the join
+        """
+
+        # This method assumes that outer_path and inner_path are already sorted appropriately.
+        merge_key = outer_path.sort_keys[0].merge_with(inner_path.sort_keys[0])
+        join_condition = self.predicates.joins_between(
+            outer_path.tables(), inner_path.tables()
+        )
+        workers = outer_path.get("estimated_workers", None)
+
+        merge_path = QueryPlan(
+            JoinOperator.SortMergeJoin,
+            children=[outer_path, inner_path],
+            sort_keys=[merge_key],
+            estimated_cardinality=join_rel.cardinality,
+            filter_predicate=join_condition,
+            estimated_workers=workers,
+        )
+
+        cost_estimate = self.cost_model.estimate_cost(self.query, merge_path)
+        merge_path = merge_path.with_estimates(cost=cost_estimate)
+        return merge_path
+
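# Plan shape assumed by the method above (editorial sketch; tables hypothetical):
# both inputs arrive sorted on the join key, so the join can merge them in a
# single pass and its output stays sorted on the merged key:
#
#   Merge Join (a.id = b.a_id)    output sorted on the merged key
#     -> Index Scan on a          (already sorted by a.id)
#     -> Sort (b.a_id)            (built by _create_sort_path)
#          -> Seq Scan on b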
+    def _create_hashjoin_path(
+        self, join_rel: RelOptInfo, *, outer_path: QueryPlan, inner_path: QueryPlan
+    ) -> QueryPlan:
+        """Constructs and initializes a hash join path for a specific intermediate.
+
+        This method assumes that hash joins are actually enabled.
+
+        Parameters
+        ----------
+        join_rel : RelOptInfo
+            The RelOptInfo of the join to construct
+        outer_path : QueryPlan
+            The access path for the outer relation of the join
+        inner_path : QueryPlan
+            The access path for the inner relation of the join
+        """
+        join_condition = self.predicates.joins_between(
+            outer_path.tables(), inner_path.tables()
+        )
+        workers = outer_path.get("estimated_workers", None)
+
+        hash_path = QueryPlan(
+            JoinOperator.HashJoin,
+            children=[outer_path, inner_path],
+            estimated_cardinality=join_rel.cardinality,
+            filter_predicate=join_condition,
+            estimated_workers=workers,
+        )
+
+        cost_estimate = self.cost_model.estimate_cost(self.query, hash_path)
+        hash_path = hash_path.with_estimates(cost=cost_estimate)
+        return hash_path
+
+    def _estimate_workers(self, relation: TableReference | str) -> int:
+        """Computes the number of worker processes to use for a specific base relation or index.
+
+        Parameters
+        ----------
+        relation : TableReference | str
+            The base table or index to estimate. Indexes should be represented by their name, whereas base tables can
+            be supplied either as a `TableReference` or by name.
+        """
+        if self._max_workers == 0:
+            # Parallel execution is disabled entirely.
+            return 0
+        n_pages = self.target_db.statistics().n_pages(relation)
+        workers = math.log(n_pages, 3) if n_pages > 0 else 0.0
+        if self._max_workers is None:
+            # No worker cap is configured, so we use the estimate as-is.
+            return round(workers)
+        return min(self._max_workers, round(workers))
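# Worked example of the heuristic above (editorial sketch; the page count is
# hypothetical). One additional worker is granted per roughly threefold growth in
# relation size, similar in spirit to PostgreSQL's own worker heuristic, and the
# result is then capped by the configured maximum:
#
#   import math
#   n_pages = 10_000
#   raw = math.log(n_pages, 3)      # ~8.38
#   round(raw)                      # -> 8 workers without a cap
#   min(4, round(raw))              # -> 4 workers with _max_workers = 4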
+    def _pg_cost_estimate(self, path: QueryPlan) -> float:
+        """Asks the native optimizer of the target database to cost a hinted query fragment matching the path."""
+        query_fragment = transform.extract_query_fragment(self.query, path.tables())
+        hinted_query = self.target_db.hinting().generate_hints(query_fragment, path)
+        try:
+            cost = self.target_db.optimizer().cost_estimate(hinted_query)
+            return cost
+        except DatabaseServerError:
+            return math.inf
+
+    def _is_pg_cost(self, cost_model: CostModel) -> bool:
+        """Checks whether the cost model delegates to the native optimizer of our target database."""
+        return (
+            isinstance(cost_model, native.NativeCostModel)
+            and cost_model.target_db == self.target_db
+        )
+
+    def _pg_plan(self, path: QueryPlan) -> Optional[QueryPlan]:
+        """Asks the native optimizer of the target database for the plan of a hinted query fragment matching the path."""
+        query_fragment = transform.extract_query_fragment(self.query, path.tables())
+        hinted_query = self.target_db.hinting().generate_hints(query_fragment, path)
+        try:
+            native_plan = self.target_db.optimizer().query_plan(hinted_query)
+            return native_plan
+        except DatabaseServerError:
+            return None
+
+    def _determine_join_keys(
+        self, *, outer_rel: RelOptInfo, inner_rel: RelOptInfo
+    ) -> Sequence[AbstractPredicate]:
+        """Determines all available join predicates between two relations.
+
+        The predicates are implicitly ANDed together.
+        """
+        join_predicates = self.query.predicates().joins_between(
+            outer_rel.intermediate, inner_rel.intermediate
+        )
+        if not join_predicates:
+            raise LogicError(
+                "Cross product detected. This should never happen this deep in the "
+                "optimization process. Intermediates are "
+                f"{outer_rel} and {inner_rel}"
+            )
+
+        match join_predicates:
+            case CompoundPredicate(op, children) if op == CompoundOperator.And:
+                join_keys: Sequence[AbstractPredicate] = children
+            case _:
+                join_keys = [join_predicates]
+
+        return join_keys
+
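# Worked example of the AND-splitting above (editorial sketch; the predicate is
# hypothetical). For a conjunctive join condition such as
#
#   a.x = b.x AND a.y = b.y
#
# joins_between() yields a CompoundPredicate whose children become two separate
# join keys, [a.x = b.x, a.y = b.y]. A simple condition like a.x = b.x is wrapped
# into a single-element list instead.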
+    def _extract_join_columns(
+        self,
+        join_key: AbstractPredicate,
+        *,
+        outer_rel: RelOptInfo,
+        inner_rel: RelOptInfo,
+    ) -> tuple[Optional[ColumnReference], Optional[ColumnReference]]:
+        """Provides the join columns that are joined together in the format (outer_col, inner_col).
+
+        This method assumes that we indeed only perform a binary equi-join and will break otherwise.
+        """
+        partners = join_key.join_partners()
+        if len(partners) != 2:
+            # TODO: in all further processing, we ignore the case where a path might already be sorted by more than
+            # one join key. This should occur very rarely, but handling it might provide a decent performance boost in
+            # those situations.
+            return None, None
+
+        partner: tuple[ColumnReference, ColumnReference] = util.simplify(partners)
+        first_col, second_col = partner
+
+        if (
+            first_col.table in outer_rel.intermediate
+            and second_col.table in inner_rel.intermediate
+        ):
+            return first_col, second_col
+        elif (
+            first_col.table in inner_rel.intermediate
+            and second_col.table in outer_rel.intermediate
+        ):
+            return second_col, first_col
+        else:
+            raise LogicError(
+                f"Join key {join_key} does not connect {outer_rel} and {inner_rel}"
+            )
+
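# Orientation example with hypothetical columns (editorial sketch): for the join
# key a.x = b.y with a in the outer and b in the inner relation, the method
# returns (a.x, b.y); if the key is written as b.y = a.x, the pair is flipped so
# that the outer column always comes first.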
+    def _is_sorted_by(self, path: QueryPlan, column: ColumnReference) -> bool:
+        """Checks whether a specific path is sorted by some column.
+
+        The column has to be the dominating part of the ordering, i.e. it is not sufficient for the column to appear
+        somewhere in the sort keys; it has to be the first one.
+        """
+        if not path.sort_keys:
+            return False
+
+        primary_sorting = path.sort_keys[0]
+        return primary_sorting.is_compatible_with(column)
+
+    def _sorting_subsumes(self, sorting: Sorting, *, other: Sorting) -> bool:
+        """Checks whether some sorting is "included" in another sorting.
+
+        We define subsumption as follows:
+        - If both sortings are equal, they subsume each other
+        - If one sorting is longer than the other, but the shorter one is a prefix of the longer one, the longer one
+          subsumes the shorter one
+
+        Parameters
+        ----------
+        sorting : Sorting
+            The sorting which should subsume the `other` sorting
+        other : Sorting
+            The sorting being subsumed
+        """
+        if sorting and not other:
+            # We should always be able to evict other paths if we are sorted (and cheaper) and the other path is not.
+            # Notice that we only check for sorting here; costs are handled elsewhere.
+            return True
+        if other and not sorting:
+            # We should never evict if we are not sorted but the other path is.
+            return False
+
+        if len(other) > len(sorting):
+            # We should never evict if the other path is more precise.
+            return False
+
+        for i, key in enumerate(sorting):
+            if i >= len(other):
+                # Our current path has more sort keys than the other path (i.e. it is more specific) and so far all
+                # sort keys have been equivalent. The other sorting is subsumed by our sorting.
+                return True
+
+            other_key = other[i]
+            if not key.is_compatible_with(other_key):
+                return False
+
+        return True
+
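# The prefix rule above in a nutshell (editorial sketch; hypothetical sort keys):
#
#   [a, b] subsumes [a]      -> True   (the shorter sorting is a prefix)
#   [a]    subsumes [a, b]   -> False  (the other sorting is more precise)
#   [a, b] subsumes [a, c]   -> False  (the second keys are incompatible)
#   [a, b] subsumes [a, b]   -> True   (equal sortings subsume each other)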
+    def _same_sorting(self, sorting: Sorting | None, *, other: Sorting | None) -> bool:
+        """Checks whether two sort orders are exactly equivalent."""
+        if sorting is None and other is None:
+            return True
+        if sorting is None or other is None:
+            return False
+
+        if len(sorting) != len(other):
+            return False
+
+        for key, other_key in zip(sorting, other):
+            if not key.is_compatible_with(other_key):
+                return False
+
+        return True
+
+    def _warn(self, msg: str) -> None:
+        """Emits a warning, but only if verbose output is enabled."""
+        if not self._verbose:
+            return
+        warnings.warn(msg, category=DPWarning)