PostBOUND 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- postbound/__init__.py +211 -0
- postbound/_base.py +6 -0
- postbound/_bench.py +1012 -0
- postbound/_core.py +1153 -0
- postbound/_hints.py +1373 -0
- postbound/_jointree.py +1079 -0
- postbound/_pipelines.py +1121 -0
- postbound/_qep.py +1986 -0
- postbound/_stages.py +876 -0
- postbound/_validation.py +734 -0
- postbound/db/__init__.py +72 -0
- postbound/db/_db.py +2348 -0
- postbound/db/_duckdb.py +785 -0
- postbound/db/mysql.py +1195 -0
- postbound/db/postgres.py +4216 -0
- postbound/experiments/__init__.py +12 -0
- postbound/experiments/analysis.py +674 -0
- postbound/experiments/benchmarking.py +54 -0
- postbound/experiments/ceb.py +877 -0
- postbound/experiments/interactive.py +105 -0
- postbound/experiments/querygen.py +334 -0
- postbound/experiments/workloads.py +980 -0
- postbound/optimizer/__init__.py +92 -0
- postbound/optimizer/__init__.pyi +73 -0
- postbound/optimizer/_cardinalities.py +369 -0
- postbound/optimizer/_joingraph.py +1150 -0
- postbound/optimizer/dynprog.py +1825 -0
- postbound/optimizer/enumeration.py +432 -0
- postbound/optimizer/native.py +539 -0
- postbound/optimizer/noopt.py +54 -0
- postbound/optimizer/presets.py +147 -0
- postbound/optimizer/randomized.py +650 -0
- postbound/optimizer/tonic.py +1479 -0
- postbound/optimizer/ues.py +1607 -0
- postbound/qal/__init__.py +343 -0
- postbound/qal/_qal.py +9678 -0
- postbound/qal/formatter.py +1089 -0
- postbound/qal/parser.py +2344 -0
- postbound/qal/relalg.py +4257 -0
- postbound/qal/transform.py +2184 -0
- postbound/shortcuts.py +70 -0
- postbound/util/__init__.py +46 -0
- postbound/util/_errors.py +33 -0
- postbound/util/collections.py +490 -0
- postbound/util/dataframe.py +71 -0
- postbound/util/dicts.py +330 -0
- postbound/util/jsonize.py +68 -0
- postbound/util/logging.py +106 -0
- postbound/util/misc.py +168 -0
- postbound/util/networkx.py +401 -0
- postbound/util/numbers.py +438 -0
- postbound/util/proc.py +107 -0
- postbound/util/stats.py +37 -0
- postbound/util/system.py +48 -0
- postbound/util/typing.py +35 -0
- postbound/vis/__init__.py +5 -0
- postbound/vis/fdl.py +69 -0
- postbound/vis/graphs.py +48 -0
- postbound/vis/optimizer.py +538 -0
- postbound/vis/plots.py +84 -0
- postbound/vis/tonic.py +70 -0
- postbound/vis/trees.py +105 -0
- postbound-0.19.0.dist-info/METADATA +355 -0
- postbound-0.19.0.dist-info/RECORD +67 -0
- postbound-0.19.0.dist-info/WHEEL +5 -0
- postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
- postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/optimizer/dynprog.py
@@ -0,0 +1,1825 @@
from __future__ import annotations

import collections
import itertools
import math
import warnings
from collections.abc import Callable, Iterable, Sequence
from dataclasses import dataclass
from typing import Optional

from .. import util
from .._core import (
    Cardinality,
    IntermediateOperator,
    JoinOperator,
    PhysicalOperator,
    ScanOperator,
    TableReference,
)
from .._qep import QueryPlan, SortKey
from .._stages import (
    CardinalityEstimator,
    CostModel,
    OptimizationPreCheck,
    PlanEnumerator,
)
from .._validation import (
    CrossProductPreCheck,
    EquiJoinPreCheck,
    InnerJoinPreCheck,
    SetOperationsPreCheck,
    SubqueryPreCheck,
    VirtualTablesPreCheck,
    merge_checks,
)
from ..db._db import Database, DatabasePool, DatabaseSchema, DatabaseServerError
from ..db.postgres import PostgresInterface, PostgresJoinHints, PostgresScanHints
from ..qal import transform
from ..qal._qal import (
    AbstractPredicate,
    ColumnExpression,
    ColumnReference,
    CompoundOperator,
    CompoundPredicate,
    QueryPredicates,
    SqlQuery,
)
from ..util import LogicError, jsondict
from . import native

DPTable = dict[frozenset[TableReference], QueryPlan]
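# Illustration (hypothetical tables t1, t2): the DP table maps each already-optimized
# intermediate to its cheapest known plan, e.g.
#   dp_table[frozenset({t1})]      -> cheapest access path for t1
#   dp_table[frozenset({t1, t2})]  -> cheapest join path for t1 joined with t2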


def _calc_plan_estimates(
    query: SqlQuery,
    plan: QueryPlan,
    *,
    cost_model: CostModel,
    cardinality_estimator: CardinalityEstimator,
) -> QueryPlan:
    """Handler method to update the cost and cardinality estimates of a given plan."""
    card_est = cardinality_estimator.calculate_estimate(query, plan.tables())
    plan = plan.with_estimates(cardinality=card_est)
    cost_est = cost_model.estimate_cost(query, plan)
    return plan.with_estimates(cost=cost_est)


def _collect_used_columns(
    query: SqlQuery, table: TableReference, *, schema: DatabaseSchema
) -> set[ColumnReference]:
    columns = query.columns_of(table)
    for star_expression in query.select_clause.star_expressions():
        if table not in star_expression.tables():
            continue

        columns |= schema.columns(table)
    return columns


class DynamicProgrammingEnumerator(PlanEnumerator):
    """A very basic dynamic programming-based plan enumerator.

    This enumerator is very basic because it does not implement any sophisticated pruning rules or traversal strategies and
    only focuses on a small subset of possible operators. It simply enumerates all possible access paths and join paths and
    picks the cheapest one. It should only serve as a starting point when lacking an actual decent enumerator implementation
    (see *Limitations* below). Its purpose is mainly to shield users that are only interested in the cost model or the
    cardinality estimator from having to implement their own enumerator in order to use the `TextBookOptimizationPipeline`.
    Notice that for experiments based on PostgreSQL, a much more sophisticated implementation is available with the
    `PostgresDynProg` enumerator (which is automatically selected when using the textbook pipeline with a
    Postgres target database).

    Limitations
    -----------

    - Only the cheapest access paths are considered, without taking sort orders into account. This prevents free merge join
      optimizations, i.e. if an access path is more expensive but already sorted, it will be discarded in favor of a cheaper
      alternative, even though a later merge join might become much cheaper due to the sort order.
    - No optimizations of intermediates are considered, i.e. no materialization or memoization of subplans.
    - Only the basic scan and join operators are considered. For scans, this includes sequential scan, index scan, index-only
      scan and bitmap scan. For joins, this includes nested loop join, hash join and sort merge join. These can be further
      restricted through the `supported_scan_ops` and `supported_join_ops` parameters.
    - Only simple SPJ queries are supported. Importantly, the query may not contain any set operations, subqueries, CTEs, etc.
      All joins must be inner equijoins and no cross products are allowed.
    - Aggregations, sorting, etc. are not considered. In this way, the enumerator is comparable to the ``join_search_hook``
      of PostgreSQL. We assume that such "technicalities" are handled when creating appropriate hints for the target database
      or, at the latest, when executing the query on the target database.

    Parameters
    ----------
    supported_scan_ops : Optional[set[ScanOperator]], optional
        The set of scan operators that should be considered during the enumeration. This should be a subset of the following
        operators: sequential scan, index scan, index-only scan, bitmap scan. If any other operators are included, they
        are simply never considered. By default, all operators that are available on the `target_db` are allowed.
    supported_join_ops : Optional[set[JoinOperator]], optional
        The set of join operators that should be considered during the enumeration. This should be a subset of the following
        operators: nested loop join, hash join, sort merge join. If any other operators are included, they are simply never
        considered. By default, all operators that are available on the `target_db` are allowed.
    target_db : Optional[Database], optional
        The target database system for which the optimization pipeline is intended. If omitted, the database is inferred
        from the `DatabasePool`.
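
    Examples
    --------
    A minimal usage sketch: `query` is a parsed `SqlQuery`, while `my_cost_model` and `my_card_est` are
    assumed user-supplied `CostModel` and `CardinalityEstimator` implementations (the surrounding pipeline
    wiring is not shown in this module)::

        enumerator = DynamicProgrammingEnumerator(
            supported_join_ops={JoinOperator.HashJoin, JoinOperator.NestedLoopJoin}
        )
        plan = enumerator.generate_execution_plan(
            query, cost_model=my_cost_model, cardinality_estimator=my_card_est
        )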
    """

    def __init__(
        self,
        *,
        supported_scan_ops: Optional[set[ScanOperator]] = None,
        supported_join_ops: Optional[set[JoinOperator]] = None,
        target_db: Optional[Database] = None,
    ) -> None:
        target_db = (
            target_db
            if target_db is not None
            else DatabasePool.get_instance().current_database()
        )

        supported_scan_ops = (
            supported_scan_ops if supported_scan_ops is not None else set(ScanOperator)
        )
        supported_join_ops = (
            supported_join_ops if supported_join_ops is not None else set(JoinOperator)
        )

        if target_db is not None:
            supported_scan_ops = {
                op for op in supported_scan_ops if target_db.hinting().supports_hint(op)
            }
            supported_join_ops = {
                op for op in supported_join_ops if target_db.hinting().supports_hint(op)
            }

        self.predicates: QueryPredicates = None

        self._target_db = target_db
        self._scan_ops = supported_scan_ops
        self._join_ops = supported_join_ops

    def generate_execution_plan(
        self, query, *, cost_model, cardinality_estimator
    ) -> QueryPlan:
        self.predicates = query.predicates()
        cost_model.initialize(self._target_db, query)
        cardinality_estimator.initialize(self._target_db, query)

        dp_table = self._determine_base_access_paths(
            query, cost_model=cost_model, cardinality_estimator=cardinality_estimator
        )
        final_plan = self._build_join_paths(
            query,
            dp_table=dp_table,
            cost_model=cost_model,
            cardinality_estimator=cardinality_estimator,
        )

        cost_model.cleanup()
        cardinality_estimator.cleanup()
        self.predicates = None
        return final_plan

    def pre_check(self) -> OptimizationPreCheck:
        return merge_checks(
            CrossProductPreCheck(),
            VirtualTablesPreCheck(),
            EquiJoinPreCheck(),
            InnerJoinPreCheck(),
            SubqueryPreCheck(),
            SetOperationsPreCheck(),
        )

    def describe(self) -> jsondict:
        return {
            "name": "dynamic_programming",
            "flavor": "default",
            "scan_ops": [op.name for op in self._scan_ops],
            "join_ops": [op.name for op in self._join_ops],
            "database_system": self._target_db.describe(),
        }

    def _determine_base_access_paths(
        self,
        query: SqlQuery,
        *,
        cost_model: CostModel,
        cardinality_estimator: CardinalityEstimator,
    ) -> DPTable:
        """Initializes a new dynamic programming table which includes the cheapest access paths for each base table.

        The base tables are directly inferred from the query.
        """
        dp_table: DPTable = {}

        for table in query.tables():
            # We determine access paths in two phases: initially, we just gather all possible access paths to a specific table.
            # Afterwards, we evaluate these candidates according to our cost model and select the cheapest one.
            candidate_plans: list[QueryPlan] = []
            filter_condition = self.predicates.filters_for(table)

            if ScanOperator.SequentialScan in self._scan_ops:
                candidate_plans.append(
                    QueryPlan(
                        ScanOperator.SequentialScan,
                        base_table=table,
                        filter_predicate=filter_condition,
                    )
                )
            candidate_plans += self._determine_index_paths(query, table)

            candidate_plans = [
                _calc_plan_estimates(
                    query,
                    candidate,
                    cost_model=cost_model,
                    cardinality_estimator=cardinality_estimator,
                )
                for candidate in candidate_plans
            ]

            cheapest_plan = min(candidate_plans, key=lambda plan: plan.estimated_cost)
            dp_table[frozenset([table])] = cheapest_plan

        return dp_table

    def _determine_index_paths(
        self, query: SqlQuery, table: TableReference
    ) -> Iterable[QueryPlan]:
        """Gathers all possible index access paths for a specific table.

        The access paths do not contain cost or cardinality estimates yet. This information must be added by the caller.
        """
        filter_condition = query.predicates().filters_for(table)
        required_columns = _collect_used_columns(
            query, table, schema=self._target_db.schema()
        )
        can_idx_only_scan = (
            len(required_columns) <= 1
        )  # check for <= 1 to include cross products with select star
        candidate_indexes = {
            column: self._target_db.schema().indexes_on(column)
            for column in required_columns
        }

        if not candidate_indexes:
            return []

        candidate_plans: list[QueryPlan] = []
        for column, available_indexes in candidate_indexes.items():
            if not available_indexes:
                continue
            sorting = [SortKey.of(column)]

            for index in available_indexes:
                if ScanOperator.IndexScan in self._scan_ops:
                    candidate_plans.append(
                        QueryPlan(
                            ScanOperator.IndexScan,
                            base_table=table,
                            index=index,
                            sort_keys=sorting,
                            filter_predicate=filter_condition,
                        )
                    )
                if can_idx_only_scan and ScanOperator.IndexOnlyScan in self._scan_ops:
                    candidate_plans.append(
                        QueryPlan(
                            ScanOperator.IndexOnlyScan,
                            base_table=table,
                            index=index,
                            sort_keys=sorting,
                            filter_predicate=filter_condition,
                        )
                    )

        if ScanOperator.BitmapScan in self._scan_ops:
            # The target DB/cost model is responsible for figuring out good bitmap index hierarchies.
            # Since bitmap scans combine multiple indexes, we do not consider bitmap scans in the above loop.
            # Furthermore, bitmap scans are partial sequential scans and thus do not provide a sort key.
            candidate_plans.append(
                QueryPlan(
                    ScanOperator.BitmapScan,
                    base_table=table,
                    indexes=candidate_indexes,
                    filter_predicate=filter_condition,
                )
            )

        return candidate_plans

    def _build_join_paths(
        self,
        query: SqlQuery,
        *,
        dp_table: DPTable,
        cost_model: CostModel,
        cardinality_estimator: CardinalityEstimator,
    ) -> QueryPlan:
        """Main optimization loop for the dynamic programmer.

        In this loop we construct increasingly large join paths by combining the optimal access paths of their input relations.
        At the end of the loop we have just constructed the cheapest join path for the entire query.

        All access paths are stored in the `dp_table`. This method assumes that the `dp_table` already contains the cheapest
        access paths for all base relations.

        Returns
        -------
        QueryPlan
            The final query plan that represents the cheapest join path for the given query.
        """

        predicates = query.predicates()
        candidate_tables = query.tables()

        for current_level in range(2, len(candidate_tables) + 1):
            # The current level describes how large the intermediate join paths that we are considering next are going to be.
            # For each potential intermediate that matches the current level, we determine the cheapest access path. This path
            # is going to re-use the cheapest access paths that we determined as part of an earlier iteration.

            current_intermediates = itertools.combinations(
                candidate_tables, current_level
            )
            access_paths = {
                frozenset(join): self._determine_cheapest_path(
                    query,
                    join,
                    dp_table=dp_table,
                    cost_model=cost_model,
                    cardinality_estimator=cardinality_estimator,
                )
                for join in current_intermediates
                if predicates.joins_tables(join)  # we do not consider cross products
            }
            dp_table.update(access_paths)

        return dp_table[frozenset(candidate_tables)]

    def _determine_cheapest_path(
        self,
        query: SqlQuery,
        intermediate: Iterable[TableReference],
        *,
        dp_table: DPTable,
        cost_model: CostModel,
        cardinality_estimator: CardinalityEstimator,
    ) -> QueryPlan:
        """DP subroutine that selects the cheapest access path for a specific intermediate."""
        intermediate = frozenset(intermediate)
        candidate_plans: list[QueryPlan] = []

        # We determine the cheapest access path to our intermediate by checking all potential join partners that could possibly
        # be used to construct this intermediate. This works by splitting the intermediate into an outer relation and an inner
        # one. To guarantee that we test each possible split, we generate the entire power set of the intermediate.
        # By basing our algorithm on the power set we can solve two important problems: first, we can easily generate bushy
        # plans (each time the outer plan has at least two tables and leaves more than one table for the inner plan, we obtain
        # a bushy plan). Second, we can also generate plans that invert the roles of inner and outer relation just as easily.
        # This is because the power set will eventually visit the set of all tables in the (former) inner relation, which will
        # then become the outer relation for the current iteration.
        #
        # Once again, we first gather all possible join paths and then evaluate the costs for each of them in order to select
        # the cheapest one.
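        #
        # Illustration: for the intermediate {A, B, C}, the non-trivial splits visited below are
        # ({A}, {B, C}), ({B}, {A, C}), ({C}, {A, B}), ({A, B}, {C}), ({A, C}, {B}) and ({B, C}, {A}),
        # so every join direction (and, for larger sets, every bushy shape) is eventually considered.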

        for outer in util.collections.powerset(intermediate):
            if not outer or len(outer) == len(intermediate):
                # Skip the empty set and the full set because we would lack a join partner.
                continue
            outer = frozenset(outer)

            # All tables of our intermediate that are not part of the outer relation have to become part of the inner relation
            inner = intermediate - outer

            outer_plan, inner_plan = dp_table.get(outer), dp_table.get(inner)
            if not outer_plan or not inner_plan:
                # If we do not find the access path for one of our inputs, it means that this input can only be constructed
                # using a cross product. Since we do not consider cross products, we can skip this split.
                continue

            join_condition = query.predicates().joins_between(outer, inner)

            if JoinOperator.NestedLoopJoin in self._join_ops:
                candidate_plans.append(
                    QueryPlan(
                        JoinOperator.NestedLoopJoin,
                        children=[outer_plan, inner_plan],
                        join_condition=join_condition,
                    )
                )

            if JoinOperator.HashJoin in self._join_ops:
                candidate_plans.append(
                    QueryPlan(
                        JoinOperator.HashJoin,
                        children=[outer_plan, inner_plan],
                        join_condition=join_condition,
                    )
                )

            if JoinOperator.SortMergeJoin in self._join_ops:
                # The target DB is ultimately responsible for figuring out whether it needs explicit sorts or whether it can
                # just merge directly.
                candidate_plans.append(
                    QueryPlan(
                        JoinOperator.SortMergeJoin,
                        children=[outer_plan, inner_plan],
                        join_condition=join_condition,
                    )
                )

        candidate_plans = [
            _calc_plan_estimates(
                query,
                candidate,
                cost_model=cost_model,
                cardinality_estimator=cardinality_estimator,
            )
            for candidate in candidate_plans
        ]

        return min(candidate_plans, key=lambda plan: plan.estimated_cost)


@dataclass
class RelOptInfo:
    """Simplified model of the RelOptInfo from the Postgres planner.

    We only specify the fields that we truly care about (and that are not covered by other parts of PostBOUND); the rest is
    omitted.

    For example, we don't need to worry about equivalence classes, because the Postgres enumerator is responsible for
    expanding the query with all EC predicates. Afterwards, we can use the query abstraction to determine available joins.
    """

    intermediate: frozenset[TableReference]
    """The relation that is represented by this RelOptInfo.

    This is simply the set of all tables that are part of the relation.
    """

    pathlist: list[QueryPlan]
    """All access paths that can be used to compute this relation (that we know of and care about).

    In contrast to the original PG implementation, we don't care about sorting this list. Retaining the sort order is mainly
    an implementation detail and optimization of PG.
    """

    partial_paths: list[QueryPlan]
    """All access paths that can be used to compute this relation with parallel workers. Otherwise the same as `pathlist`."""

    cheapest_path: Optional[QueryPlan]
    """The cheapest access path that we have found.

    Notice that this is only set after all paths for the RelOpt have been collected.
    """

    cheapest_partial_path: Optional[QueryPlan]
    """The cheapest access path that we have found for parallel execution.

    Notice that this is only set after all paths for the RelOpt have been collected.
    """

    cardinality: Cardinality
    """The estimated number of rows that are produced by this relation."""

    def __contains__(self, item: object) -> bool:
        if isinstance(item, RelOptInfo):
            item = item.intermediate
        if isinstance(item, TableReference):
            item = {item}

        return item < self.intermediate

    def __hash__(self) -> int:
        return hash(self.intermediate)

    def __eq__(self, other: object) -> bool:
        return isinstance(other, RelOptInfo) and self.intermediate == other.intermediate

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        tables = ", ".join(sorted(t.identifier() for t in self.intermediate))
        return f"{{{tables}}}"


Sorting = Sequence[SortKey]
"""A specific sort order for some relation."""

Level = int
"""The current level in the dynamic programming table."""

JoinRelLevel = dict[Level, list[RelOptInfo]]
"""Alias for our dynamic programming table."""

AddPathHook = Callable[["PostgresDynProg", RelOptInfo, QueryPlan, bool], None]
"""Hook method for users to get control in Postgres' *add_path()* method.

The method is responsible for storing a new candidate path in its `RelOptInfo`. It can decide whether the path is actually
worth storing or not. Furthermore, the method can also prune existing paths from the `pathlist` that are dominated by the
new path.

All of these actions should be performed in-place by modifying the `RelOptInfo` object. No return value is expected.

The last boolean parameter indicates whether the path is a partial path (i.e. a path for parallel execution) or not.
If it is, the path should be stored in the `partial_paths` list instead of the standard `pathlist`. Likewise, all checks
should be performed against the `partial_paths` list. If the hook should not handle partial paths, it can simply delegate
to the standard implementation on the enumerator.

If need be, the method can also access the current state of the dynamic programmer. Specifically, the enumerator provides
access to the query and database, the selected cost model and cardinality estimator, as well as to the current `JoinRelLevel`.
Finally, the method is allowed to invoke the default path addition logic by calling the `standard_add_path()` method on the
enumerator.
"""


class DPWarning(UserWarning):
    pass


class PostgresDynProg(PlanEnumerator):
    """Dynamic programming-based plan enumeration strategy that mimics the behavior of the Postgres query optimizer.

    Postgres-style dynamic programming means two things: first, we use the Postgres pruning rules to reduce the search space.
    Second, we apply the same opinionated traversal rules. Most importantly, this concerns when we consider materialization or
    memoization of subplans. If some of the related operators are not allowed, the traversal rules are adjusted accordingly.

    The implementation is based on a translation of the actual Postgres source code.

    Parameters
    ----------
    supported_scan_ops : Optional[set[ScanOperator]], optional
        The scan operators that the enumerator is allowed to use. If omitted, all scan operators that are supported by the
        target database are used.
    supported_join_ops : Optional[set[JoinOperator]], optional
        The join operators that the enumerator is allowed to use. If omitted, all join operators supported by the target
        database are used.
    enable_materialize : bool, optional
        Whether the optimizer is allowed to insert materialization operators into the query plan. This is enabled by default.
    enable_memoize : bool, optional
        Whether the optimizer is allowed to insert memoization operators into the query plan. This is enabled by default.
    enable_sort : bool, optional
        Whether the optimizer is allowed to perform explicit sorts in the query plan. Notice that setting this to *False* only
        prevents optional sorts. For example, if the query contains an *ORDER BY* clause, the optimizer will still perform the
        required sorting. However, it will not perform any merge joins that require a different kind of sorting.
        This is enabled by default.
    max_parallel_workers : Optional[int], optional
        The maximum number of parallel workers to consider for parallel plans. If omitted, no parallel workers are assumed.
    add_path_hook : Optional[AddPathHook], optional
        Optional function to implement custom path addition logic. See the documentation on `AddPathHook` for more details.
    target_db : Optional[PostgresInterface], optional
        The database on which the plans should be executed. This has to be a Postgres instance. If omitted, the database is
        inferred from the `DatabasePool`.
    verbose : bool, optional
        Whether the enumerator should issue warnings if it encounters unexpected situations, e.g. if it rejects a path
        because it is illegal. This includes cases where the cost of some operators cannot be estimated by the target
        database system.
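
    Examples
    --------
    A minimal usage sketch: `pg_instance` is an assumed `PostgresInterface` connection, `query` a parsed
    `SqlQuery`, and `cost_model` / `card_est` are assumed `CostModel` and `CardinalityEstimator`
    implementations::

        dynprog = PostgresDynProg(enable_memoize=False, target_db=pg_instance)
        plan = dynprog.generate_execution_plan(
            query, cost_model=cost_model, cardinality_estimator=card_est
        )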
    """

    def __init__(
        self,
        *,
        supported_scan_ops: Optional[set[ScanOperator]] = None,
        supported_join_ops: Optional[set[JoinOperator]] = None,
        enable_materialize: bool = True,
        enable_memoize: bool = True,
        enable_sort: bool = True,
        max_parallel_workers: Optional[int] = None,
        add_path_hook: Optional[AddPathHook] = None,
        target_db: Optional[PostgresInterface] = None,
        verbose: bool = False,
    ) -> None:
        target_db = (
            target_db
            if target_db is not None
            else DatabasePool.get_instance().current_database()
        )
        if not isinstance(target_db, PostgresInterface):
            raise LogicError(
                "The PostgresDynProg enumerator can only be used with a Postgres database "
                "(but you can execute the plans on any database that supports the required hints)."
            )

        supported_scan_ops = (
            supported_scan_ops if supported_scan_ops is not None else PostgresScanHints
        )
        supported_join_ops = (
            supported_join_ops if supported_join_ops is not None else PostgresJoinHints
        )

        if target_db is not None:
            supported_scan_ops = {
                op for op in supported_scan_ops if target_db.hinting().supports_hint(op)
            }
            supported_join_ops = {
                op for op in supported_join_ops if target_db.hinting().supports_hint(op)
            }

        self.query: SqlQuery = None
        self.predicates: QueryPredicates = None
        self.cost_model: CostModel = None
        self.cardinality_estimator: CardinalityEstimator = None
        self.join_rel_level: JoinRelLevel = None
        self.target_db = target_db

        self._scan_ops = supported_scan_ops
        self._join_ops = supported_join_ops
        self._enable_materialize = enable_materialize
        self._enable_memoize = enable_memoize
        self._enable_sort = enable_sort
        self._max_workers = max_parallel_workers if max_parallel_workers else 0
        self._add_path_hook = add_path_hook

        self._verbose = verbose

    def infer_settings(self) -> None:
        """Sets all allowed operators according to the configuration of the target database.

        For example, if the target database has index scans disabled, the enumerator will disable them as well.
        """
        allowed_scan_ops: list[ScanOperator] = []
        if self.target_db.config["enable_seqscan"] == "on":
            allowed_scan_ops.append(ScanOperator.SequentialScan)
        if self.target_db.config["enable_indexscan"] == "on":
            allowed_scan_ops.append(ScanOperator.IndexScan)
        if self.target_db.config["enable_indexonlyscan"] == "on":
            allowed_scan_ops.append(ScanOperator.IndexOnlyScan)
        if self.target_db.config["enable_bitmapscan"] == "on":
            allowed_scan_ops.append(ScanOperator.BitmapScan)
        self._scan_ops = set(allowed_scan_ops)

        allowed_join_ops: list[JoinOperator] = []
        if self.target_db.config["enable_nestloop"] == "on":
            allowed_join_ops.append(JoinOperator.NestedLoopJoin)
        if self.target_db.config["enable_hashjoin"] == "on":
            allowed_join_ops.append(JoinOperator.HashJoin)
        if self.target_db.config["enable_mergejoin"] == "on":
            allowed_join_ops.append(JoinOperator.SortMergeJoin)
        self._join_ops = set(allowed_join_ops)

        self._enable_materialize = self.target_db.config["enable_material"] == "on"
        self._enable_memoize = self.target_db.config["enable_memoize"] == "on"
        self._enable_sort = self.target_db.config["enable_sort"] == "on"

        self._max_workers = int(
            self.target_db.config["max_parallel_workers_per_gather"]
        )

    def generate_execution_plan(
        self, query, *, cost_model, cardinality_estimator
    ) -> QueryPlan:
        self.query = transform.add_ec_predicates(query)
        self.predicates = self.query.predicates()

        cardinality_estimator.initialize(self.target_db, query)
        cost_model.initialize(self.target_db, query)
        self.cardinality_estimator = cardinality_estimator
        self.cost_model = cost_model

        base_rels = self._init_base_rels()
        self._set_base_rel_pathlists(base_rels)

        final_rel = self._standard_join_search(initial_rels=base_rels)
        assert final_rel.cheapest_path is not None, (
            "No valid plan found for the given query."
        )

        if self._is_pg_cost(cost_model):
            # This seems weird at first, so let's explain what is going on here:
            # If we use the actual PG cost model, we can make better decisions about what the optimal plan is.
            # The reason is that PG might parallelize a portion of the upper operators (anything that happens after the final
            # join, such as aggregations). With these upper rel parallelizations, a partial plan might become cheaper than its
            # counterpart that executes the last join in parallel (and which we compute). Consequently, that very path might
            # become cheaper than the cheapest path that we have computed so far.
            #
            # Therefore, we need to explicitly consider all partial paths as well as the sequential ones and retrieve the
            # final cost estimates for them.
            # But this only works if we know we are using the PG execution engine and cost model.

            plans = final_rel.pathlist + self._generate_pseudo_gather_paths(final_rel)
            plan_costs = {plan: self._pg_cost_estimate(plan) for plan in plans}
            cheapest_plan = util.argmin(plan_costs)
        else:
            cheapest_plan = final_rel.cheapest_path

        cost_model.cleanup()
        cardinality_estimator.cleanup()
        self.query = None
        self.predicates = None
        self.cost_model = None
        self.cardinality_estimator = None
        return cheapest_plan

    def describe(self) -> jsondict:
        return {
            "name": "dynamic_programming",
            "flavor": "postgres",
            "scan_ops": [op.name for op in self._scan_ops],
            "join_ops": [op.name for op in self._join_ops],
            "database_system": self.target_db.describe(),
        }

    def pre_check(self) -> OptimizationPreCheck:
        return merge_checks(
            CrossProductPreCheck(),
            EquiJoinPreCheck(),
            InnerJoinPreCheck(),
            VirtualTablesPreCheck(),
            SetOperationsPreCheck(),
        )

    def standard_add_path(
        self, rel: RelOptInfo, path: QueryPlan, *, is_partial: bool = False
    ) -> None:
        """Checks whether a specific path is worthy of further consideration. If it is, the path is stored in the pathlist.

        This method's naming is exceptionally bad, but this is the way it is named in the PG source code, so we stick with it.

        On an abstract level, this method implements the following logic:

        For each existing path in the pathlist, we check whether the new path dominates the existing one.
        If it does, we evict the existing path. If the existing path is better, we keep it and discard the new path.

        To determine whether one path dominates another, we compare the paths' costs and sort orders. For one path to
        dominate the other one, it must be cheaper and at least as well sorted.
        If the paths are sorted differently, we keep them both.

        This basic logic is executed for both regular paths and partial paths. Those are paths that can be executed in parallel
        and will eventually be merged with the main paths (once the parallel portion is finished).
        While "normal" paths (completely sequential ones or parallel paths that have finished their parallel portion) are
        stored in the `pathlist`, partial paths are stored in the `partial_paths` list.
        """
        current_paths = rel.partial_paths if is_partial else rel.pathlist
        if not current_paths:
            current_paths.append(path)
            return

        result_paths: list[QueryPlan] = []
        keep_new = True  # we assume that we want to keep the new path to handle new sort orders correctly
        new_cost = path.estimated_cost

        for i, old_path in enumerate(current_paths):
            if not self._sorting_subsumes(
                path.sort_keys, other=old_path.params.sort_keys
            ):
                result_paths.append(old_path)
                continue

            # Postgres uses a fuzzy cost comparison (compare_path_costs_fuzzily() from pathnode.c) and evicts old paths even
            # if their cost is slightly better than the new path, if the new path is better sorted.
            old_cost = old_path.estimated_cost
            new_dominates = (
                new_cost < old_cost
                if self._same_sorting(path.sort_keys, other=old_path.sort_keys)
                else new_cost <= 1.01 * old_cost
            )

            if new_dominates:
                # The new path is better (or at least equally well) sorted and cheaper, we can evict the old path
                keep_new = True  # strictly speaking, this is not necessary, but it makes our intention clearer
                continue  # don't break here, we need to check the remaining paths
            else:
                # The existing path is better (or at least equally well) sorted and cheaper, we don't need the new path.
                # This also means that we can stop checking the remaining paths. The new one won't get added and the old ones
                # successfully competed against the path that just beat our new one. We can keep them all.
                result_paths.extend(current_paths[i:])
                keep_new = False
                break

        if keep_new:
            result_paths.append(path)

        if is_partial:
            rel.partial_paths = result_paths
        else:
            rel.pathlist = result_paths

    def _init_base_rels(self) -> list[RelOptInfo]:
        """Creates and initializes the RelOptInfos for all tables in the query, without computing any access paths."""

        # Combines logic from make_one_rel() and set_base_rel_sizes()
        initial_rels: list[RelOptInfo] = []

        for base_rel in self.query.tables():
            intermediate = frozenset([base_rel])
            cardinality = self.cardinality_estimator.calculate_estimate(
                self.query, intermediate
            )

            initial_rels.append(
                RelOptInfo(
                    intermediate=intermediate,
                    pathlist=[],
                    partial_paths=[],
                    cheapest_path=None,
                    cardinality=cardinality,
                    cheapest_partial_path=None,
                )
            )

        return initial_rels

    def _set_base_rel_pathlists(self, initial_rels: list[RelOptInfo]) -> None:
        """Adds access paths to the base relations.

        The specific paths depend on the available operators and the current schema.
        """
        # This function leads to a larger chain of function calls which in the end culminates in set_plain_rel_pathlist().
        # We implement the behavior of that function here.

        for rel in initial_rels:
            self._create_sequential_paths(rel)
            self._create_index_paths(rel)
            self._create_bitmap_path(rel)

            self._generate_gather_paths(rel)
            self._set_cheapest(rel)

    def _standard_join_search(self, initial_rels: list[RelOptInfo]) -> RelOptInfo:
        """Main entry point into the dynamic programming join search.

        The implementation assumes that the `initial_rels` have already been initialized. Therefore, the dynamic programmer
        is only concerned with building join rels.
        """
        levels_needed = len(initial_rels)
        self.join_rel_level: JoinRelLevel = collections.defaultdict(list)
        self.join_rel_level[1] = initial_rels

        for level in range(2, levels_needed + 1):
            self._join_search_one_level(level)

            for rel in self.join_rel_level[level]:
                self._generate_gather_paths(rel)
                self._set_cheapest(rel)

        assert len(self.join_rel_level[levels_needed]) == 1, (
            "Final join rel level should only contain one relation."
        )
        final_rel = self.join_rel_level[levels_needed][0]
        self.join_rel_level = None
        return final_rel

    def _join_search_one_level(self, level: Level) -> None:
        """Handler method to construct all intermediates of the current level for the DP join search.

        Parameters
        ----------
        level : int
            The number of base tables that should be contained in each intermediate relation that we will construct.
        """
        # First, consider left-deep plans
        for rel1 in self.join_rel_level[level - 1]:
            # the body of this loop implements the logic of make_join_rel() (which is called by make_rels_by_clause_joins())

            for rel2 in self.join_rel_level[1]:
                if len(rel1.intermediate & rel2.intermediate) > 0:
                    # don't join anything that we have already joined
                    continue
                if not self.predicates.joins_between(
                    rel1.intermediate, rel2.intermediate
                ):
                    # don't consider cross products
                    continue

                # functionality of build_join_rel()
                intermediate = rel1.intermediate | rel2.intermediate
                join_rel = self._build_join_rel(intermediate)

                # functionality of populate_joinrel_with_paths()
                self._add_paths_to_joinrel(join_rel, outer_rel=rel1, inner_rel=rel2)
                self._add_paths_to_joinrel(join_rel, outer_rel=rel2, inner_rel=rel1)
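
        # Second, consider bushy plans: both join inputs are composite relations built on earlier levels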
        for outer_size in range(2, level // 2 + 1):
            inner_size = level - outer_size
            if inner_size < 2:
                continue

            for rel1 in self.join_rel_level[outer_size]:
                for rel2 in self.join_rel_level[inner_size]:
                    if len(rel1.intermediate & rel2.intermediate) > 0:
                        # don't join anything that we have already joined
                        continue
                    if not self.predicates.joins_between(
                        rel1.intermediate, rel2.intermediate
                    ):
                        # don't consider cross products
                        continue

                    # functionality of build_join_rel()
                    intermediate = rel1.intermediate | rel2.intermediate
                    join_rel = self._build_join_rel(intermediate)

                    # functionality of populate_joinrel_with_paths()
                    self._add_paths_to_joinrel(join_rel, outer_rel=rel1, inner_rel=rel2)
                    self._add_paths_to_joinrel(join_rel, outer_rel=rel2, inner_rel=rel1)

    def _build_join_rel(self, intermediate: frozenset[TableReference]) -> RelOptInfo:
        """Constructs and initializes a new RelOptInfo for a specific intermediate. No access paths are added, yet."""

        # This function integrates the logic of find_join_rel() and build_join_rel()
        level = len(intermediate)
        for rel in self.join_rel_level[level]:
            if rel.intermediate == intermediate:
                return rel

        cardinality = self.cardinality_estimator.calculate_estimate(
            self.query, intermediate
        )
        join_rel = RelOptInfo(
            intermediate=intermediate,
            pathlist=[],
            partial_paths=[],
            cheapest_path=None,
            cardinality=cardinality,
            cheapest_partial_path=None,
        )

        self.join_rel_level[level].append(join_rel)
        return join_rel

    def _add_paths_to_joinrel(
        self, join_rel: RelOptInfo, *, outer_rel: RelOptInfo, inner_rel: RelOptInfo
    ) -> None:
        """Builds all possible access paths for a specific join relation.

        The build process adheres to the assignment of join directions from the parameters, i.e. the `outer_rel` will always be
        the outer relation and the `inner_rel` will always be the inner relation. If it does not matter what the specific
        assignment is, this method has to be called twice with inversed parameters.
        """
        if JoinOperator.NestedLoopJoin in self._join_ops:
            self._match_unsorted_outer(
                join_rel, outer_rel=outer_rel, inner_rel=inner_rel
            )
        if JoinOperator.SortMergeJoin in self._join_ops:
            self._sort_inner_outer(join_rel, outer_rel=outer_rel, inner_rel=inner_rel)
        if JoinOperator.HashJoin in self._join_ops:
            self._hash_inner_outer(join_rel, outer_rel=outer_rel, inner_rel=inner_rel)

    def _sort_inner_outer(
        self, join_rel: RelOptInfo, *, outer_rel: RelOptInfo, inner_rel: RelOptInfo
    ) -> None:
        """Constructs all potential merge join paths for a specific intermediate.

        This method assumes that merge joins are actually enabled.
        """

        # The implementation of this function is loosely based on sort_inner_outer() of the PG source code.
        # However, since the original function is tightly coupled with Postgres' internal query and planner representation, we
        # deviate a bit more than usual from the original implementation.
        #
        # Specifically, our implementation performs the following high-level algorithm:
        # For each potential join key between the input relations, we check whether they are already sorted based on the join
        # key. If they are not, we introduce an explicit sort operator for the path. Afterwards, we create a merge join based
        # on the candidate paths.
        #
        # As a consequence, this function essentially also implements the merge-join specific behavior of
        # match_unsorted_outer(). In our implementation, that function only handles nested-loop joins.
        #
        # Notice that Postgres does not consider materialization or memoization of subpaths for merge joins, so neither do we.

        join_keys = self._determine_join_keys(outer_rel=outer_rel, inner_rel=inner_rel)

        # How often can we nest? Quite often! But this should still be fairly readable.
        # We simply loop over all join keys. For each join key, we try each combination of inner and outer relations and
        # see if we end up with a decent merge join path.
        for join_key in join_keys:
            outer_col, inner_col = self._extract_join_columns(
                join_key, outer_rel=outer_rel, inner_rel=inner_rel
            )
            if not outer_col or not inner_col:
                continue

            for outer_path in outer_rel.pathlist:
                if (
                    not self._is_sorted_by(outer_path, outer_col)
                    and not self._enable_sort
                ):
                    # If the path is not already sorted and we are not allowed to sort it ourselves, there is no point in
                    # merge joining. Just skip the path.
                    continue

                outer_path = (
                    outer_path
                    if self._is_sorted_by(outer_path, outer_col)
                    else self._create_sort_path(outer_path, sort_key=outer_col)
                )

                for inner_path in inner_rel.pathlist:
                    if (
                        not self._is_sorted_by(inner_path, inner_col)
                        and not self._enable_sort
                    ):
                        # If the path is not already sorted and we are not allowed to sort it ourselves, there is no point in
                        # merge joining. Just skip the path.
                        continue

                    inner_path = (
                        inner_path
                        if self._is_sorted_by(inner_path, inner_col)
                        else self._create_sort_path(inner_path, sort_key=inner_col)
                    )

                    merge_path = self._create_mergejoin_path(
                        join_rel, outer_path=outer_path, inner_path=inner_path
                    )
                    self._add_path(join_rel, merge_path)

            for outer_partial in outer_rel.partial_paths:
                # same as above, just for partial paths - we try each combination of partial outer with regular inner
                if (
                    not self._is_sorted_by(outer_partial, outer_col)
                    and not self._enable_sort
                ):
                    continue

                outer_path = (
                    outer_partial
                    if self._is_sorted_by(outer_partial, outer_col)
                    else self._create_sort_path(outer_partial, sort_key=outer_col)
                )

                for inner_path in inner_rel.pathlist:
                    if (
                        not self._is_sorted_by(inner_path, inner_col)
                        and not self._enable_sort
                    ):
                        continue

                    inner_path = (
                        inner_path
                        if self._is_sorted_by(inner_path, inner_col)
                        else self._create_sort_path(inner_path, sort_key=inner_col)
                    )

                    merge_path = self._create_mergejoin_path(
                        join_rel, outer_path=outer_path, inner_path=inner_path
                    )
                    self._add_path(join_rel, merge_path, is_partial=True)
|
|
1053
|
+
|
|
1054
|
+
def _match_unsorted_outer(
|
|
1055
|
+
self, join_rel: RelOptInfo, *, outer_rel: RelOptInfo, inner_rel: RelOptInfo
|
|
1056
|
+
) -> None:
|
|
1057
|
+
"""Constructs all potential nested loop-join paths for a specific intermediate.
|
|
1058
|
+
|
|
1059
|
+
This also includes adding paths with memoization or materialization if they are allowed and appear useful.
|
|
1060
|
+
|
|
1061
|
+
This method assumes that nested loop joins are actually enabled.
|
|
1062
|
+
"""
|
|
1063
|
+
|
|
1064
|
+
# as outlined in _sort_inner_outer(), we only handle nested-loop joins here
|
|
1065
|
+
# Nested-loop joins are inherently unsorted, so we only care about the cheapest access paths to the input relations
|
|
1066
|
+
# here.
|
|
1067
|
+
|
|
1068
|
+
outer_path, inner_path = outer_rel.cheapest_path, inner_rel.cheapest_path
|
|
1069
|
+
if not outer_path or not inner_path:
|
|
1070
|
+
raise LogicError("No cheapest paths set")
|
|
1071
|
+
|
|
1072
|
+
# Try plain NLJ first, variations (memoization/materialization) afterwards
|
|
1073
|
+
nlj_path = self._create_nestloop_path(
|
|
1074
|
+
join_rel, outer_path=outer_path, inner_path=inner_path
|
|
1075
|
+
)
|
|
1076
|
+
self._add_path(join_rel, nlj_path)
|
|
1077
|
+
|
|
1078
|
+
if self._enable_memoize:
|
|
1079
|
+
# For memoization, we attempt to cache each potential join key for the inner relation. Since there might be
|
|
1080
|
+
# multiple such keys, especially for larger intermediates, we need to check multiple
|
|
1081
|
+
|
|
1082
|
+
join_predicate = self.query.predicates().joins_between(
|
|
1083
|
+
outer_rel.intermediate, inner_rel.intermediate
|
|
1084
|
+
)
|
|
1085
|
+
if join_predicate is None:
|
|
1086
|
+
raise LogicError(
|
|
1087
|
+
"Cross product detected. This should never happen so deep down in the optimization process. "
|
|
1088
|
+
f"Intermediates {outer_rel} and {inner_rel}"
|
|
1089
|
+
)
|
|
1090
|
+
|
|
1091
|
+
for first_col, second_col in join_predicate.join_partners():
|
|
1092
|
+
cache_key = (
|
|
1093
|
+
first_col
|
|
1094
|
+
if first_col.table in inner_rel.intermediate
|
|
1095
|
+
else second_col
|
|
1096
|
+
)
|
|
1097
|
+
if cache_key.table not in inner_rel.intermediate:
|
|
1098
|
+
raise LogicError(
|
|
1099
|
+
"Cache key must be part of the inner relation.",
|
|
1100
|
+
f"Key was {cache_key}, relation was {inner_rel}",
|
|
1101
|
+
)
|
|
1102
|
+
|
|
1103
|
+
memo_inner = self._create_memoize_path(inner_path, cache_key=cache_key)
|
|
1104
|
+
memo_nlj = self._create_nestloop_path(
|
|
1105
|
+
join_rel, outer_path=outer_path, inner_path=memo_inner
|
|
1106
|
+
)
|
|
1107
|
+
self._add_path(join_rel, memo_nlj)
|
|
1108
|
+
|
|
1109
|
+
if self._enable_materialize:
|
|
1110
|
+
mat_path = self._create_materialize_path(inner_path)
|
|
1111
|
+
mat_nlj = self._create_nestloop_path(
|
|
1112
|
+
join_rel, outer_path=outer_path, inner_path=mat_path
|
|
1113
|
+
)
|
|
1114
|
+
self._add_path(join_rel, mat_nlj)
|
|
1115
|
+
|
|
1116
|
+
outer_partial = outer_rel.cheapest_partial_path
|
|
1117
|
+
if not outer_partial:
|
|
1118
|
+
return
|
|
1119
|
+
par_nlj = self._create_nestloop_path(
|
|
1120
|
+
join_rel, outer_path=outer_partial, inner_path=inner_path
|
|
1121
|
+
)
|
|
1122
|
+
self._add_path(join_rel, par_nlj, is_partial=True)

        if self._enable_memoize:
            # same as above
            join_predicate = self.query.predicates().joins_between(
                outer_rel.intermediate, inner_rel.intermediate
            )
            if join_predicate is None:
                raise LogicError(
                    "Cross product detected. This should never happen so deep down in the optimization process. "
                    f"Intermediates {outer_rel} and {inner_rel}"
                )

            for first_col, second_col in join_predicate.join_partners():
                cache_key = (
                    first_col
                    if first_col.table in inner_rel.intermediate
                    else second_col
                )
                if cache_key.table not in inner_rel.intermediate:
                    raise LogicError(
                        "Cache key must be part of the inner relation.",
                        f"Key was {cache_key}, relation was {inner_rel}",
                    )

                memo_inner = self._create_memoize_path(inner_path, cache_key=cache_key)
                par_nlj = self._create_nestloop_path(
                    join_rel, outer_path=outer_partial, inner_path=memo_inner
                )
                self._add_path(join_rel, par_nlj, is_partial=True)
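
    # Illustration of the cache-key choice above (hypothetical schema): for a join predicate
    # movie.company_id = company.id with company on the inner side, company.id is selected as the cache key,
    # since it is the join column that belongs to the inner relation.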

    def _hash_inner_outer(
        self, join_rel: RelOptInfo, *, outer_rel: RelOptInfo, inner_rel: RelOptInfo
    ) -> None:
        """Constructs the hash join path for a specific intermediate.

        In contrast to merge joins and nested loop joins, there is really only one way to perform a hash join.

        This method assumes that hash joins are actually enabled.
        """

        # Hash joins are inherently unsorted, so we only care about the cheapest access paths to the input
        # relations here.
        # Notice that Postgres does not consider materialization or memoization of subpaths for hash joins,
        # so neither do we.

        outer_path, inner_path = outer_rel.cheapest_path, inner_rel.cheapest_path
        if not outer_path or not inner_path:
            raise LogicError("No cheapest paths set")
        hash_path = self._create_hashjoin_path(
            join_rel, outer_path=outer_path, inner_path=inner_path
        )
        self._add_path(join_rel, hash_path)

        outer_partial = outer_rel.cheapest_partial_path
        if not outer_partial:
            return
        par_hash = self._create_hashjoin_path(
            join_rel, outer_path=outer_partial, inner_path=inner_path
        )
        self._add_path(join_rel, par_hash, is_partial=True)
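
    # Note the asymmetry in the partial variant above: only the outer (probe) side uses a partial path, while
    # the inner (build) side always uses the full cheapest path. In Postgres terms, each worker would build its
    # own copy of the hash table (a parallel-oblivious hash join).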

    def _add_path(
        self, rel: RelOptInfo, path: QueryPlan, *, is_partial: bool = False
    ) -> None:
        """Checks whether a specific path is worthy of further consideration. If it is, the path is stored in the pathlist.

        This method's naming is exceptionally bad, but this is the way it is named in the PG source code, so we stick
        with it.

        If an `_add_path_hook` has been specified, this hook takes control after checking for illegal paths. The normal
        path adding logic is skipped in this case. Otherwise, we call the standard path adding logic from
        `standard_add_path()`.

        The method handles both regular paths and partial paths. Which one it is can be switched using the `is_partial`
        parameter.
        """

        if math.isinf(path.estimated_cost):
            # The cost model returns infinite costs for illegal query plans.
            self._warn(f"Rejecting illegal path {path}")
            return

        if self._add_path_hook:
            self._add_path_hook(self, rel, path, is_partial)
        else:
            self.standard_add_path(rel, path, is_partial=is_partial)
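
    # A minimal sketch of a custom hook (hypothetical; any callable of this shape works). It receives the
    # enumerator itself, the target RelOptInfo, the candidate path, and the is_partial flag, mirroring the call
    # above:
    #
    #     def keep_cheapest_only(dp, rel: RelOptInfo, path: QueryPlan, is_partial: bool) -> None:
    #         target = rel.partial_paths if is_partial else rel.pathlist
    #         if not target or path.estimated_cost < min(p.estimated_cost for p in target):
    #             target.clear()
    #             target.append(path)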

    def _set_cheapest(self, rel: RelOptInfo) -> None:
        """Determines the cheapest path in terms of costs from the pathlist and partial pathlist."""
        if rel.pathlist:
            rel.pathlist.sort(key=lambda path: path.estimated_cost)
            cheapest_path = rel.pathlist[0]
            rel.cheapest_path = cheapest_path
        else:
            raise LogicError("No valid paths for relation found.")

        if rel.partial_paths:
            rel.partial_paths.sort(key=lambda path: path.estimated_cost)
            cheapest_partial = rel.partial_paths[0]
            rel.cheapest_partial_path = cheapest_partial
        else:
            # Empty partial pathlist is allowed
            pass

    def _generate_gather_paths(self, rel: RelOptInfo) -> None:
        """Scans a RelOptInfo for partial paths that might be gathered into regular paths."""
        for partial_path in rel.partial_paths:
            n_workers = partial_path.get("estimated_workers", None)
            if n_workers is None:
                raise LogicError(
                    f"Partial path does not have estimated workers: {partial_path}"
                )
            gather_path = partial_path.parallelize(n_workers)
            query_fragment = transform.extract_query_fragment(
                self.query, gather_path.tables()
            )
            cost_estimate = self.cost_model.estimate_cost(query_fragment, gather_path)
            gather_path = gather_path.with_estimates(cost=cost_estimate)

            # It is important not to delete the partial path after gathering it. The path might still be useful
            # in an upper-level partial path!
            self._add_path(rel, gather_path)

    def _generate_pseudo_gather_paths(self, rel: RelOptInfo) -> list[QueryPlan]:
        """Creates gather paths that execute the entire plan in parallel, rather than gathering individual joins."""
        gather_paths: list[QueryPlan] = []

        for partial_path in rel.partial_paths:
            n_workers = partial_path.get("estimated_workers", None)
            if n_workers is None:
                raise LogicError(
                    f"Partial path does not have estimated workers: {partial_path}"
                )

            pseudo_node = QueryPlan(
                node_type="Gather", children=partial_path, parallel_workers=n_workers
            )
            gather_paths.append(pseudo_node)

        return gather_paths
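
    # Sketch of the resulting plan shape (illustrative):
    #
    #   Gather (parallel_workers=n)
    #     -> <partial path>, e.g. a partial join tree over partial scans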

    def _create_seqscan_path(self, rel: RelOptInfo) -> QueryPlan:
        """Constructs and initializes a sequential scan path for a specific relation.

        This method assumes that sequential scans are actually enabled.
        """
        baserel = util.simplify(rel.intermediate)
        filter_condition = self.predicates.filters_for(baserel)
        workers = self._estimate_workers(baserel)
        path = QueryPlan(
            ScanOperator.SequentialScan,
            children=[],
            base_table=baserel,
            estimated_cardinality=rel.cardinality,
            filter_predicate=filter_condition,
            estimated_workers=workers,
        )

        # Instead of running the cost estimation on the full query, we transform the query into a SELECT * query.
        # This prevents the query optimizer from inserting additional operators that would artificially increase
        # the cost of the scan.
        # For example, consider the following JOB query: SELECT min(k.keyword), ... FROM keyword k JOIN ... WHERE ...
        # If we run the cost estimation without the star transformation, the query fragment for k would end up being
        # SELECT min(k.keyword) FROM keyword k WHERE ...
        # Executing this query would require an additional aggregation step which should not be performed at this
        # point. By transforming the query into a star query, we get the plain result set without any additional
        # operators.
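        # With the star transformation, the fragment for k instead becomes SELECT * FROM keyword k WHERE ...,
        # which plans as a plain scan of the relation.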
        star_query = transform.as_star_query(self.query)
        cost = self.cost_model.estimate_cost(star_query, path)
        path = path.with_estimates(cost=cost)
        return path

    def _create_sequential_paths(self, rel: RelOptInfo) -> None:
        """Builds all sequential paths for a specific relation.

        This includes the regular sequential scan as well as the partial version.
        """
        if ScanOperator.SequentialScan not in self._scan_ops:
            return
        seq_path = self._create_seqscan_path(rel)
        self._add_path(rel, seq_path)

        if seq_path["estimated_workers"]:
            self._add_path(rel, seq_path, is_partial=True)

    def _create_index_paths(self, rel: RelOptInfo) -> None:
        """Builds all index scan paths for a specific relation.

        This method considers each index on the relation that spans columns from the query. If this is just a single
        column, it additionally tries to create an index-only scan.

        If both kinds of index scans are disabled, this method does nothing.
        """
        if (
            ScanOperator.IndexScan not in self._scan_ops
            and ScanOperator.IndexOnlyScan not in self._scan_ops
        ):
            return

        base_table = util.simplify(rel.intermediate)
        filter_condition = self.predicates.filters_for(base_table)
        required_columns = self.query.columns_of(base_table)
        idx_only_scan = (
            ScanOperator.IndexOnlyScan in self._scan_ops and len(required_columns) <= 1
        )
        candidate_indexes = {
            column: self.target_db.schema().indexes_on(column)
            for column in required_columns
        }

        index_paths: list[QueryPlan] = []
        for column, available_indexes in candidate_indexes.items():
            if not available_indexes:
                continue
            sorting = [SortKey.of(column)]

            for index in available_indexes:
                workers = self._estimate_workers(index)
                if ScanOperator.IndexScan in self._scan_ops:
                    idx_path = QueryPlan(
                        ScanOperator.IndexScan,
                        base_table=base_table,
                        index=index,
                        sort_keys=sorting,
                        filter_predicate=filter_condition,
                        estimated_workers=workers,
                    )
                    index_paths.append(idx_path)
                if idx_only_scan:
                    idx_path = QueryPlan(
                        ScanOperator.IndexOnlyScan,
                        base_table=base_table,
                        index=index,
                        sort_keys=sorting,
                        filter_predicate=filter_condition,
                        estimated_workers=workers,
                    )
                    index_paths.append(idx_path)

        # See comment in _create_seqscan_path() for an explanation of the star query. We do the same here.
        star_query = transform.as_star_query(self.query)
        for path in index_paths:
            cost_estimate = self.cost_model.estimate_cost(star_query, path)
            path = path.with_estimates(cost=cost_estimate)
            self._add_path(rel, path)

            if path["estimated_workers"]:
                self._add_path(rel, path, is_partial=True)

    def _create_bitmap_path(self, rel: RelOptInfo) -> None:
        """Constructs and initializes a bitmap scan path for a specific relation.

        Since we don't model bitmap index scans, bitmap ANDs, etc. explicitly, we only need to create a single bitmap
        path. Afterwards, we let the hinting backend figure out how to perform the scan precisely.

        If bitmap scans are disabled, this method does nothing.
        """
        if ScanOperator.BitmapScan not in self._scan_ops:
            return

        # We deviate from the vanilla PG implementation and only consider the cheapest bitmap path. Since they are
        # all unsorted anyway (due to the final sequential scan), this should be fine.
        #
        # Notice that the hinting backend is responsible for selecting the appropriate bitmap index hierarchies.

        base_table = util.simplify(rel.intermediate)
        required_columns = self.query.columns_of(base_table)
        candidate_indexes = {
            column: self.target_db.schema().indexes_on(column)
            for column in required_columns
        }
        if not any(candidate_indexes.values()):
            # We only bail out if there are no candidate indexes at all and explicitly accept the case where there
            # is just a single candidate index. This is because we can still perform an index lookup followed by a
            # sequential scan of the pages. This might be better than doing a full sequential scan or a full random
            # I/O index scan.
            return

        filter_condition = self.predicates.filters_for(base_table)

        # bitmap scans parallelize the scan portion, not the index portion
        workers = self._estimate_workers(base_table)

        bitmap_path = QueryPlan(
            ScanOperator.BitmapScan,
            base_table=base_table,
            indexes=candidate_indexes,
            filter_predicate=filter_condition,
            estimated_workers=workers,
        )

        # See comment in _create_seqscan_path() for an explanation of the star query. We do the same here.
        star_query = transform.as_star_query(self.query)
        cost_estimate = self.cost_model.estimate_cost(star_query, bitmap_path)
        bitmap_path = bitmap_path.with_estimates(cost=cost_estimate)

        self._add_path(rel, bitmap_path)
        if workers:
            self._add_path(rel, bitmap_path, is_partial=True)
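
    # For reference, the operator tree that Postgres itself would build for such a path (and which the hinting
    # backend reconstructs) looks roughly like this:
    #
    #   Bitmap Heap Scan on <base_table>
    #     -> BitmapAnd / BitmapOr
    #          -> Bitmap Index Scan on <index_1>
    #          -> Bitmap Index Scan on <index_2>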

    def _create_memoize_path(
        self, path: QueryPlan, *, cache_key: ColumnReference
    ) -> QueryPlan:
        """Constructs and initializes a memo path for a specific relation.

        The `cache_key` is the column that identifies different entries in the memo table.

        This method assumes that memoization is actually enabled.
        """
        workers = path.get("estimated_workers", None)

        memo_path = QueryPlan(
            IntermediateOperator.Memoize,
            children=path,
            lookup_key=ColumnExpression(cache_key),
            estimated_cardinality=path.estimated_cardinality,
            estimated_workers=workers,
        )

        if self._is_pg_cost(self.cost_model):
            # Cost estimation happens as part of the parent join, so we just use a dummy here;
            # see the comment in generate_execution_plan() for details. We use the same reasoning here.
            return memo_path

        cost_estimate = self.cost_model.estimate_cost(self.query, memo_path)
        memo_path = memo_path.with_estimates(cost=cost_estimate)
        return memo_path
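
    # Memoize caches inner-side results per distinct cache-key value, so repeated probes with an already-seen
    # key can skip re-executing the child path. This is also why the node only ever appears below a nested-loop
    # join (cf. the comment in _create_nestloop_path()).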

    def _create_materialize_path(self, path: QueryPlan) -> QueryPlan:
        """Constructs and initializes a materialize path for a specific relation.

        This method assumes that materialization is actually enabled.
        """
        workers = path.get("estimated_workers", None)

        mat_path = QueryPlan(
            IntermediateOperator.Materialize,
            children=path,
            estimated_cardinality=path.estimated_cardinality,
            estimated_workers=workers,
        )

        if self._is_pg_cost(self.cost_model):
            # Cost estimation happens as part of the parent join, so we just use a dummy here;
            # see the comment in generate_execution_plan() for details. We use the same reasoning here.
            return mat_path

        cost_estimate = self.cost_model.estimate_cost(self.query, mat_path)
        mat_path = mat_path.with_estimates(cost=cost_estimate)
        return mat_path

    def _create_sort_path(
        self, path: QueryPlan, *, sort_key: ColumnReference
    ) -> QueryPlan:
        """Constructs and initializes a sort path for a specific relation on a specific column.

        The column to sort by is specified by `sort_key`. Notice that the sort path will always be created, even if
        the path is already sorted by the key.

        This method assumes that sorting is actually enabled.
        """
        workers = path.get("estimated_workers", None)

        sort_path = QueryPlan(
            IntermediateOperator.Sort,
            children=path,
            sort_keys=[SortKey.of(sort_key)],
            estimated_cardinality=path.estimated_cardinality,
            estimated_workers=workers,
        )

        cost_estimate = self.cost_model.estimate_cost(self.query, sort_path)
        sort_path = sort_path.with_estimates(cost=cost_estimate)
        return sort_path

    def _create_nestloop_path(
        self, join_rel: RelOptInfo, *, outer_path: QueryPlan, inner_path: QueryPlan
    ) -> QueryPlan:
        """Constructs and initializes a nested loop join path for a specific intermediate.

        This method assumes that nested loop joins are actually enabled.

        Parameters
        ----------
        join_rel : RelOptInfo
            The RelOptInfo of the join to construct
        outer_path : QueryPlan
            The access path for the outer relation in the join
        inner_path : QueryPlan
            The access path for the inner relation in the join
        """
        join_condition = self.predicates.joins_between(
            outer_path.tables(), inner_path.tables()
        )
        workers = outer_path.get("estimated_workers", None)

        nlj_path = QueryPlan(
            JoinOperator.NestedLoopJoin,
            children=[outer_path, inner_path],
            estimated_cardinality=join_rel.cardinality,
            filter_predicate=join_condition,
            estimated_workers=workers,
        )

        # We need to be very hacky here. Let's explain why:
        # PG is rather picky about which operators can be used for which queries. For example, it is generally not
        # possible to perform an index scan for an arbitrary SELECT * FROM relation query. Instead, PG requires the
        # existence of a filter condition on the indexed column. Likewise, Memoize and Materialize can never be the
        # final operator in a plan, and must be part of a nested-loop join.
        # Therefore, it might not be possible to correctly estimate the cost of these operators on their own, but
        # only in conjunction with a nested-loop join.
        # To account for this, we check whether we have any of these special/"weird" operators on the inner side of
        # our NLJ. If we do, and our target cost model is actually PG itself, we ask PG to estimate the cost for the
        # entire NLJ plan and just extract the costs that we require.

        weird_ops: set[PhysicalOperator] = {
            ScanOperator.IndexScan,
            ScanOperator.IndexOnlyScan,
            IntermediateOperator.Memoize,
            IntermediateOperator.Materialize,
        }
        if self._is_pg_cost(self.cost_model) and inner_path.operator in weird_ops:
            native_plan = self._pg_plan(nlj_path)

            if native_plan:
                nlj_node = native_plan.find_first_node(
                    lambda n: n.operator == JoinOperator.NestedLoopJoin
                )
                weird_node = native_plan.find_first_node(
                    lambda n: n.operator == inner_path.operator, direction="inner"
                )
                inner_cost = weird_node.estimated_cost if weird_node else math.inf
                nlj_cost = (
                    nlj_node.estimated_cost if not math.isinf(inner_cost) else math.inf
                )
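
                # If PG could not produce the requested operator (weird_node is None), inner_cost becomes
                # infinite, which forces nlj_cost to infinity as well and makes _add_path() reject the candidate
                # later on.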

                updated_inner = inner_path.with_estimates(cost=inner_cost)
                nlj_path = QueryPlan(
                    JoinOperator.NestedLoopJoin,
                    children=[outer_path, updated_inner],
                    estimated_cardinality=join_rel.cardinality,
                    estimated_cost=nlj_cost,
                    filter_predicate=join_condition,
                    estimated_workers=workers,
                )
            else:
                cost_estimate = self.cost_model.estimate_cost(self.query, nlj_path)
                nlj_path = nlj_path.with_estimates(cost=cost_estimate)

        else:
            cost_estimate = self.cost_model.estimate_cost(self.query, nlj_path)
            nlj_path = nlj_path.with_estimates(cost=cost_estimate)

        return nlj_path

    def _create_mergejoin_path(
        self, join_rel: RelOptInfo, *, outer_path: QueryPlan, inner_path: QueryPlan
    ) -> QueryPlan:
        """Constructs and initializes a merge join path for a specific intermediate.

        This method assumes that merge joins are actually enabled and that the input paths are already sorted by the
        join key. However, we take a conservative approach and only assume sorting by the first sort key of each path.

        Parameters
        ----------
        join_rel : RelOptInfo
            The RelOptInfo of the join to construct
        outer_path : QueryPlan
            The access path for the outer relation in the join
        inner_path : QueryPlan
            The access path for the inner relation in the join
        """

        # This function assumes that outer_path and inner_path are already sorted appropriately.
        merge_key = outer_path.sort_keys[0].merge_with(inner_path.sort_keys[0])
        join_condition = self.predicates.joins_between(
            outer_path.tables(), inner_path.tables()
        )
        workers = outer_path.get("estimated_workers", None)

        merge_path = QueryPlan(
            JoinOperator.SortMergeJoin,
            children=[outer_path, inner_path],
            sort_keys=[merge_key],
            estimated_cardinality=join_rel.cardinality,
            filter_predicate=join_condition,
            estimated_workers=workers,
        )

        cost_estimate = self.cost_model.estimate_cost(self.query, merge_path)
        merge_path = merge_path.with_estimates(cost=cost_estimate)
        return merge_path

    def _create_hashjoin_path(
        self, join_rel: RelOptInfo, *, outer_path: QueryPlan, inner_path: QueryPlan
    ) -> QueryPlan:
        """Constructs and initializes a hash join path for a specific intermediate.

        This method assumes that hash joins are actually enabled.

        Parameters
        ----------
        join_rel : RelOptInfo
            The RelOptInfo of the join to construct
        outer_path : QueryPlan
            The access path for the outer relation in the join
        inner_path : QueryPlan
            The access path for the inner relation in the join
        """
        join_condition = self.predicates.joins_between(
            outer_path.tables(), inner_path.tables()
        )
        workers = outer_path.get("estimated_workers", None)

        hash_path = QueryPlan(
            JoinOperator.HashJoin,
            children=[outer_path, inner_path],
            estimated_cardinality=join_rel.cardinality,
            filter_predicate=join_condition,
            estimated_workers=workers,
        )

        cost_estimate = self.cost_model.estimate_cost(self.query, hash_path)
        hash_path = hash_path.with_estimates(cost=cost_estimate)
        return hash_path

    def _estimate_workers(self, relation: TableReference | str) -> int:
        """Computes the number of worker processes to use for a specific base relation or index.

        Parameters
        ----------
        relation : TableReference | str
            The base table or index to estimate. Indexes should be represented by their name whereas base tables can
            be supplied either as a `TableReference` or by name.
        """
        if not self._max_workers:
            return 0
        n_pages = self.target_db.statistics().n_pages(relation)
        workers = math.log(n_pages, 3)
        if self._max_workers is None:
            return round(workers)
        return min(self._max_workers, round(workers))
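
    # Illustration of the heuristic above: a relation with ~10,000 pages yields round(log_3(10000)) =
    # round(8.38) = 8 workers, capped at _max_workers. This mirrors Postgres, which grants roughly one
    # additional worker for every three-fold increase in relation size.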

    def _pg_cost_estimate(self, path: QueryPlan) -> float:
        query_fragment = transform.extract_query_fragment(self.query, path.tables())
        hinted_query = self.target_db.hinting().generate_hints(query_fragment, path)
        try:
            cost = self.target_db.optimizer().cost_estimate(hinted_query)
            return cost
        except DatabaseServerError:
            return math.inf

    def _is_pg_cost(self, cost_model: CostModel) -> bool:
        return (
            isinstance(cost_model, native.NativeCostModel)
            and cost_model.target_db == self.target_db
        )

    def _pg_plan(self, path: QueryPlan) -> Optional[QueryPlan]:
        query_fragment = transform.extract_query_fragment(self.query, path.tables())
        hinted_query = self.target_db.hinting().generate_hints(query_fragment, path)
        try:
            native_plan = self.target_db.optimizer().query_plan(hinted_query)
            return native_plan
        except DatabaseServerError:
            return None

    def _determine_join_keys(
        self, *, outer_rel: RelOptInfo, inner_rel: RelOptInfo
    ) -> list[AbstractPredicate]:
        """Determines all available join predicates between two relations.

        The predicates are implicitly ANDed together.
        """
        join_predicates = self.query.predicates().joins_between(
            outer_rel.intermediate, inner_rel.intermediate
        )
        if not join_predicates:
            raise LogicError(
                "Cross product detected. This should never happen so deep down in the "
                "optimization process. Intermediates are "
                f"{outer_rel} and {inner_rel}"
            )

        match join_predicates:
            case CompoundPredicate(op, children) if op == CompoundOperator.And:
                join_keys: Sequence[AbstractPredicate] = children
            case _:
                join_keys = [join_predicates]

        return join_keys
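
    # Example: for a conjunctive predicate R.a = S.a AND R.b = S.b, the two equi-join conditions are returned as
    # separate join keys; a single join predicate is wrapped in a one-element list.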

    def _extract_join_columns(
        self,
        join_key: AbstractPredicate,
        *,
        outer_rel: RelOptInfo,
        inner_rel: RelOptInfo,
    ) -> tuple[Optional[ColumnReference], Optional[ColumnReference]]:
        """Provides the join columns that are joined together in the format (outer_col, inner_col).

        This method assumes that we indeed only perform a binary equi-join and will break otherwise.
        """
        partners = join_key.join_partners()
        if len(partners) != 2:
            # TODO: in all further processing, we ignore the case where a path might already be sorted by more than
            # one join key. This should occur very rarely, but handling it might provide a decent performance boost
            # in those situations.
            return None, None

        partner: tuple[ColumnReference, ColumnReference] = util.simplify(partners)
        first_col, second_col = partner

        if (
            first_col.table in outer_rel.intermediate
            and second_col.table in inner_rel.intermediate
        ):
            return first_col, second_col
        elif (
            first_col.table in inner_rel.intermediate
            and second_col.table in outer_rel.intermediate
        ):
            return second_col, first_col
        else:
            raise LogicError(
                f"Join columns {first_col} and {second_col} do not match "
                f"relations {outer_rel} and {inner_rel}"
            )

    def _is_sorted_by(self, path: QueryPlan, column: ColumnReference) -> bool:
        """Checks whether a specific path is sorted by some column.

        The column has to be the dominating part of the ordering, i.e. it is not sufficient that the column appears
        somewhere among the sort keys; it has to be the first one.
        """
        if not path.sort_keys:
            return False

        primary_sorting = path.sort_keys[0]
        return primary_sorting.is_compatible_with(column)
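
    # Example: a path sorted by [b ASC, a ASC] is not considered sorted by a, even though a appears among the
    # sort keys.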

    def _sorting_subsumes(self, sorting: Sorting, *, other: Sorting) -> bool:
        """Checks whether some sorting is "included" in another sorting.

        We define subsumption as follows:
        - If both sortings are equal, they subsume each other
        - If one sorting is longer than the other, but the shorter one is a prefix of the longer one, the longer one
          subsumes the shorter one

        Parameters
        ----------
        sorting : Sorting
            The sorting which should subsume the `other` sorting
        other : Sorting
            The sorting being subsumed
        """
        if sorting and not other:
            # We should always be able to evict other paths if we are sorted (and cheaper) and the other path is not.
            # Notice that we only check for sorting here, costs are handled elsewhere.
            return True
        if other and not sorting:
            # we should never evict if we are not sorted but the other path is
            return False

        if len(other) > len(sorting):
            # we should never evict if the other path is more precise
            return False

        for i, key in enumerate(sorting):
            if i >= len(other):
                # Our current path has more sort keys than the other path (i.e. it is more specific) and so far all
                # sort keys have been equivalent. The other sorting is subsumed by our sorting.
                return True

            other_key = other[i]
            if not key.is_compatible_with(other_key):
                return False

        return True
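
    # Examples: [a ASC, b ASC] subsumes [a ASC] (prefix), and any sorting subsumes the empty sorting, but
    # [a ASC] does not subsume [a ASC, b ASC], since the latter is more precise.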

    def _same_sorting(self, sorting: Sorting | None, *, other: Sorting | None) -> bool:
        """Checks whether two sort orders are exactly equivalent."""
        if sorting is None and other is None:
            return True
        if sorting is None or other is None:
            return False

        if len(sorting) != len(other):
            return False

        for key, other_key in zip(sorting, other):
            if not key.is_compatible_with(other_key):
                return False

        return True

    def _warn(self, msg: str) -> None:
        if not self._verbose:
            return
        warnings.warn(msg, category=DPWarning)