PostBOUND 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- postbound/__init__.py +211 -0
- postbound/_base.py +6 -0
- postbound/_bench.py +1012 -0
- postbound/_core.py +1153 -0
- postbound/_hints.py +1373 -0
- postbound/_jointree.py +1079 -0
- postbound/_pipelines.py +1121 -0
- postbound/_qep.py +1986 -0
- postbound/_stages.py +876 -0
- postbound/_validation.py +734 -0
- postbound/db/__init__.py +72 -0
- postbound/db/_db.py +2348 -0
- postbound/db/_duckdb.py +785 -0
- postbound/db/mysql.py +1195 -0
- postbound/db/postgres.py +4216 -0
- postbound/experiments/__init__.py +12 -0
- postbound/experiments/analysis.py +674 -0
- postbound/experiments/benchmarking.py +54 -0
- postbound/experiments/ceb.py +877 -0
- postbound/experiments/interactive.py +105 -0
- postbound/experiments/querygen.py +334 -0
- postbound/experiments/workloads.py +980 -0
- postbound/optimizer/__init__.py +92 -0
- postbound/optimizer/__init__.pyi +73 -0
- postbound/optimizer/_cardinalities.py +369 -0
- postbound/optimizer/_joingraph.py +1150 -0
- postbound/optimizer/dynprog.py +1825 -0
- postbound/optimizer/enumeration.py +432 -0
- postbound/optimizer/native.py +539 -0
- postbound/optimizer/noopt.py +54 -0
- postbound/optimizer/presets.py +147 -0
- postbound/optimizer/randomized.py +650 -0
- postbound/optimizer/tonic.py +1479 -0
- postbound/optimizer/ues.py +1607 -0
- postbound/qal/__init__.py +343 -0
- postbound/qal/_qal.py +9678 -0
- postbound/qal/formatter.py +1089 -0
- postbound/qal/parser.py +2344 -0
- postbound/qal/relalg.py +4257 -0
- postbound/qal/transform.py +2184 -0
- postbound/shortcuts.py +70 -0
- postbound/util/__init__.py +46 -0
- postbound/util/_errors.py +33 -0
- postbound/util/collections.py +490 -0
- postbound/util/dataframe.py +71 -0
- postbound/util/dicts.py +330 -0
- postbound/util/jsonize.py +68 -0
- postbound/util/logging.py +106 -0
- postbound/util/misc.py +168 -0
- postbound/util/networkx.py +401 -0
- postbound/util/numbers.py +438 -0
- postbound/util/proc.py +107 -0
- postbound/util/stats.py +37 -0
- postbound/util/system.py +48 -0
- postbound/util/typing.py +35 -0
- postbound/vis/__init__.py +5 -0
- postbound/vis/fdl.py +69 -0
- postbound/vis/graphs.py +48 -0
- postbound/vis/optimizer.py +538 -0
- postbound/vis/plots.py +84 -0
- postbound/vis/tonic.py +70 -0
- postbound/vis/trees.py +105 -0
- postbound-0.19.0.dist-info/METADATA +355 -0
- postbound-0.19.0.dist-info/RECORD +67 -0
- postbound-0.19.0.dist-info/WHEEL +5 -0
- postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
- postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/_pipelines.py
ADDED
|
@@ -0,0 +1,1121 @@
|
|
|
1
|
+
"""Provides PostBOUND's main optimization pipeline.
|
|
2
|
+
|
|
3
|
+
In fact, PostBOUND does not provide a single pipeline implementation. Rather, different pipeline types exist to accommodate
|
|
4
|
+
different use-cases. See the documentation of the general `OptimizationPipeline` base class for details. That class serves as
|
|
5
|
+
the smallest common denominator among all pipeline implementations.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import abc
|
|
11
|
+
from typing import Optional, Protocol, Self
|
|
12
|
+
|
|
13
|
+
from ._hints import PhysicalOperatorAssignment, PlanParameterization
|
|
14
|
+
from ._qep import QueryPlan
|
|
15
|
+
from ._stages import (
|
|
16
|
+
CardinalityEstimator,
|
|
17
|
+
CompleteOptimizationAlgorithm,
|
|
18
|
+
CostModel,
|
|
19
|
+
IncrementalOptimizationStep,
|
|
20
|
+
JoinOrderOptimization,
|
|
21
|
+
ParameterGeneration,
|
|
22
|
+
PhysicalOperatorSelection,
|
|
23
|
+
PlanEnumerator,
|
|
24
|
+
)
|
|
25
|
+
from ._validation import (
|
|
26
|
+
EmptyPreCheck,
|
|
27
|
+
OptimizationPreCheck,
|
|
28
|
+
UnsupportedQueryError,
|
|
29
|
+
UnsupportedSystemError,
|
|
30
|
+
merge_checks,
|
|
31
|
+
)
|
|
32
|
+
from .db._db import Database, DatabasePool
|
|
33
|
+
from .qal._qal import SqlQuery
|
|
34
|
+
from .util._errors import StateError
|
|
35
|
+
from .util.jsonize import jsondict
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class OptimizationPipeline(abc.ABC):
|
|
39
|
+
"""The optimization pipeline is the main tool to apply different strategies to optimize SQL queries.
|
|
40
|
+
|
|
41
|
+
Depending on the specific scenario, different concrete pipeline implementations exist. For example, to apply multi-stage
|
|
42
|
+
optimization design (e.g. consisting of join ordering and a subsequent physical operator selection), the
|
|
43
|
+
`MultiStageOptimizationPipeline` exists. Similarly, for optimization algorithms that perform join ordering and operator
|
|
44
|
+
selection in one process, an `IntegratedOptimizationPipeline` is available. The `TextBookOptimizationPipeline` is modelled
|
|
45
|
+
after the traditional interplay of cardinality estimator, cost model and plan enumerator. Lastly, to model approaches that
|
|
46
|
+
subsequently improve query plans by correcting some previous optimization decisions (e.g. transforming a hash join to a
|
|
47
|
+
nested loop join), the `IncrementalOptimizationPipeline` is provided. Consult the individual pipeline documentation for
|
|
48
|
+
more details. This class only describes the basic interface that is shared by all the pipeline implementations.
|
|
49
|
+
|
|
50
|
+
If in doubt what the best pipeline implementation is, it is probably best to start with the
|
|
51
|
+
`MultiStageOptimizationPipeline` or the `TextBookOptimizationPipeline`, since they are the most flexible.
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
@abc.abstractmethod
|
|
55
|
+
def query_execution_plan(self, query: SqlQuery) -> QueryPlan:
|
|
56
|
+
"""Applies the current pipeline configuration to obtain an optimized plan for the input query.
|
|
57
|
+
|
|
58
|
+
Parameters
|
|
59
|
+
----------
|
|
60
|
+
query : SqlQuery
|
|
61
|
+
The query that should be optimized
|
|
62
|
+
|
|
63
|
+
Returns
|
|
64
|
+
-------
|
|
65
|
+
QueryPlan
|
|
66
|
+
An optimized query execution plan for the input query.
|
|
67
|
+
|
|
68
|
+
If the optimization strategies only provide partial optimization decisions (e.g. physical operators for a subset of
|
|
69
|
+
the joins), it is up to the pipeline to fill the gaps in order to provide a complete execution plan. A typical
|
|
70
|
+
approach could be to delegate this task to the optimizer of the target database by providing it the partial
|
|
71
|
+
optimization information.
|
|
72
|
+
|
|
73
|
+
Raises
|
|
74
|
+
------
|
|
75
|
+
UnsupportedQueryError
|
|
76
|
+
If the selected optimization algorithms cannot be applied to the specific query, e.g. because it contains
|
|
77
|
+
unsupported features.
|
|
78
|
+
"""
|
|
79
|
+
raise NotImplementedError
|
|
80
|
+
|
|
81
|
+
def optimize_query(self, query: SqlQuery) -> SqlQuery:
|
|
82
|
+
"""Applies the current pipeline configuration to optimize the input query.
|
|
83
|
+
|
|
84
|
+
This process also involves the generation of appropriate optimization information that enforces the selected
|
|
85
|
+
optimization decision when the query is executed on an actual database system.
|
|
86
|
+
|
|
87
|
+
Parameters
|
|
88
|
+
----------
|
|
89
|
+
query : SqlQuery
|
|
90
|
+
The query that should be optimized
|
|
91
|
+
|
|
92
|
+
Returns
|
|
93
|
+
-------
|
|
94
|
+
SqlQuery
|
|
95
|
+
A transformed query that encapsulates all the optimization decisions made by the pipeline. What this
|
|
96
|
+
actually means depends on the selected optimization strategies, as well as specifics of the target database
|
|
97
|
+
system:
|
|
98
|
+
|
|
99
|
+
Depending on the optimization strategy the optimization decisions can range from simple operator selections
|
|
100
|
+
(such as "no nested loop join for this join") to entire physical query execution plans (consisting of a
|
|
101
|
+
join order, as well as scan and join operators for all parts of the plan) and anything in between. For
|
|
102
|
+
novel cardinality estimation approaches, the optimization info could also be structured such that the
|
|
103
|
+
default cardinality estimates are overwritten.
|
|
104
|
+
|
|
105
|
+
Furthermore, the way the optimization info is expressed depends on the selected database system. Most systems
|
|
106
|
+
do not allow a direct modification of the query optimizer's implementation. Therefore, PostBOUND takes an indirect
|
|
107
|
+
approach: it emits system-specific hints that enable corrections for individual optimizer decisions (such as
|
|
108
|
+
disabling a specific physical operator). For example, PostgreSQL allows to use planner options such as
|
|
109
|
+
``SET enable_nestloop = 'off'`` to disable nested loop joins for the all subsequent queries in the current
|
|
110
|
+
connection. MySQL provides hints like ``BNL(R S)`` to recommend a block-nested loop join or hash join (depending
|
|
111
|
+
on the MySQL version) to the optimizer for a specific join. These hints are inserted into comment blocks in the
|
|
112
|
+
final SQL query. Likewise, some systems treat certain SQL keywords differently or provide their own extensions.
|
|
113
|
+
This also allows to modify the underlying plans. For example, when SQLite encouters a *CROSS JOIN* syntax in the
|
|
114
|
+
*FROM* clause, it does not try to optimize the join order and uses the order in which the tables are specified in
|
|
115
|
+
the relation instead.
|
|
116
|
+
|
|
117
|
+
Therefore, the resulting query will differ from the original input query in a number of ways. However, the
|
|
118
|
+
produced result sets should still be equivalent. If this is not the case, something went severly wrong
|
|
119
|
+
during query optimization. Take a look at the `db` module for more details on the database system support
|
|
120
|
+
and the query generation capabilities.
|
|
121
|
+
|
|
122
|
+
Raises
|
|
123
|
+
------
|
|
124
|
+
UnsupportedQueryError
|
|
125
|
+
If the selected optimization algorithms cannot be applied to the specific query, e.g. because it contains
|
|
126
|
+
unsupported features.
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
References
|
|
130
|
+
----------
|
|
131
|
+
|
|
132
|
+
.. PostgreSQL query planning options: https://www.postgresql.org/docs/15/runtime-config-query.html
|
|
133
|
+
.. MySQL optimizer hints: https://dev.mysql.com/doc/refman/8.0/en/optimizer-hints.html
|
|
134
|
+
.. SQLite *CROSS JOIN* handling: https://www.sqlite.org/optoverview.html#crossjoin
|
|
135
|
+
"""
|
|
136
|
+
execution_plan = self.query_execution_plan(query)
|
|
137
|
+
hinting_service = self.target_database().hinting()
|
|
138
|
+
return hinting_service.generate_hints(query, execution_plan)
|
|
139
|
+
|
|
140
|
+
@abc.abstractmethod
|
|
141
|
+
def target_database(self) -> Database:
|
|
142
|
+
"""Provides the current target database.
|
|
143
|
+
|
|
144
|
+
Returns
|
|
145
|
+
-------
|
|
146
|
+
Database
|
|
147
|
+
The database for which the input queries should be optimized
|
|
148
|
+
"""
|
|
149
|
+
raise NotImplementedError
|
|
150
|
+
|
|
151
|
+
@abc.abstractmethod
|
|
152
|
+
def describe(self) -> jsondict:
|
|
153
|
+
"""Generates a description of the current pipeline configuration.
|
|
154
|
+
|
|
155
|
+
This description is intended to transparently document which optimization strategies have been selected and
|
|
156
|
+
how they have been instantiated. It can be JSON-serialized and will be included in the output of the benchmarking
|
|
157
|
+
utilities.
|
|
158
|
+
|
|
159
|
+
Returns
|
|
160
|
+
-------
|
|
161
|
+
jsondict
|
|
162
|
+
The actual description
|
|
163
|
+
"""
|
|
164
|
+
raise NotImplementedError
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class IntegratedOptimizationPipeline(OptimizationPipeline):
    """This pipeline is intended for algorithms that calculate the entire query plan in a single process.

    To configure the pipeline, use the `setup_optimization_algorithm` method followed by the `build` method (in line
    with the other pipelines).

    Parameters
    ----------
    target_db : Optional[Database], optional
        The database for which the optimized queries should be generated. If this is not given, the default database
        is extracted from the `DatabasePool`.
    """

    def __init__(self, target_db: Optional[Database] = None) -> None:
        # Fall back to the session-wide default database if no target was given explicitly.
        self._target_db = (
            target_db
            if target_db is not None
            else DatabasePool.get_instance().current_database()
        )
        self._optimization_algorithm: Optional[CompleteOptimizationAlgorithm] = None
        self._build = False
        super().__init__()

    @property
    def target_db(self) -> Database:
        """The database for which optimized queries should be generated.

        When assigning a new target database, the pipeline has to be build again.

        Returns
        -------
        Database
            The currently selected database system

        See Also
        --------
        CompleteOptimizationAlgorithm.pre_check
        """
        return self._target_db

    @target_db.setter
    def target_db(self, system: Database) -> None:
        # A new target invalidates the compatibility check performed by build().
        self._build = False
        self._target_db = system

    @property
    def optimization_algorithm(self) -> Optional[CompleteOptimizationAlgorithm]:
        """The optimization algorithm is used each time a query should be optimized.

        Returns
        -------
        Optional[CompleteOptimizationAlgorithm]
            The currently selected optimization algorithm, if any.
        """
        return self._optimization_algorithm

    def setup_optimization_algorithm(
        self, algorithm: CompleteOptimizationAlgorithm
    ) -> Self:
        """Configures the pipeline to use the given optimization algorithm.

        Setting a new algorithm requires the pipeline to be build again.

        Parameters
        ----------
        algorithm : CompleteOptimizationAlgorithm
            The new optimization algorithm to use. No compatibility checks are performed, yet. This is done when
            building the pipeline.

        Returns
        -------
        IntegratedOptimizationPipeline
            The current pipeline to allow for easy method-chaining.
        """
        self._optimization_algorithm = algorithm
        # Consistent with the setters of the other pipelines: a new algorithm must pass build() again.
        self._build = False
        return self

    def build(self) -> Self:
        """Constructs the optimization pipeline.

        This includes checking the selected optimization algorithm for compatibility with the `target_db`. Afterwards,
        the pipeline is ready to optimize queries.

        Returns
        -------
        IntegratedOptimizationPipeline
            The current pipeline to allow for easy method-chaining.

        Raises
        ------
        StateError
            If no optimization algorithm has been selected, yet.
        UnsupportedSystemError
            If the new optimization algorithm is not compatible with the current target database system.

        See Also
        --------
        CompleteOptimizationAlgorithm.pre_check
        """
        if self._optimization_algorithm is None:
            # Raise a descriptive StateError instead of crashing with an AttributeError on the None algorithm.
            raise StateError(
                "No algorithm has been selected. Don't forget to call `setup_optimization_algorithm()` before `build()`."
            )

        pre_check = self._optimization_algorithm.pre_check()
        if pre_check is not None:
            # Pass the database as context so failed checks can produce a meaningful error
            # (consistent with TextBookOptimizationPipeline.build).
            pre_check.check_supported_database_system(
                self._target_db
            ).ensure_all_passed(self._target_db)
        self._build = True
        return self

    def query_execution_plan(self, query: SqlQuery) -> QueryPlan:
        if not self._build:
            raise StateError(
                "No algorithm has been selected. Don't forget to call `build()` after setting the algorithm."
            )

        # Per-query checks complement the system-level check performed in build().
        pre_check = self.optimization_algorithm.pre_check()
        if pre_check is not None:
            pre_check.check_supported_query(query).ensure_all_passed(query)

        physical_qep = self.optimization_algorithm.optimize_query(query)
        return physical_qep

    def target_database(self) -> Database:
        return self._target_db

    def describe(self) -> jsondict:
        algorithm_description = (
            self._optimization_algorithm.describe()
            if self._optimization_algorithm is not None
            else "no_algorithm"
        )
        return {
            "name": "integrated_pipeline",
            "database_system": self._target_db.describe(),
            "optimization_algorithm": algorithm_description,
        }

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"IntegratedOptimization [{self._optimization_algorithm}]"
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
class TextBookOptimizationPipeline(OptimizationPipeline):
    """Pipeline modelled after the traditional optimizer architecture used in most real-world systems.

    The optimizer is assembled from three exchangeable components: a cardinality estimator that predicts the sizes of
    intermediate results, a cost model that quantifies how expensive specific access paths for those intermediates
    are, and a plan enumerator that generates the intermediates in the first place.

    Configure the pipeline by assigning concrete strategies for each of the three components.

    Parameters
    ----------
    target_db : Database
        The database for which the optimized queries should be generated.
    """

    def __init__(self, target_db: Database) -> None:
        # Imported lazily to avoid import cycles at module load time.
        from .db.postgres import PostgresInterface
        from .optimizer.dynprog import (
            DynamicProgrammingEnumerator,
            PostgresDynProg,
        )
        from .optimizer.native import (
            NativeCardinalityEstimator,
            NativeCostModel,
        )

        self._target_db = target_db
        self._card_est: CardinalityEstimator = NativeCardinalityEstimator()
        self._cost_model: CostModel = NativeCostModel()

        # Postgres targets receive the specialized dynamic-programming variant with settings inferred from the server.
        if isinstance(target_db, PostgresInterface):
            self._plan_enumerator = PostgresDynProg(target_db=target_db)
            self._plan_enumerator.infer_settings()
        else:
            self._plan_enumerator = DynamicProgrammingEnumerator(target_db=target_db)

        self._support_check = EmptyPreCheck()
        self._build = False

    def target_database(self) -> Database:
        return self._target_db

    def setup_cardinality_estimator(self, estimator: CardinalityEstimator) -> Self:
        """Configures the cardinality estimator of the optimizer.

        Setting a new algorithm requires the pipeline to be build again.

        Parameters
        ----------
        estimator : CardinalityEstimator
            The estimator to be used

        Returns
        -------
        self
            The current pipeline to allow for easy method-chaining.
        """
        self._card_est = estimator
        self._build = False
        return self

    def setup_cost_model(self, cost_model: CostModel) -> Self:
        """Configures the cost model of the optimizer.

        Setting a new algorithm requires the pipeline to be build again.

        Parameters
        ----------
        cost_model : CostModel
            The cost model to be used

        Returns
        -------
        self
            The current pipeline to allow for easy method-chaining.
        """
        self._cost_model = cost_model
        self._build = False
        return self

    def setup_plan_enumerator(self, plan_enumerator: PlanEnumerator) -> Self:
        """Configures the plan enumerator of the optimizer.

        Setting a new algorithm requires the pipeline to be build again.

        Parameters
        ----------
        plan_enumerator : PlanEnumerator
            The enumerator to be used

        Returns
        -------
        self
            The current pipeline to allow for easy method-chaining.
        """
        self._plan_enumerator = plan_enumerator
        self._build = False
        return self

    def use(self, component: PlanEnumerator | CostModel | CardinalityEstimator) -> Self:
        """Shortcut method to setup the pipeline. Delegates to the appropriate setup_XXX method."""
        if isinstance(component, PlanEnumerator):
            return self.setup_plan_enumerator(component)
        if isinstance(component, CostModel):
            return self.setup_cost_model(component)
        if isinstance(component, CardinalityEstimator):
            return self.setup_cardinality_estimator(component)
        raise TypeError(f"Unsupported component type: {type(component)}")

    def build(self) -> Self:
        """Constructs the optimization pipeline.

        This includes checking all strategies for compatibility with the `target_db`. Afterwards, the pipeline is
        ready to optimize queries.

        Returns
        -------
        self
            The current pipeline to allow for easy method-chaining.

        Raises
        ------
        UnsupportedSystemError
            If any of the selected optimization stages is not compatible with the `target_db`.
        """
        if self._card_est is None:
            raise StateError("Missing cardinality estimator")
        if self._cost_model is None:
            raise StateError("Missing cost model")
        if self._plan_enumerator is None:
            raise StateError("Missing plan enumerator")

        stage_checks = [
            self._card_est.pre_check(),
            self._cost_model.pre_check(),
            self._plan_enumerator.pre_check(),
        ]
        self._support_check = merge_checks(stage_checks)
        db_check = self._support_check.check_supported_database_system(self._target_db)
        db_check.ensure_all_passed(self._target_db)

        self._build = True
        return self

    def query_execution_plan(self, query: SqlQuery) -> QueryPlan:
        if not self._build:
            raise StateError("Pipeline has not been build")
        self._support_check.check_supported_query(query).ensure_all_passed(query)

        return self._plan_enumerator.generate_execution_plan(
            query, cardinality_estimator=self._card_est, cost_model=self._cost_model
        )

    def describe(self) -> jsondict:
        def summary(stage) -> Optional[jsondict]:
            # Stages are optional in principle, so guard against missing components.
            return stage.describe() if stage is not None else None

        return {
            "name": "textbook_pipeline",
            "database_system": self._target_db.describe(),
            "plan_enumerator": summary(self._plan_enumerator),
            "cost_model": summary(self._cost_model),
            "cardinality_estimator": summary(self._card_est),
        }

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        stages = (self._plan_enumerator, self._cost_model, self._card_est)
        opt_chain = " + ".join(str(stage) for stage in stages)
        return f"TextBookOptimization [{opt_chain}]"
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
class MultiStageOptimizationPipeline(OptimizationPipeline):
|
|
487
|
+
"""This optimization pipeline performs query optimization in separate phases.
|
|
488
|
+
|
|
489
|
+
The pipeline is organized in two large stages (join ordering and physical operator selection), which are
|
|
490
|
+
accompanied by initial pre check and a final plan parameterization steps. In total, those four individual steps
|
|
491
|
+
completely specify the optimization settings that should be applied to an incoming query. For each of the steps
|
|
492
|
+
general interface exist that must be implemented by the selected strategies.
|
|
493
|
+
|
|
494
|
+
The steps are applied in consecutive order and perform the following tasks:
|
|
495
|
+
|
|
496
|
+
1. the incoming query is checked for unsupported features
|
|
497
|
+
2. an optimized join order for the query is calculated
|
|
498
|
+
3. appropriate physical operators are determined, depending on the join order
|
|
499
|
+
4. the query plan (join order + physical operators) is further parameterized, for example with custom cardinality estimates
|
|
500
|
+
|
|
501
|
+
All steps are optional. If they are not specified, no operation will be performed at the specific stage. Effectively, this
|
|
502
|
+
means that the query optimizer of the target database system needs to step in and "fill the gaps". For example, if no
|
|
503
|
+
join ordering is performed, the native optimizer needs to come up with a join order. But, the native optimizer will use
|
|
504
|
+
the selected physical operators to perform these joins. Likewise, specifying only a join order means that the native
|
|
505
|
+
optimizer will select its own physical operators. If cardinalities are provided, they are used to guide the native
|
|
506
|
+
optimizer. As an extreme case, one can skip join ordering and physical operator selection completely and only compute
|
|
507
|
+
cardinality estimates in the parameterization step. This way, a different cardinality estimator can be simulated without
|
|
508
|
+
using the `TextBookOptimizationPipeline`. This has the advantage that no default strategies for cost estimation and plan
|
|
509
|
+
enumeration need to to be simulated and the actual algorithms from the target database are used.
|
|
510
|
+
|
|
511
|
+
Once the optimization settings have been selected via the *setup* methods (or alternatively via the `load_settings`
|
|
512
|
+
functionality), the pipeline has to be build using the `build` method. Afterwards, it is ready to optimize
|
|
513
|
+
input queries.
|
|
514
|
+
|
|
515
|
+
A pipeline depends on a specific database system. This is necessary to produce the appropriate metadata for an
|
|
516
|
+
input query (i.e. to apply the specifics that enforce the optimized query plan during query execution for the
|
|
517
|
+
database system). This field can be changed between optimization calls to use the same pipeline for different
|
|
518
|
+
systems.
|
|
519
|
+
|
|
520
|
+
As a shortcut, `load_settings` can be used to initialize a pipeline with pre-defined optimization strategies.
|
|
521
|
+
|
|
522
|
+
Parameters
|
|
523
|
+
----------
|
|
524
|
+
target_db : Database
|
|
525
|
+
The database for which the optimized queries should be generated.
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
Examples
|
|
529
|
+
--------
|
|
530
|
+
>>> pipeline = pb.MultiStageOptimizationPipline(postgres_db)
|
|
531
|
+
>>> pipeline.load_settings(ues_settings)
|
|
532
|
+
>>> pipeline.build()
|
|
533
|
+
>>> pipeline.optimize_query(join_order_benchmark["1a"])
|
|
534
|
+
"""
|
|
535
|
+
|
|
536
|
+
def __init__(self, target_db: Database) -> None:
|
|
537
|
+
self._target_db = target_db
|
|
538
|
+
self._pre_check: OptimizationPreCheck | None = EmptyPreCheck()
|
|
539
|
+
self._join_order_enumerator: JoinOrderOptimization | None = None
|
|
540
|
+
self._physical_operator_selection: PhysicalOperatorSelection | None = None
|
|
541
|
+
self._plan_parameterization: ParameterGeneration | None = None
|
|
542
|
+
self._build = False
|
|
543
|
+
|
|
544
|
+
@property
|
|
545
|
+
def target_db(self) -> Database:
|
|
546
|
+
"""The database for which optimized queries should be generated.
|
|
547
|
+
|
|
548
|
+
When assigning a new target database, the pipeline needs to be build again.
|
|
549
|
+
|
|
550
|
+
Returns
|
|
551
|
+
-------
|
|
552
|
+
Database
|
|
553
|
+
The currently selected database system
|
|
554
|
+
"""
|
|
555
|
+
return self._target_db
|
|
556
|
+
|
|
557
|
+
@target_db.setter
|
|
558
|
+
def target_db(self, new_db: Database) -> None:
|
|
559
|
+
self._target_db = new_db
|
|
560
|
+
self._build = False
|
|
561
|
+
|
|
562
|
+
@property
|
|
563
|
+
def pre_check(self) -> Optional[OptimizationPreCheck]:
|
|
564
|
+
"""An overarching check that should be applied to all queries before they are optimized.
|
|
565
|
+
|
|
566
|
+
This check complements the pre checks of the individual stages and can be used to enforce experiment-specific
|
|
567
|
+
constraints.
|
|
568
|
+
|
|
569
|
+
Returns
|
|
570
|
+
-------
|
|
571
|
+
Optional[OptimizationPreCheck]
|
|
572
|
+
The current check, if any. Can also be an `EmptyPreCheck` instance.
|
|
573
|
+
"""
|
|
574
|
+
return self._pre_check
|
|
575
|
+
|
|
576
|
+
@property
|
|
577
|
+
def join_order_enumerator(self) -> Optional[JoinOrderOptimization]:
|
|
578
|
+
"""The selected join order optimization algorithm.
|
|
579
|
+
|
|
580
|
+
Returns
|
|
581
|
+
-------
|
|
582
|
+
Optional[JoinOrderOptimization]
|
|
583
|
+
The current algorithm, if any has been selected.
|
|
584
|
+
"""
|
|
585
|
+
return self._join_order_enumerator
|
|
586
|
+
|
|
587
|
+
@property
|
|
588
|
+
def physical_operator_selection(self) -> Optional[PhysicalOperatorSelection]:
|
|
589
|
+
"""The selected operator selection algorithm.
|
|
590
|
+
|
|
591
|
+
Returns
|
|
592
|
+
-------
|
|
593
|
+
Optional[PhysicalOperatorSelection]
|
|
594
|
+
The current algorithm, if any has been selected.
|
|
595
|
+
"""
|
|
596
|
+
return self._physical_operator_selection
|
|
597
|
+
|
|
598
|
+
@property
|
|
599
|
+
def plan_parameterization(self) -> Optional[ParameterGeneration]:
|
|
600
|
+
"""The selected parameterization algorithm.
|
|
601
|
+
|
|
602
|
+
Returns
|
|
603
|
+
-------
|
|
604
|
+
Optional[ParameterGeneration]
|
|
605
|
+
The current algorithm, if any has been selected.
|
|
606
|
+
"""
|
|
607
|
+
return self._plan_parameterization
|
|
608
|
+
|
|
609
|
+
def setup_query_support_check(self, check: OptimizationPreCheck) -> Self:
|
|
610
|
+
"""Configures the pre-check that should be executed for each query.
|
|
611
|
+
|
|
612
|
+
This check will be combined with any additional checks that are required by the actual optimization strategies.
|
|
613
|
+
Setting a new check requires the pipeline to be build again.
|
|
614
|
+
|
|
615
|
+
Parameters
|
|
616
|
+
----------
|
|
617
|
+
check : OptimizationPreCheck
|
|
618
|
+
The new check
|
|
619
|
+
|
|
620
|
+
Returns
|
|
621
|
+
-------
|
|
622
|
+
self
|
|
623
|
+
The current pipeline to allow for easy method-chaining.
|
|
624
|
+
"""
|
|
625
|
+
self._pre_check = check
|
|
626
|
+
self._build = False
|
|
627
|
+
return self
|
|
628
|
+
|
|
629
|
+
def setup_join_order_optimization(self, enumerator: JoinOrderOptimization) -> Self:
    """Registers the algorithm that computes the join order.

    The chosen strategy may emit a purely logical join order, or an initial physical execution plan that already
    fixes the join operators. All subsequent stages have to handle both situations. Updating the algorithm
    invalidates the current build, so `build` has to be called again.

    Parameters
    ----------
    enumerator : JoinOrderOptimization
        The new join order optimization algorithm

    Returns
    -------
    self
        The current pipeline to allow for easy method-chaining.
    """
    # Any modification invalidates the compatibility checks performed during build().
    self._build = False
    self._join_order_enumerator = enumerator
    return self
|
|
651
|
+
|
|
652
|
+
def setup_physical_operator_selection(
    self, selector: PhysicalOperatorSelection
) -> Self:
    """Configures the algorithm to assign physical operators to the query.

    This algorithm receives the input query as well as the join order (if there is one) as input. In a special
    case, this join order can also provide an initial assignment of physical operators. These settings can then
    be further adapted by the selected algorithm (or completely overwritten).

    Setting a new algorithm requires the pipeline to be build again.

    Parameters
    ----------
    selector : PhysicalOperatorSelection
        The new operator selection algorithm

    Returns
    -------
    self
        The current pipeline to allow for easy method-chaining.
    """
    self._physical_operator_selection = selector
    self._build = False
    return self
|
|
676
|
+
|
|
677
|
+
def setup_plan_parameterization(self, param_generator: ParameterGeneration) -> Self:
    """Registers the algorithm that parameterizes the final query plan.

    The strategy is invoked with the input query together with the join order and the physical operators, as far
    as these have already been determined. Updating the algorithm invalidates the current build, so `build` has
    to be called again.

    Parameters
    ----------
    param_generator : ParameterGeneration
        The new parameterization algorithm

    Returns
    -------
    self
        The current pipeline to allow for easy method-chaining.
    """
    # Any modification invalidates the compatibility checks performed during build().
    self._build = False
    self._plan_parameterization = param_generator
    return self
|
|
698
|
+
|
|
699
|
+
def use(
    self,
    component: JoinOrderOptimization
    | PhysicalOperatorSelection
    | ParameterGeneration,
) -> Self:
    """Shortcut method to setup the pipeline. Delegates to the appropriate setup_XXX method."""
    # Dispatch on the component type, mirroring the dedicated setup methods.
    if isinstance(component, JoinOrderOptimization):
        return self.setup_join_order_optimization(component)
    if isinstance(component, PhysicalOperatorSelection):
        return self.setup_physical_operator_selection(component)
    if isinstance(component, ParameterGeneration):
        return self.setup_plan_parameterization(component)
    raise TypeError(f"Unsupported component type: {type(component)}")
|
|
715
|
+
|
|
716
|
+
def load_settings(self, optimization_settings: OptimizationSettings) -> Self:
    """Initializes the pipeline stages from a pre-defined bundle of optimization settings.

    This is merely a convenience wrapper that spares the caller from invoking each *setup* method by hand for a
    fixed combination of strategies. Individual stages can still be overwritten afterwards via the *setup*
    methods. Loading new presets invalidates the current build, so `build` has to be called again.

    Parameters
    ----------
    optimization_settings : OptimizationSettings
        The specific settings

    Returns
    -------
    self
        The current pipeline to allow for easy method-chaining.
    """
    # Pair each settings factory with the setup method that consumes its product.
    stage_configs = [
        (optimization_settings.query_pre_check, self.setup_query_support_check),
        (
            optimization_settings.build_join_order_optimizer,
            self.setup_join_order_optimization,
        ),
        (
            optimization_settings.build_physical_operator_selection,
            self.setup_physical_operator_selection,
        ),
        (
            optimization_settings.build_plan_parameterization,
            self.setup_plan_parameterization,
        ),
    ]
    for build_stage, apply_stage in stage_configs:
        stage = build_stage()
        if stage:
            apply_stage(stage)
    self._build = False
    return self
|
|
749
|
+
|
|
750
|
+
def build(self) -> Self:
    """Constructs the optimization pipeline.

    This includes filling all undefined optimization steps with empty strategies and checking all strategies for
    compatibility with the `target_db`. Afterwards, the pipeline is ready to optimize queries.

    Returns
    -------
    self
        The current pipeline to allow for easy method-chaining.

    Raises
    ------
    UnsupportedSystemError
        If any of the selected optimization stages is not compatible with the `target_db`.
    """
    # Gather the explicit pre-check plus the checks demanded by every configured stage.
    stage_checks = [self.pre_check]
    optional_stages = (
        self.join_order_enumerator,
        self.physical_operator_selection,
        self.plan_parameterization,
    )
    stage_checks.extend(
        stage.pre_check() for stage in optional_stages if stage is not None
    )

    self._pre_check = merge_checks(stage_checks)

    system_support = self._pre_check.check_supported_database_system(self._target_db)
    if not system_support.passed:
        raise UnsupportedSystemError(self.target_db, system_support.failure_reason)

    self._build = True
    return self
|
|
784
|
+
|
|
785
|
+
def target_database(self) -> Database:
    """Provides the database for which the optimized queries are generated."""
    return self.target_db
|
|
787
|
+
|
|
788
|
+
def query_execution_plan(self, query: SqlQuery) -> QueryPlan:
    """Optimizes the query and fetches the plan that the target database uses for the optimized query."""
    hinted_query = self.optimize_query(query)
    db_optimizer = self.target_db.optimizer()
    return db_optimizer.query_plan(hinted_query)
|
|
791
|
+
|
|
792
|
+
def optimize_query(self, query: SqlQuery) -> SqlQuery:
    """Runs every configured optimization stage on the query and encodes the decisions as hints."""
    self._assert_is_build()

    query_support = self._pre_check.check_supported_query(query)
    if not query_support.passed:
        raise UnsupportedQueryError(query, query_support.failure_reason)

    # Stage 1: join ordering (optional).
    if self.join_order_enumerator is not None:
        join_order = self.join_order_enumerator.optimize_join_order(query)
    else:
        join_order = None

    # Stage 2: operator selection; falls back to an empty assignment.
    if self.physical_operator_selection is not None:
        physical_operators = self.physical_operator_selection.select_physical_operators(
            query, join_order
        )
    else:
        physical_operators = PhysicalOperatorAssignment()

    # Stage 3: plan parameterization; falls back to an empty parameterization.
    if self.plan_parameterization is not None:
        plan_parameters = self.plan_parameterization.generate_plan_parameters(
            query, join_order, physical_operators
        )
    else:
        plan_parameters = PlanParameterization()

    # Encode all decisions as database-specific hints on the query.
    return self._target_db.hinting().generate_hints(
        query,
        join_order=join_order,
        physical_operators=physical_operators,
        plan_parameters=plan_parameters,
    )
|
|
824
|
+
|
|
825
|
+
def describe(self) -> jsondict:
    """Provides a JSON-serializable overview of the current pipeline configuration."""

    def maybe_describe(component):
        # Unset stages are reported as None instead of being omitted.
        return component.describe() if component else None

    return {
        "name": "multi_stage_pipeline",
        "database_system": self._target_db.describe(),
        "query_pre_check": maybe_describe(self._pre_check),
        "join_ordering": maybe_describe(self._join_order_enumerator),
        "operator_selection": maybe_describe(self._physical_operator_selection),
        "plan_parameterization": maybe_describe(self._plan_parameterization),
    }
|
|
842
|
+
|
|
843
|
+
def _assert_is_build(self) -> None:
    """Raises an error if the pipeline has not been build yet."""
    if self._build:
        return
    raise StateError("Pipeline has not been build")
|
|
847
|
+
|
|
848
|
+
def __repr__(self) -> str:
    # Debug representation mirrors the human-readable string form.
    return str(self)
|
|
850
|
+
|
|
851
|
+
def __str__(self) -> str:
    """Summarizes the configured optimization stages as an arrow-separated chain."""
    stages = (
        self._join_order_enumerator,
        self._physical_operator_selection,
        self._plan_parameterization,
    )
    opt_chain = " -> ".join(str(stage) for stage in stages)
    return f"MultiStageOptimization [{opt_chain}]"
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
class IncrementalOptimizationPipeline(OptimizationPipeline):
    """This optimization pipeline can be thought of as a generalization of the `MultiStageOptimizationPipeline`.

    Instead of only operating in two stages, an arbitrary amount of optimization steps can be applied. During each
    step an entire physical query execution plan is received as input and also produced as output. Therefore, partial
    operator assignments or cardinality estimates are not supported by this pipeline. The incremental nature probably
    makes it the most useful for optimization strategies that continuously improve query plans.

    Parameters
    ----------
    target_db : Database
        The database for which the optimized queries should be generated.
    """

    def __init__(self, target_db: Database) -> None:
        self._target_db = target_db
        # No initial plan generator by default -- the native optimizer of the target database is used instead.
        self._initial_plan_generator: Optional[CompleteOptimizationAlgorithm] = None
        # The incremental steps are applied in insertion order.
        self._optimization_steps: list[IncrementalOptimizationStep] = []

    @property
    def target_db(self) -> Database:
        """The database for which optimized queries should be generated.

        When a new target database is selected, all optimization steps are checked for support of the new database.

        Returns
        -------
        Database
            The current target database.

        Raises
        ------
        UnsupportedSystemError
            If any of the optimization steps or the initial plan generator cannot work with the target database
        """
        return self._target_db

    @target_db.setter
    def target_db(self, database: Database) -> None:
        # Validate the entire pipeline against the new database before committing to it.
        self._ensure_pipeline_integrity(database=database)
        self._target_db = database

    @property
    def initial_plan_generator(self) -> Optional[CompleteOptimizationAlgorithm]:
        """Strategy to construct the first physical query execution plan to start the incremental optimization.

        If no initial generator is selected, the initial plan will be derived from the optimizer of the target
        database.

        Returns
        -------
        Optional[CompleteOptimizationAlgorithm]
            The current initial generator.

        Raises
        ------
        UnsupportedSystemError
            If the initial generator does not work with the current `target_db`
        """
        return self._initial_plan_generator

    @initial_plan_generator.setter
    def initial_plan_generator(
        self, plan_generator: Optional[CompleteOptimizationAlgorithm]
    ) -> None:
        # Validate the generator against the current target database before storing it.
        self._ensure_pipeline_integrity(initial_plan_generator=plan_generator)
        self._initial_plan_generator = plan_generator

    def add_optimization_step(self, next_step: IncrementalOptimizationStep) -> Self:
        """Expands the optimization pipeline by another stage.

        The given step will be applied at the end of the pipeline. The very first optimization step receives an
        initial plan that has either been generated via the `initial_plan_generator` (if it has been setup), or by
        retrieving the query execution plan from the `target_db`.

        Parameters
        ----------
        next_step : IncrementalOptimizationStep
            The next optimization stage

        Returns
        -------
        self
            The current pipeline to allow for easy method-chaining.

        Raises
        ------
        UnsupportedSystemError
            If the new optimization step does not work with the target database
        """
        self._ensure_pipeline_integrity(additional_optimization_step=next_step)
        self._optimization_steps.append(next_step)
        return self

    def target_database(self) -> Database:
        """Provides the database for which the optimized queries are generated."""
        return self.target_db

    def query_execution_plan(self, query: SqlQuery) -> QueryPlan:
        """Runs all incremental optimization steps on the query and provides the resulting plan."""
        self._ensure_supported_query(query)
        if self.initial_plan_generator is not None:
            current_plan = self.initial_plan_generator.optimize_query(query)
        else:
            # No dedicated generator: start from the plan that the target database would choose natively.
            current_plan = self.target_db.optimizer().query_plan(query)
        for optimization_step in self._optimization_steps:
            current_plan = optimization_step.optimize_query(query, current_plan)
        return current_plan

    def describe(self) -> jsondict:
        """Provides a JSON-serializable overview of the current pipeline configuration."""
        return {
            "name": "incremental_pipeline",
            "database_system": self._target_db.describe(),
            "initial_plan": (
                self._initial_plan_generator.describe()
                if self._initial_plan_generator is not None
                else "native"
            ),
            "steps": [step.describe() for step in self._optimization_steps],
        }

    def _ensure_pipeline_integrity(
        self,
        *,
        database: Optional[Database] = None,
        initial_plan_generator: Optional[CompleteOptimizationAlgorithm] = None,
        additional_optimization_step: Optional[IncrementalOptimizationStep] = None,
    ) -> None:
        """Checks that all selected optimization strategies work with the target database.

        This method should be called when individual parts of the pipeline have been updated. The updated parts are
        supplied as parameters. All other parameters are inferred from the current pipeline state.

        Parameters
        ----------
        database : Optional[Database], optional
            The new target database system if it has been updated, by default None
        initial_plan_generator : Optional[CompleteOptimizationAlgorithm], optional
            The new initial plan generator if it has been updated, by default None
        additional_optimization_step : Optional[IncrementalOptimizationStep], optional
            The next optimization step, if a new one has been added, by default None

        Raises
        ------
        UnsupportedSystemError
            If one of the optimization algorithms is not compatible with the target database
        """
        database = self.target_db if database is None else database
        if initial_plan_generator is None:
            initial_plan_generator = self._initial_plan_generator

        # Validate the initial plan generator, the candidate step and all existing steps against the database.
        candidates = [initial_plan_generator, additional_optimization_step]
        candidates.extend(self._optimization_steps)
        for candidate in candidates:
            if candidate is None or candidate.pre_check() is None:
                continue
            candidate.pre_check().check_supported_database_system(
                database
            ).ensure_all_passed(database)

    def _ensure_supported_query(self, query: SqlQuery) -> None:
        """Applies all relevant pre-checks to the input query.

        Parameters
        ----------
        query : SqlQuery
            The input query

        Raises
        ------
        UnsupportedQueryError
            If one of the optimization algorithms is not compatible with the input query
        """
        if (
            self._initial_plan_generator is not None
            and self._initial_plan_generator.pre_check() is not None
        ):
            self._initial_plan_generator.pre_check().check_supported_query(
                query
            ).ensure_all_passed(query)
        for incremental_step in self._optimization_steps:
            if incremental_step.pre_check() is None:
                continue
            incremental_step.pre_check().check_supported_query(query).ensure_all_passed(
                query
            )

    def __repr__(self) -> str:
        # Debug representation mirrors the human-readable string form.
        return str(self)

    def __str__(self) -> str:
        opt_chain = " -> ".join(str(comp) for comp in self._optimization_steps)
        # Fixed copy-paste bug: this pipeline previously reported itself as "MultiStageOptimization".
        return f"IncrementalOptimization [{opt_chain}]"
|
|
1067
|
+
|
|
1068
|
+
|
|
1069
|
+
class OptimizationSettings(Protocol):
|
|
1070
|
+
"""Captures related settings for the optimization pipeline to make them more easily accessible.
|
|
1071
|
+
|
|
1072
|
+
All components are optional, depending on the specific optimization scenario/approach.
|
|
1073
|
+
"""
|
|
1074
|
+
|
|
1075
|
+
def query_pre_check(self) -> Optional[OptimizationPreCheck]:
    """The required query pre-check.

    Returns
    -------
    Optional[OptimizationPreCheck]
        The pre-check if one is necessary, or ``None`` otherwise.
    """
    # Protocol default: no pre-check is required.
    return None
|
|
1084
|
+
|
|
1085
|
+
def build_complete_optimizer(self) -> Optional[CompleteOptimizationAlgorithm]:
    """A complete (single-shot) optimization algorithm, if the scenario provides one.

    Returns
    -------
    Optional[CompleteOptimizationAlgorithm]
        The algorithm, or ``None`` (the default) if the scenario does not include one.
    """
    return None
|
|
1087
|
+
|
|
1088
|
+
def build_join_order_optimizer(self) -> Optional[JoinOrderOptimization]:
    """The algorithm that is used to obtain the optimized join order.

    Returns
    -------
    Optional[JoinOrderOptimization]
        The optimization strategy for the join order, or ``None`` if the scenario does not include a join order
        optimization.
    """
    # Protocol default: no join order optimization is provided.
    return None
|
|
1098
|
+
|
|
1099
|
+
def build_physical_operator_selection(self) -> Optional[PhysicalOperatorSelection]:
    """The algorithm that is used to determine the physical operators.

    Returns
    -------
    Optional[PhysicalOperatorSelection]
        The optimization strategy for the physical operators, or ``None`` if the scenario does not include an
        operator optimization.
    """
    # Protocol default: no operator selection is provided.
    return None
|
|
1109
|
+
|
|
1110
|
+
def build_plan_parameterization(self) -> Optional[ParameterGeneration]:
    """The algorithm that is used to further parameterize the query plan.

    Returns
    -------
    Optional[ParameterGeneration]
        The parameter optimization strategy, or ``None`` if the scenario does not include such a stage.
    """
    # Protocol default: no plan parameterization is provided.
    return None
|
|
1119
|
+
|
|
1120
|
+
def build_incremental_optimizer(self) -> Optional[IncrementalOptimizationStep]:
    """An incremental optimization step, if the scenario provides one.

    Returns
    -------
    Optional[IncrementalOptimizationStep]
        The step, or ``None`` (the default) if the scenario does not include one.
    """
    return None
|