PostBOUND 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- postbound/__init__.py +211 -0
- postbound/_base.py +6 -0
- postbound/_bench.py +1012 -0
- postbound/_core.py +1153 -0
- postbound/_hints.py +1373 -0
- postbound/_jointree.py +1079 -0
- postbound/_pipelines.py +1121 -0
- postbound/_qep.py +1986 -0
- postbound/_stages.py +876 -0
- postbound/_validation.py +734 -0
- postbound/db/__init__.py +72 -0
- postbound/db/_db.py +2348 -0
- postbound/db/_duckdb.py +785 -0
- postbound/db/mysql.py +1195 -0
- postbound/db/postgres.py +4216 -0
- postbound/experiments/__init__.py +12 -0
- postbound/experiments/analysis.py +674 -0
- postbound/experiments/benchmarking.py +54 -0
- postbound/experiments/ceb.py +877 -0
- postbound/experiments/interactive.py +105 -0
- postbound/experiments/querygen.py +334 -0
- postbound/experiments/workloads.py +980 -0
- postbound/optimizer/__init__.py +92 -0
- postbound/optimizer/__init__.pyi +73 -0
- postbound/optimizer/_cardinalities.py +369 -0
- postbound/optimizer/_joingraph.py +1150 -0
- postbound/optimizer/dynprog.py +1825 -0
- postbound/optimizer/enumeration.py +432 -0
- postbound/optimizer/native.py +539 -0
- postbound/optimizer/noopt.py +54 -0
- postbound/optimizer/presets.py +147 -0
- postbound/optimizer/randomized.py +650 -0
- postbound/optimizer/tonic.py +1479 -0
- postbound/optimizer/ues.py +1607 -0
- postbound/qal/__init__.py +343 -0
- postbound/qal/_qal.py +9678 -0
- postbound/qal/formatter.py +1089 -0
- postbound/qal/parser.py +2344 -0
- postbound/qal/relalg.py +4257 -0
- postbound/qal/transform.py +2184 -0
- postbound/shortcuts.py +70 -0
- postbound/util/__init__.py +46 -0
- postbound/util/_errors.py +33 -0
- postbound/util/collections.py +490 -0
- postbound/util/dataframe.py +71 -0
- postbound/util/dicts.py +330 -0
- postbound/util/jsonize.py +68 -0
- postbound/util/logging.py +106 -0
- postbound/util/misc.py +168 -0
- postbound/util/networkx.py +401 -0
- postbound/util/numbers.py +438 -0
- postbound/util/proc.py +107 -0
- postbound/util/stats.py +37 -0
- postbound/util/system.py +48 -0
- postbound/util/typing.py +35 -0
- postbound/vis/__init__.py +5 -0
- postbound/vis/fdl.py +69 -0
- postbound/vis/graphs.py +48 -0
- postbound/vis/optimizer.py +538 -0
- postbound/vis/plots.py +84 -0
- postbound/vis/tonic.py +70 -0
- postbound/vis/trees.py +105 -0
- postbound-0.19.0.dist-info/METADATA +355 -0
- postbound-0.19.0.dist-info/RECORD +67 -0
- postbound-0.19.0.dist-info/WHEEL +5 -0
- postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
- postbound-0.19.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1607 @@
|
|
|
1
|
+
"""Generalized implementation of the UES join order optimization algorihtm [1]_.
|
|
2
|
+
|
|
3
|
+
Our implementation differs from the original algorithm in a number of ways, most importantly by making policies more explicit
|
|
4
|
+
and enabling their variation. More specifically, we enable variation of the following parts of the algorithm:
|
|
5
|
+
|
|
6
|
+
- estimation strategy for the cardinality of (filtered) base tables
|
|
7
|
+
- estimation strategy for the cardinality of joins, thereby enabling the usage of different statistics. For example, this
|
|
8
|
+
enables top-k list statistics instead of only using the maximum value frequency
|
|
9
|
+
- deciding when to generate subqueries for primary key/foreign key joins
|
|
10
|
+
|
|
11
|
+
Additionally, our implementation has a stricter treatment of chains of primary key/foreign key joins. Consider a join of the
|
|
12
|
+
form A ⋈ B ⋈ C. Here, A ⋈ B is primary key/foreign key join with A acting as the foreign key partner and B acting as the
|
|
13
|
+
primary key partner. At the same time, B ⋈ C is also a primary key/foreign key join, but this time B acts as the foreign key
|
|
14
|
+
partner and C is the primary key partner. The original implementation did not specify how such situations should be handled and
|
|
15
|
+
multiple possible approaches exist (e.g. treating the entire join sequence as one large primary key/foreign key join or
|
|
16
|
+
invalidating the second join once the primary key/foreign key join between A and B has been performed). Our implementation can
|
|
17
|
+
use the first strategy (the join is treated as one large primary key/foreign key join and the subquery contains all the
|
|
18
|
+
related tables) but defaults to the second one.
|
|
19
|
+
|
|
20
|
+
References
|
|
21
|
+
----------
|
|
22
|
+
|
|
23
|
+
.. [1] A. Hertzschuch et al.: "Simplicity Done Right for Join Ordering", CIDR'2021
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import abc
|
|
29
|
+
import copy
|
|
30
|
+
import math
|
|
31
|
+
import operator
|
|
32
|
+
import typing
|
|
33
|
+
from collections.abc import Iterable
|
|
34
|
+
from typing import Generic, Optional
|
|
35
|
+
|
|
36
|
+
import numpy as np
|
|
37
|
+
|
|
38
|
+
from .. import util
|
|
39
|
+
from .._core import Cardinality, ColumnReference, JoinOperator, TableReference
|
|
40
|
+
from .._hints import PhysicalOperatorAssignment
|
|
41
|
+
from .._jointree import JoinTree, LogicalJoinTree
|
|
42
|
+
from .._stages import (
|
|
43
|
+
CardinalityEstimator,
|
|
44
|
+
JoinOrderOptimization,
|
|
45
|
+
JoinOrderOptimizationError,
|
|
46
|
+
OptimizationPreCheck,
|
|
47
|
+
PhysicalOperatorSelection,
|
|
48
|
+
)
|
|
49
|
+
from .._validation import (
|
|
50
|
+
DependentSubqueryPreCheck,
|
|
51
|
+
EmptyPreCheck,
|
|
52
|
+
EquiJoinPreCheck,
|
|
53
|
+
ImplicitQueryPreCheck,
|
|
54
|
+
SetOperationsPreCheck,
|
|
55
|
+
VirtualTablesPreCheck,
|
|
56
|
+
merge_checks,
|
|
57
|
+
)
|
|
58
|
+
from ..db._db import Database, DatabasePool, DatabaseStatistics
|
|
59
|
+
from ..qal._qal import (
|
|
60
|
+
AbstractPredicate,
|
|
61
|
+
BaseProjection,
|
|
62
|
+
ImplicitFromClause,
|
|
63
|
+
ImplicitSqlQuery,
|
|
64
|
+
Select,
|
|
65
|
+
SqlQuery,
|
|
66
|
+
Where,
|
|
67
|
+
)
|
|
68
|
+
from ._joingraph import JoinGraph, JoinPath
|
|
69
|
+
|
|
70
|
+
# Generic type parameters and statistics aliases used throughout this module.

ColumnType = typing.TypeVar("ColumnType")
"""The type of the columns for which statistics are generated."""

StatsType = typing.TypeVar("StatsType")
"""The type of the actual statistics that are stored, e.g. single values or frequency lists."""

MaxFrequency = typing.NewType("MaxFrequency", int)
"""Type alias for maximum frequency statistics of columns (which are just integer values).

The maximum frequency of a column is the maximum number of occurrences of a column value within that column.

For example, consider a column R.a with values ``[a, b, a, a, b, c]``. In this case, maximum column frequency for R.a is 3.
"""

MostCommonElements = typing.NewType("MostCommonElements", list[tuple[ColumnType, int]])
"""Type alias for top-k lists statistics. The top-k list is generic over the actual column type."""
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class StatsContainer(abc.ABC, Generic[StatsType]):
    """The statistics container eases the management of the statistics lifecycle.

    It provides means to store different kinds of statistics as attributes and can take care of their update automatically.
    Each statistics container instance is intended for one specific query and has to be initialized for that query using the
    `setup_for_query` method.

    A statistics container is abstract to enable a tailored implementation of the loading and updating procedures for
    different statistics types.

    Attributes
    ----------
    base_table_estimates : dict[TableReference, int]
        These statistics are intended for tables that are not part of the intermediate result, yet. The estimates approximate
        the number of rows that are returned when scanning the table.
    upper_bounds : dict[TableReference | LogicalJoinTree, int]
        These statistics contain the cardinality estimates for intermediate results of the input query. Inserting new bounds
        can result in an update of the column statistics.
    attribute_frequencies : dict[ColumnReference, StatsType]
        This statistic contains the current statistics value for individual columns. This is the main data structure that has
        to be maintained during the query optimization process to update the column statistics once they become part of an
        intermediate result (and get changed as part of the join process).
    query : Optional[SqlQuery]
        Stores the query that this container is created for
    """

    def __init__(self) -> None:
        # All containers start out empty; they are populated via `setup_for_query`.
        self.base_table_estimates: dict[TableReference, int] = {}
        self.upper_bounds: dict[TableReference | LogicalJoinTree, int] = {}
        self.attribute_frequencies: dict[ColumnReference, StatsType] = {}
        self.query: Optional[SqlQuery] = None

    def setup_for_query(
        self,
        query: SqlQuery,
        base_table_estimator: BaseTableEstimator,
    ) -> None:
        """Initializes the internal data of the statistics container for a specific query.

        Any statistics from a previously configured query are dropped first.

        Parameters
        ----------
        query : SqlQuery
            The query for which the statistics should be maintained.
        base_table_estimator : BaseTableEstimator
            Estimator to inflate the `base_table_estimates` for all tables that are contained in the query. The estimator
            has to be set up properly.
        """
        self._reset_containers()
        self.query = query
        self._inflate_base_table_estimates(base_table_estimator)
        self._inflate_attribute_frequencies()

    def join_bounds(self) -> dict[LogicalJoinTree, int]:
        """Provides the cardinality estimates of all join trees that are currently stored in the container.

        Base-table entries in `upper_bounds` are skipped; only join-tree keys are returned.

        Returns
        -------
        dict[LogicalJoinTree, int]
            The bounds for all intermediate results
        """
        return {
            join_tree: bound
            for join_tree, bound in self.upper_bounds.items()
            if isinstance(join_tree, JoinTree)
        }

    def trigger_frequency_update(
        self,
        join_tree: LogicalJoinTree,
        joined_table: TableReference,
        join_condition: AbstractPredicate,
    ) -> None:
        """Updates the `attribute_frequencies` according to a new n:m join.

        The update procedure distinguishes between two different types of column statistics and uses different
        (and statistics-dependent) update methods for each: partner columns and third-party columns.

        Partner columns are those columns from the intermediate query result, that are directly involved in the join
        predicate, i.e. they are a join partner for some column of the newly joined table. On the other hand, third
        party columns are part of the intermediate result, but not directly involved in the join. In order to update
        them, some sort of correlation info is usually required.

        The precise update semantics depend on the specific statistic type. Hence, the updates are performed via abstract
        methods.

        Parameters
        ----------
        join_tree : LogicalJoinTree
            A join order that indicates the last join that was performed. This is the join that is used to infer the necessary
            updates.
        joined_table : TableReference
            The actual table that was joined. Remember that UES performs either primary key/foreign key joins, or joins with
            exactly one n:m table join partner. In the first case, no frequency updates are necessary since cardinalities may
            never increase when the foreign key is already part of an intermediate result. In the second case, there is exactly
            one partner table that is denoted by this parameter.
        join_condition : AbstractPredicate
            The predicate that was used for the join. This is required to determine the columns that were directly involved in
            the join. These columns have to be updated in a different way compared to other columns in the intermediate result.
        """
        partner_tables = join_tree.tables() - {joined_table}

        # Collect columns of the intermediate result that participate in joins among the
        # already-joined tables (i.e. not in the current join edge): the third-party columns.
        third_party_columns: set[ColumnReference] = set()
        for third_party_table in partner_tables:
            potential_partners = partner_tables - {third_party_table}
            join_pred = self.query.predicates().joins_between(
                third_party_table, potential_partners
            )
            if not join_pred:
                continue
            third_party_columns |= join_pred.columns()

        # Update the partner columns first. NOTE: order matters here -- these updates mutate
        # `attribute_frequencies` for the joined columns, and the dict built below deliberately
        # reads the *updated* frequencies.
        for col1, col2 in join_condition.join_partners():
            joined_column, partner_column = (
                (col1, col2) if col1.table == joined_table else (col2, col1)
            )
            self._update_partner_column_frequency(joined_column, partner_column)

        joined_columns_frequencies = {
            joined_col: self.attribute_frequencies[joined_col]
            for joined_col in join_condition.columns_of(joined_table)
        }
        # util.argmin presumably returns the key (column) with the smallest frequency value --
        # TODO confirm against the util package. That column drives the third-party updates.
        lowest_joined_column_frequency = util.argmin(joined_columns_frequencies)
        for third_party_column in third_party_columns:
            self._update_third_party_column_frequency(
                lowest_joined_column_frequency, third_party_column
            )

    @abc.abstractmethod
    def describe(self) -> dict:
        """Generates a JSON-serializable description of the specific container, including the actual statistics type.

        Returns
        -------
        dict
            The description
        """
        raise NotImplementedError

    def _reset_containers(self) -> None:
        """Drops all currently stored statistics. This is a necessary preparation step when a new input query is encountered."""
        self.base_table_estimates = {}
        self.upper_bounds = {}
        self.attribute_frequencies = {}
        self.query = None

    def _inflate_base_table_estimates(self, base_table_estimator: BaseTableEstimator):
        """Retrieves the base table estimate for each table in the current query.

        Parameters
        ----------
        base_table_estimator : BaseTableEstimator
            The strategy that should be used to obtain the estimate.
        """
        for table in self.query.tables():
            table_estimate = base_table_estimator.estimate_for(table)
            self.base_table_estimates[table] = table_estimate

    @abc.abstractmethod
    def _inflate_attribute_frequencies(self):
        """Loads the attribute frequency statistics for all required columns.

        The precise statistics that have to be loaded, as well as the columns that require loading of statistics is completely
        up to the specific statistics container.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def _update_partner_column_frequency(
        self, joined_column: ColumnReference, partner_column: ColumnReference
    ) -> None:
        """Performs the frequency update for a partner column.

        This implies that there is a join between the joined column and the partner column, and the partner column is already
        part of the intermediate result. Likewise, the joined column has just become part of the intermediate result as of this
        join.

        Parameters
        ----------
        joined_column : ColumnReference
            A column that is already part of the intermediate result
        partner_column : ColumnReference
            A column of the relation that has just been joined with the current intermediate result
        """
        raise NotImplementedError

    @abc.abstractmethod
    def _update_third_party_column_frequency(
        self, joined_column: ColumnReference, third_party_column: ColumnReference
    ) -> None:
        """Performs the frequency update for a third party column (see `trigger_frequency_update`).

        This implies that there is a join between the joined column and some other column from the intermediate result. The
        third party columns is already part of the intermediate result, but not directly involved in the join. The joined
        column has just become part of the intermediate result as of this join.

        Parameters
        ----------
        joined_column : ColumnReference
            A column that is already part of the intermediate result
        third_party_column : ColumnReference
            A column of the relation that has just been joined with the current intermediate result
        """
        raise NotImplementedError
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
class MaxFrequencyStats(StatsContainer[MaxFrequency]):
    """Statistics container that tracks the maximum value frequency of the join columns.

    Frequency updates are applied pessimistically: every column frequency in the intermediate result is multiplied by
    the maximum frequency of its join partner. This guarantees that no under-estimation can occur, but over-estimates
    the true frequencies by a very large margin. Closing that gap would require correlation information.

    See Also
    --------
    MaxFrequency
    """

    def __init__(self, database_stats: DatabaseStatistics):
        super().__init__()
        self.database_stats = database_stats

    def describe(self) -> dict:
        return {"name": "max_column_frequency"}

    def _inflate_attribute_frequencies(self):
        # Frequencies are only needed for columns that appear in some join predicate.
        join_columns: set[ColumnReference] = set()
        for predicate in self.query.predicates().joins():
            join_columns.update(predicate.columns())

        for column in join_columns:
            top1 = self.database_stats.most_common_values(column, k=1)
            if top1:
                _, frequency = util.simplify(top1)
            else:
                # No MCV statistics available -> assume a uniform value distribution.
                frequency = self._uniform_frequency(column)
            self.attribute_frequencies[column] = frequency

    def _update_partner_column_frequency(
        self, joined_column: ColumnReference, partner_column: ColumnReference
    ) -> None:
        # Capture both frequencies up-front so that the two writes do not interfere
        # with each other.
        frequency_of_joined = self.attribute_frequencies[joined_column]
        frequency_of_partner = self.attribute_frequencies[partner_column]
        self.attribute_frequencies[joined_column] = (
            frequency_of_joined * frequency_of_partner
        )
        self.attribute_frequencies[partner_column] = (
            frequency_of_partner * frequency_of_joined
        )

    def _update_third_party_column_frequency(
        self, joined_column: ColumnReference, third_party_column: ColumnReference
    ) -> None:
        scale_factor = self.attribute_frequencies[joined_column]
        self.attribute_frequencies[third_party_column] = (
            self.attribute_frequencies[third_party_column] * scale_factor
        )

    def _uniform_frequency(self, column: ColumnReference) -> float:
        """Calculates the value frequency for a column, assuming a uniform value distribution.

        Missing statistics (``None``) are treated as 1 to keep the division well-defined.

        Parameters
        ----------
        column : ColumnReference
            The column to calculate for

        Returns
        -------
        float
            The estimated frequency of all column values
        """
        row_count = self.database_stats.total_rows(column.table)
        if row_count is None:
            row_count = 1
        distinct_count = self.database_stats.distinct_values(column)
        if distinct_count is None:
            distinct_count = 1
        return row_count / distinct_count
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
class BaseTableEstimator(abc.ABC):
    """The base table estimator calculates cardinality estimates for filtered base tables.

    Implementations could for example use direct computation based on advanced statistics, sampling strategies or
    machine learning-based approaches.

    Each strategy provides dict-like access to the estimates: ``estimator[my_table]`` works as expected.

    Parameters
    ----------
    name : str
        The name of the actual estimation strategy.
    """

    def __init__(self, name: str) -> None:
        self.name = name

    @abc.abstractmethod
    def setup_for_query(self, query: SqlQuery) -> None:
        """Enables the estimator to prepare internal data structures.

        Parameters
        ----------
        query : SqlQuery
            The query for which cardinalities should be estimated.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def estimate_for(self, table: TableReference) -> Cardinality:
        """Calculates the cardinality for an arbitrary base table of the query.

        If the table is not filtered, this method should fall back to `estimate_total_rows`. Furthermore, the table can
        be assumed to not be part of any intermediate result, yet.

        Parameters
        ----------
        table : TableReference
            The table to estimate.

        Returns
        -------
        Cardinality
            The estimated number of rows
        """
        raise NotImplementedError

    @abc.abstractmethod
    def estimate_total_rows(self, table: TableReference) -> Cardinality:
        """Calculates an estimate of the number of rows in the table, ignoring all filter predicates.

        Parameters
        ----------
        table : TableReference
            The table to estimate.

        Returns
        -------
        Cardinality
            The estimated number of rows
        """
        raise NotImplementedError

    @abc.abstractmethod
    def describe(self) -> dict:
        """Provides a JSON-serializable representation of the selected cardinality estimation strategy.

        Returns
        -------
        dict
            The representation
        """
        raise NotImplementedError

    def pre_check(self) -> OptimizationPreCheck:
        """Provides requirements that an input query has to satisfy in order for the estimator to work properly.

        Returns
        -------
        OptimizationPreCheck
            The requirements check
        """
        return EmptyPreCheck()

    def __getitem__(self, item: TableReference) -> Cardinality:
        # Delegates to `estimate_for`, which returns a Cardinality — the previous
        # `-> int` annotation was inconsistent with the actual return type.
        return self.estimate_for(item)

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"BaseTableCardinalityEstimator[{self.name}]"
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
class NativeCardinalityEstimator(BaseTableEstimator):
    """Provides cardinality estimates for base tables using the optimizer of some database system.

    Parameters
    ----------
    database : Database
        The database system that should be used to obtain the estimates
    """

    def __init__(self, database: Database) -> None:
        super().__init__("native_optimizer")
        self.database = database
        self.query: SqlQuery | None = None

    def setup_for_query(self, query: SqlQuery) -> None:
        self.query = query

    def estimate_for(self, table: TableReference) -> Cardinality:
        """Asks the database optimizer to estimate a single-table query containing all filters on `table`.

        Unfiltered tables are answered directly from the statistics catalog via `estimate_total_rows`.

        Parameters
        ----------
        table : TableReference
            The table to estimate.

        Returns
        -------
        Cardinality
            The optimizer's estimate for the filtered table.
        """
        filters = self.query.predicates().filters_for(table)
        if not filters:
            return self.estimate_total_rows(table)

        # Emulate a query of the form SELECT * FROM table WHERE <filters> and let the
        # target optimizer estimate its result cardinality.
        select_clause = Select(BaseProjection.star())
        from_clause = ImplicitFromClause.create_for(table)
        # `filters` is guaranteed to be truthy here (we returned early otherwise), so the
        # former `Where(filters) if filters else None` conditional was dead code.
        where_clause = Where(filters)

        emulated_query = ImplicitSqlQuery(
            select_clause=select_clause,
            from_clause=from_clause,
            where_clause=where_clause,
        )
        return self.database.optimizer().cardinality_estimate(emulated_query)

    def estimate_total_rows(self, table: TableReference) -> Cardinality:
        return Cardinality(self.database.statistics().total_rows(table, emulated=True))

    def describe(self) -> dict:
        return {"name": "native", "database": self.database.describe()}
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
class PreciseCardinalityEstimator(BaseTableEstimator):
    """Obtains true cardinality counts by executing COUNT queries against a database system.

    This strategy provides a better reproducibility than the native estimates, but can be more compute-intense if
    caching is disabled.

    The executed COUNT queries account for all filters on the base table.

    Parameters
    ----------
    database : Database
        The database system that should be used to obtain the estimates
    """

    def __init__(self, database: Database) -> None:
        super().__init__("precise_estimates")
        self.database = database
        self.query: SqlQuery | None = None

    def setup_for_query(self, query: SqlQuery) -> None:
        self.query = query

    def estimate_for(self, table: TableReference) -> Cardinality:
        # Emulate SELECT COUNT(*) FROM table [WHERE <filters>] and execute it for the
        # exact row count.
        table_filters = self.query.predicates().filters_for(table)
        count_query = ImplicitSqlQuery(
            select_clause=Select(BaseProjection.count_star()),
            from_clause=ImplicitFromClause.create_for(table),
            where_clause=Where(table_filters) if table_filters else None,
        )

        # This should be treated like a statistics query, hence we honor the caching
        # policy of the statistics interface.
        use_cache = self.database.statistics().cache_enabled
        row_count = self.database.execute_query(count_query, cache_enabled=use_cache)
        return Cardinality(row_count)

    def estimate_total_rows(self, table: TableReference) -> Cardinality:
        return Cardinality(self.database.statistics().total_rows(table, emulated=False))

    def describe(self) -> dict:
        return {"name": "precise", "database": self.database.describe()}
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
class _CardinalityEstimatorWrapper(BaseTableEstimator):
    """Adapts a generic `CardinalityEstimator` stage to the `BaseTableEstimator` interface.

    Parameters
    ----------
    estimator : CardinalityEstimator
        The wrapped estimation strategy. Its class name doubles as the estimator name.
    target_db : Optional[Database]
        The database to estimate against. Defaults to the current database of the `DatabasePool`.
    """

    def __init__(
        self, estimator: CardinalityEstimator, *, target_db: Optional[Database] = None
    ) -> None:
        super().__init__(type(estimator).__name__)
        self.estimator = estimator
        self.target_db = target_db or DatabasePool.get_instance().current_database()
        self.query: SqlQuery | None = None

    def setup_for_query(self, query: SqlQuery) -> None:
        self.query = query
        self.estimator.initialize(self.target_db, query)

    def estimate_for(self, table: TableReference) -> Cardinality:
        return self.estimator.calculate_estimate(self.query, table)

    def estimate_total_rows(self, table: TableReference) -> Cardinality:
        # The implementation of this method is a bit of a hassle since estimation without filters is not really supported
        # by the CardinalityEstimator interface. To work around this issue, we estimate the cardinality of a new query
        # that coincides with the total number of rows.

        # First, free all resources of the current query
        self.estimator.cleanup()

        # Now, create our new query (an unfiltered SELECT * over the table) and obtain the cardinality estimate
        star_query = ImplicitSqlQuery(
            select_clause=Select.star(),
            from_clause=ImplicitFromClause.create_for(table),
        )
        self.estimator.initialize(self.target_db, star_query)
        cardinality = self.estimator.calculate_estimate(star_query, table)

        # Lastly, undo all of our temporary changes by re-initializing the estimator for the original query.
        # NOTE: the cleanup/initialize ordering is deliberate -- do not reorder these calls.
        self.estimator.cleanup()
        self.estimator.initialize(self.target_db, self.query)
        return cardinality

    def describe(self) -> util.jsondict:
        return self.estimator.describe()

    def pre_check(self) -> OptimizationPreCheck:
        return self.estimator.pre_check()
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
class JoinEstimator(abc.ABC):
    """The join cardinality estimator calculates cardinality estimates for arbitrary n-ary joins.

    Concrete strategies may rely on direct computation over advanced statistics, on sampling, or on
    machine learning-based approaches.

    Parameters
    ----------
    name : str
        The name of the actual estimation strategy.
    """

    def __init__(self, name: str) -> None:
        self.name = name

    @abc.abstractmethod
    def setup_for_query(self, query: SqlQuery) -> None:
        """Prepares the internal data structures for estimating the joins of a specific query.

        Parameters
        ----------
        query : SqlQuery
            The query for which cardinalities should be estimated.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def estimate_for(
        self, join_edge: AbstractPredicate, join_graph: JoinGraph
    ) -> Cardinality:
        """Estimates the cardinality of a specific join predicate, given the current join graph state.

        Parameters
        ----------
        join_edge : AbstractPredicate
            The predicate that should be estimated.
        join_graph : JoinGraph
            A graph describing the currently joined relations as well as the join types (e.g. primary key/foreign key
            or n:m joins).

        Returns
        -------
        Cardinality
            The estimated join cardinality
        """
        raise NotImplementedError

    @abc.abstractmethod
    def describe(self) -> dict:
        """Provides a JSON-serializable representation of this estimation strategy.

        Returns
        -------
        dict
            The representation
        """
        raise NotImplementedError

    def pre_check(self) -> OptimizationPreCheck:
        """Describes the requirements an input query must satisfy for this estimator to work properly.

        Returns
        -------
        OptimizationPreCheck
            The requirements check
        """
        requirements = EmptyPreCheck()
        return requirements

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"JoinCardinalityEstimator[{self.name}]"
|
|
661
|
+
|
|
662
|
+
|
|
663
|
+
class UESBoundEstimator(JoinEstimator):
    """Implementation of the UES formula to calculate upper bounds of join cardinalities.

    The formula distinguishes two cases: n:m joins are estimated according to the maximum frequencies of the join
    columns. Primary key/foreign key joins are estimated according to the cardinality of the foreign key column.
    The calculation also accounts for conjunctive join predicates, but is still limited to equi joins.
    """

    def __init__(self) -> None:
        super().__init__("UES join estimator")
        # Both attributes are populated by the setup_* methods before estimate_for may be called.
        self.query: ImplicitSqlQuery | None = None
        self.stats_container: StatsContainer[MaxFrequency] | None = None

    def setup_for_query(self, query: SqlQuery) -> None:
        self.query = query

    def setup_for_stats(self, stats_container: StatsContainer[MaxFrequency]) -> None:
        """Configures the statistics container that contains the actual frequencies and cardinalities to use.

        Parameters
        ----------
        stats_container : StatsContainer[MaxFrequency]
            The statistics to use
        """
        self.stats_container = stats_container

    def estimate_for(
        self, join_edge: AbstractPredicate, join_graph: JoinGraph
    ) -> Cardinality:
        # A conjunctive join predicate is bounded by its tightest base predicate, so we take the minimum over
        # all per-predicate bounds.
        current_min_bound = np.inf

        for base_predicate in join_edge.base_predicates():
            first_col, second_col = util.simplify(base_predicate.join_partners())
            # Check both join directions for a pk/fk relationship before falling back to the n:m formula.
            if join_graph.is_pk_fk_join(first_col.table, second_col.table):
                join_bound = self._estimate_pk_fk_join(first_col, second_col)
            elif join_graph.is_pk_fk_join(second_col.table, first_col.table):
                join_bound = self._estimate_pk_fk_join(second_col, first_col)
            else:
                join_bound = self._estimate_n_m_join(first_col, second_col)

            if join_bound < current_min_bound:
                current_min_bound = join_bound

        return Cardinality(current_min_bound)

    def describe(self) -> dict:
        return {"name": "ues"}

    def pre_check(self) -> OptimizationPreCheck:
        # FIX: the former Optional[OptimizationPreCheck] annotation contradicted the JoinEstimator interface --
        # this method always returns a check instance, never None.
        # TODO: the UES check is slightly too restrictive here.
        # It suffices to check that there are only conjunctive equi joins.
        return UESOptimizationPreCheck  # this is a pre-generated check instance, don't call () it here!

    def _estimate_pk_fk_join(
        self, fk_column: ColumnReference, pk_column: ColumnReference
    ) -> Cardinality:
        """Estimation formula for primary key/foreign key joins.

        The bound is the maximum foreign key frequency multiplied by the (filtered) primary key cardinality.

        Parameters
        ----------
        fk_column : ColumnReference
            The foreign key column
        pk_column : ColumnReference
            The primary key column

        Returns
        -------
        Cardinality
            An upper bound of the primary key/foreign key join cardinality.
        """
        pk_cardinality = Cardinality(
            self.stats_container.base_table_estimates[pk_column.table]
        )
        fk_frequency = self.stats_container.attribute_frequencies[fk_column]
        # FIX: wrap in Cardinality to satisfy the declared return type -- the n:m branch already did so,
        # whereas this branch used to return a bare int.
        return Cardinality(math.ceil(fk_frequency * pk_cardinality))

    def _estimate_n_m_join(
        self, first_column: ColumnReference, second_column: ColumnReference
    ) -> Cardinality:
        """Estimation formula for n:m joins.

        The bound is min(distinct values) of both sides multiplied by both maximum frequencies.

        Parameters
        ----------
        first_column : ColumnReference
            The join column from the first partner
        second_column : ColumnReference
            The join column from the second partner

        Returns
        -------
        Cardinality
            An upper bound of the n:m join cardinality.
        """
        first_bound, second_bound = (
            self._fetch_bound(first_column),
            self._fetch_bound(second_column),
        )
        first_freq = self.stats_container.attribute_frequencies[first_column]
        second_freq = self.stats_container.attribute_frequencies[second_column]

        # Guard against empty inputs -- this also prevents a division by zero below.
        if any(
            var == 0 for var in [first_bound, second_bound, first_freq, second_freq]
        ):
            # FIX: return a Cardinality rather than a bare int, matching the declared return type.
            return Cardinality(0)

        first_distinct_vals = first_bound / first_freq
        second_distinct_vals = second_bound / second_freq

        n_m_bound = (
            min(first_distinct_vals, second_distinct_vals) * first_freq * second_freq
        )
        return Cardinality(math.ceil(n_m_bound))

    def _fetch_bound(self, column: ColumnReference) -> Cardinality:
        """Provides the appropriate table bound (based on upper bound or base table estimate) for the given column.

        This is a utility method to work with the statistics container in a more convenient way, since the container
        can store the table cardinality at two different places: as a base table estimate, or as an intermediate
        estimate for base tables that can be filtered via a primary key/foreign key join.

        Parameters
        ----------
        column : ColumnReference
            The column for which the upper bound of the corresponding base table should be loaded.

        Returns
        -------
        Cardinality
            An upper bound on the cardinality of the table
        """
        table = column.table
        # Prefer the (tighter) intermediate upper bound when one has already been computed for the table.
        card = (
            self.stats_container.upper_bounds[table]
            if table in self.stats_container.upper_bounds
            else self.stats_container.base_table_estimates[table]
        )
        return Cardinality(card)
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
class BranchingPolicy(abc.ABC):
    """Decides where the join tree should branch (i.e. where subqueries are created) instead of staying linear.

    Within this policy, branches in the join tree and subqueries are treated as synonymous. Implementations that
    need additional information to work properly should receive it via custom setup methods.

    Parameters
    ----------
    name : str
        The name of the actual branching strategy.
    """

    def __init__(self, name: str) -> None:
        # Human-readable identifier of the strategy, used by __str__/__repr__.
        self.name = name

    @abc.abstractmethod
    def setup_for_query(self, query: SqlQuery) -> None:
        """Prepares internal data structures for a specific input query.

        Parameters
        ----------
        query : SqlQuery
            The query that should be optimized next
        """
        raise NotImplementedError

    @abc.abstractmethod
    def generate_subquery_for(self, join: AbstractPredicate, join_graph: JoinGraph) -> bool:
        """Decides whether the given join should be executed in a subquery.

        Parameters
        ----------
        join : AbstractPredicate
            The join that should be executed **within the subquery**. This is not the predicate that should be
            used to combine the results of two intermediate relations.
        join_graph : JoinGraph
            The current optimization state, providing information about joined relations and the join types
            (e.g. primary key/foreign key or n:m joins).

        Returns
        -------
        bool
            Whether a branch should be created for the join
        """
        raise NotImplementedError

    @abc.abstractmethod
    def describe(self) -> dict:
        """Provides a JSON-serializable representation of the selected branching strategy.

        Returns
        -------
        dict
            The representation
        """
        raise NotImplementedError

    def pre_check(self) -> OptimizationPreCheck:
        """Provides the requirements an input query must satisfy for the policy to work properly.

        Returns
        -------
        OptimizationPreCheck
            The requirements check. By default, no requirements are imposed.
        """
        return EmptyPreCheck()

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"SubqueryGenerationStrategy[{self.name}]"
|
|
878
|
+
|
|
879
|
+
|
|
880
|
+
class LinearJoinTreePolicy(BranchingPolicy):
    """Branching strategy that never creates subqueries, keeping every join path strictly linear."""

    def __init__(self) -> None:
        super().__init__("Linear subquery policy")

    def setup_for_query(self, query: SqlQuery) -> None:
        # No per-query state is required for a constant decision.
        pass

    def generate_subquery_for(self, join: AbstractPredicate, join_graph: JoinGraph) -> bool:
        # Linear join trees by definition: no join ever warrants a branch.
        return False

    def describe(self) -> dict:
        return {"name": "linear"}
|
|
896
|
+
|
|
897
|
+
|
|
898
|
+
class UESSubqueryPolicy(BranchingPolicy):
    """Implementation of the UES policy to decide when to insert branches into the join order.

    In short, a subquery is generated whenever it guarantees a reduction of the upper bound of the n:m join
    partner table.
    """

    def __init__(self) -> None:
        super().__init__("UES subquery policy")
        # Populated by the setup_* methods before generate_subquery_for may be called.
        self.query: SqlQuery | None = None
        self.stats_container: StatsContainer | None = None

    def setup_for_query(self, query: SqlQuery) -> None:
        self.query = query

    def setup_for_stats_container(self, stats_container: StatsContainer) -> None:
        """Configures the statistics container that contains the actual frequencies and bounds to use.

        Parameters
        ----------
        stats_container : StatsContainer
            The statistics to use
        """
        self.stats_container = stats_container

    def generate_subquery_for(self, join: AbstractPredicate, join_graph: JoinGraph) -> bool:
        # A branch only makes sense once at least one "real" join has happened.
        if join_graph.count_consumed_tables() < 2:
            return False

        stats = self.stats_container
        for lhs_col, rhs_col in join.join_partners():
            lhs_table, rhs_table = lhs_col.table, rhs_col.table
            # Figure out which side acts as the foreign key partner; skip pure n:m edges entirely.
            if join_graph.is_pk_fk_join(lhs_table, rhs_table):
                fk_table = lhs_table
            elif join_graph.is_pk_fk_join(rhs_table, lhs_table):
                fk_table = rhs_table
            else:
                continue

            # Branch as soon as one pk/fk join tightens the bound below the plain base table estimate.
            if stats.upper_bounds[fk_table] < stats.base_table_estimates[fk_table]:
                return True

        return False

    def describe(self) -> dict:
        return {"name": "defensive"}
|
|
950
|
+
|
|
951
|
+
|
|
952
|
+
class UESJoinOrderOptimizer(JoinOrderOptimization):
|
|
953
|
+
"""Implementation of the UES join order algorithm.
|
|
954
|
+
|
|
955
|
+
Our implementation expands upon the original algorithm in a number of ways. These are used to enable a variation of
|
|
956
|
+
different policies during optimization, and to apply the algorithm to a larger set of queries. See the module documentation
|
|
957
|
+
for details.
|
|
958
|
+
|
|
959
|
+
Parameters
|
|
960
|
+
----------
|
|
961
|
+
base_table_estimation : Optional[BaseTableEstimator], optional
|
|
962
|
+
A strategy to estimate the cardinalities/bounds of (filtered) base tables. Defaults to a native estimation by the
|
|
963
|
+
optimizer of the `database`.
|
|
964
|
+
join_estimation : Optional[JoinEstimator], optional
|
|
965
|
+
A strategy to estimate the upper bounds of intermediate joins. Defaults to the `UESBoundEstimator`.
|
|
966
|
+
subquery_policy : Optional[BranchingPolicy], optional
|
|
967
|
+
A strategy to determine when to insert subqueries into the resulting join tree. Defaults to the
|
|
968
|
+
`UESSubqueryPolicy`.
|
|
969
|
+
stats_container : Optional[StatsContainer], optional
|
|
970
|
+
The statistics used to calcualte the different upper bounds. These have to be compatible with the `join_estimation`.
|
|
971
|
+
Defaults to a `MaxFrequencyStatsContainer`.
|
|
972
|
+
pull_eager_pk_tables : bool, optional
|
|
973
|
+
How to deal with chains of primary key/foreign key joins (joins where the primary key table for one join acts as the
|
|
974
|
+
foreign key in another join). By default, only the first primary key/foreign key join is used as a filter. If eager
|
|
975
|
+
pulling is enabled, the subsequent primary key filters are also included.
|
|
976
|
+
database : Optional[db.Database], optional
|
|
977
|
+
The database whose statistics should be used. The database has to be configured appropriately already (e.g. regarding
|
|
978
|
+
the usage of emulated statistics). If this parameter is omitted, it is inferred from the `db.DatabasePool`.
|
|
979
|
+
verbose : bool, optional
|
|
980
|
+
Whether to log internal progress and bound statistics. This is off by default.
|
|
981
|
+
|
|
982
|
+
References
|
|
983
|
+
----------
|
|
984
|
+
|
|
985
|
+
.. A. Hertzschuch et al.: "Simplicity Done Right for Join Ordering", CIDR'2021
|
|
986
|
+
"""
|
|
987
|
+
|
|
988
|
+
def __init__(
    self,
    *,
    base_table_estimation: Optional[
        CardinalityEstimator | BaseTableEstimator
    ] = None,
    join_estimation: Optional[JoinEstimator] = None,
    subquery_policy: Optional[BranchingPolicy] = None,
    stats_container: Optional[StatsContainer] = None,
    pull_eager_pk_tables: bool = False,
    database: Optional[Database] = None,
    verbose: bool = False,
) -> None:
    """Sets up the UES optimizer, falling back to the default UES components for omitted parameters."""
    super().__init__()
    # Resolve the target database first -- all default components depend on it.
    self.database = (
        database if database else DatabasePool().get_instance().current_database()
    )

    # Generic cardinality estimators are adapted to the base-table interface via a wrapper.
    if isinstance(base_table_estimation, CardinalityEstimator):
        self.base_table_estimation = _CardinalityEstimatorWrapper(
            base_table_estimation, target_db=self.database
        )
    elif base_table_estimation is None:
        self.base_table_estimation = NativeCardinalityEstimator(self.database)
    else:
        self.base_table_estimation = base_table_estimation

    if join_estimation:
        self.join_estimation = join_estimation
    else:
        self.join_estimation = UESBoundEstimator()

    if subquery_policy:
        self.subquery_policy = subquery_policy
    else:
        self.subquery_policy = UESSubqueryPolicy()

    if stats_container:
        self.stats_container = stats_container
    else:
        self.stats_container = MaxFrequencyStats(self.database.statistics())

    self._pull_eager_pk_tables = pull_eager_pk_tables
    self._logging_enabled = verbose
|
|
1029
|
+
|
|
1030
|
+
def optimize_join_order(self, query: SqlQuery) -> Optional[LogicalJoinTree]:
    """Computes the UES join order for the given query.

    Depending on the query structure, one of three specialized algorithms is used: a direct strategy for
    binary joins, the default UES algorithm for queries with free n:m joins, and a star-query strategy for
    pure primary key/foreign key queries. Queries with cross products are split into independent components
    that are optimized recursively and combined smallest-first.

    Parameters
    ----------
    query : SqlQuery
        The query to optimize. Must be an implicit query.

    Returns
    -------
    Optional[LogicalJoinTree]
        The optimized join order, or None for queries with fewer than two tables.

    Raises
    ------
    ValueError
        If the query is not an implicit query.
    JoinOrderOptimizationError
        If one of the cross-product components could not be optimized.
    """
    if not isinstance(query, ImplicitSqlQuery):
        raise ValueError("UES optimization only works for implicit queries for now")
    if len(query.tables()) < 2:
        return None

    # Wire up all components for the current query before the actual optimization starts.
    self.base_table_estimation.setup_for_query(query)
    self.stats_container.setup_for_query(query, self.base_table_estimation)
    self.join_estimation.setup_for_query(query)
    self.join_estimation.setup_for_stats(self.stats_container)
    self.subquery_policy.setup_for_query(query)
    # FIX: use hasattr instead of membership in dir() -- same intent, idiomatic and cheaper.
    if hasattr(self.subquery_policy, "setup_for_stats_container"):
        self.subquery_policy.setup_for_stats_container(self.stats_container)

    join_graph = JoinGraph(query, self.database.schema())

    if len(query.tables()) == 2:
        final_join_tree = self._binary_join_optimization(query, join_graph)
    elif join_graph.contains_cross_products():
        # cross-product query is reduced to multiple independent optimization passes
        optimized_components: list[LogicalJoinTree] = []
        for component in join_graph.join_components():
            # FIXME: join components might consist of single tables!
            optimized_component = self._clone().optimize_join_order(component.query)
            if not optimized_component:
                raise JoinOrderOptimizationError(component.query)
            optimized_components.append(optimized_component)

        # insert cross-products such that the smaller partitions are joined first
        # FIX: the former code called sorted() and discarded its result, leaving the components in their
        # original order. Sorting in place actually establishes the smallest-first ordering.
        optimized_components.sort(key=operator.attrgetter("annotation"))
        final_join_tree, *remaining_joins = optimized_components
        for remaining_join in remaining_joins:
            # Cross product: the output cardinality is the plain product of both input cardinalities.
            output_cardinality = (
                final_join_tree.annotation * remaining_join.annotation
            )
            final_join_tree = final_join_tree.join_with(
                remaining_join, annotation=output_cardinality
            )
    elif join_graph.contains_free_n_m_joins():
        final_join_tree = self._default_ues_optimizer(query, join_graph)
    else:
        final_join_tree = self._star_query_optimizer(query, join_graph)

    return final_join_tree
|
|
1074
|
+
|
|
1075
|
+
def describe(self) -> dict:
    """Provides a JSON-serializable representation of the optimizer and all of its configured components."""
    settings = {
        "base_table_estimation": self.base_table_estimation.describe(),
        "join_estimation": self.join_estimation.describe(),
        "subqueries": self.subquery_policy.describe(),
        "statistics": self.stats_container.describe(),
    }
    return {"name": "ues", "settings": settings}
|
|
1085
|
+
|
|
1086
|
+
def pre_check(self) -> OptimizationPreCheck:
    """Combines the requirement checks of all configured components with the general UES requirements."""
    component_checks = (
        self.base_table_estimation.pre_check(),
        self.join_estimation.pre_check(),
        self.subquery_policy.pre_check(),
    )
    # Components may return a falsy check to indicate "no requirements" -- drop those.
    relevant_checks = [component for component in component_checks if component]
    relevant_checks.append(UESOptimizationPreCheck)
    return merge_checks(relevant_checks)
|
|
1098
|
+
|
|
1099
|
+
def _default_ues_optimizer(
    self, query: SqlQuery, join_graph: JoinGraph
) -> LogicalJoinTree:
    """Implementation of our take on the UES algorithm for queries with n:m joins.

    The algorithm repeatedly selects the n:m join with the smallest upper bound, inserts the join partner
    (optionally as a subquery branch) and pulls in the applicable primary key/foreign key joins, until no free
    n:m joins remain.

    Parameters
    ----------
    query : SqlQuery
        The query to optimize.
    join_graph : JoinGraph
        The join graph of the input query. This structure is mutated during the algorithm.

    Returns
    -------
    LogicalJoinTree
        The resulting join tree

    Raises
    ------
    AssertionError
        If the iterative construction failed. This indicates a bug in the implementation of the algorithm, not
        a mistake by the user.
    """
    self._log_information("Using default UES optimizer")
    join_tree = LogicalJoinTree.empty()

    while join_graph.contains_free_n_m_joins():
        # Update the current upper bounds
        lowest_bound = np.inf
        lowest_bound_table = None
        for candidate_join in join_graph.available_n_m_join_paths(
            both_directions_on_initial=True
        ):
            candidate_table = candidate_join.target_table
            filter_estimate = self.stats_container.base_table_estimates[
                candidate_table
            ]
            # Each applicable pk/fk join may act as an additional filter on the candidate table.
            pk_fk_bounds = [
                self.join_estimation.estimate_for(
                    join_path.join_condition, join_graph
                )
                for join_path in join_graph.available_pk_fk_joins_for(
                    candidate_table
                )
            ]
            # The tightest of the filter estimate and all pk/fk bounds becomes the table's upper bound.
            candidate_min_bound = min([filter_estimate] + pk_fk_bounds)
            self.stats_container.upper_bounds[candidate_table] = candidate_min_bound

            if candidate_min_bound < lowest_bound:
                lowest_bound = candidate_min_bound
                lowest_bound_table = candidate_table
        self._log_information(
            ".. Current bounds: "
            + util.dicts.stringify(self.stats_container.upper_bounds)
        )

        if join_tree.is_empty():
            # First iteration: seed the join tree with the table of lowest bound and immediately apply all
            # of its (transitive) pk/fk joins, ordered by base table cardinality.
            join_tree = LogicalJoinTree.scan(
                lowest_bound_table, annotation=lowest_bound
            )
            join_graph.mark_joined(lowest_bound_table)
            self.stats_container.upper_bounds[join_tree] = lowest_bound
            pk_joins = join_graph.available_deep_pk_join_paths_for(
                lowest_bound_table, self._table_base_cardinality_ordering
            )
            for pk_join in pk_joins:
                target_table = pk_join.target_table
                base_cardinality = self.stats_container.base_table_estimates[
                    target_table
                ]
                join_bound = self.join_estimation.estimate_for(
                    pk_join.join_condition, join_graph
                )
                join_graph.mark_joined(target_table, pk_join.join_condition)
                join_tree = join_tree.join_with(
                    pk_join.target_table,
                    annotation=join_bound,
                    partner_annotation=base_cardinality,
                )
            self._log_optimization_progress(
                "Initial table selection", lowest_bound_table, pk_joins
            )
            continue

        # Subsequent iterations: pick the free n:m join with the smallest bound next.
        selected_candidate: JoinPath | None = None
        lowest_bound = np.inf
        bounds_log: dict[JoinPath, float] = {}
        for candidate_join in join_graph.available_n_m_join_paths():
            candidate_bound = self.join_estimation.estimate_for(
                candidate_join.join_condition, join_graph
            )
            bounds_log[candidate_join] = candidate_bound
            if candidate_bound < lowest_bound:
                selected_candidate = candidate_join
                lowest_bound = candidate_bound
        self._log_information(f".. n:m join bounds: {bounds_log}")

        # Ask the branching policy whether any of the direct pk/fk joins on the new table warrants a subquery.
        direct_pk_joins = join_graph.available_pk_fk_joins_for(
            selected_candidate.target_table
        )
        create_subquery = any(
            self.subquery_policy.generate_subquery_for(
                pk_join.join_condition, join_graph
            )
            for pk_join in direct_pk_joins
        )
        candidate_table = selected_candidate.target_table
        # Eager pulling also follows pk tables that act as fk partners in later joins.
        all_pk_joins = (
            join_graph.available_deep_pk_join_paths_for(candidate_table)
            if self._pull_eager_pk_tables
            else join_graph.available_pk_fk_joins_for(candidate_table)
        )
        candidate_base_cardinality = self.stats_container.base_table_estimates[
            candidate_table
        ]
        self._log_optimization_progress(
            "n:m join",
            candidate_table,
            all_pk_joins,
            join_condition=selected_candidate.join_condition,
            subquery_join=create_subquery,
        )
        if create_subquery:
            # Build the pk/fk filters in a separate branch first, then join the branch as a whole.
            subquery_tree = JoinTree.scan(
                candidate_table, annotation=candidate_base_cardinality
            )
            join_graph.mark_joined(candidate_table)
            subquery_tree = self._insert_pk_joins(
                query, all_pk_joins, subquery_tree, join_graph
            )

            join_tree = join_tree.join_with(subquery_tree, annotation=lowest_bound)
            self.stats_container.upper_bounds[join_tree] = lowest_bound

        else:
            # Linear case: join the candidate table directly, then append its pk/fk joins on top.
            join_tree = join_tree.join_with(
                candidate_table,
                annotation=lowest_bound,
                partner_annotation=candidate_base_cardinality,
            )
            join_graph.mark_joined(
                candidate_table, selected_candidate.join_condition
            )
            self.stats_container.upper_bounds[join_tree] = lowest_bound
            join_tree = self._insert_pk_joins(
                query, all_pk_joins, join_tree, join_graph
            )

        # The n:m join changes attribute frequencies of the intermediate -- let the statistics catch up.
        self.stats_container.trigger_frequency_update(
            join_tree, candidate_table, selected_candidate.join_condition
        )

    if join_graph.contains_free_tables():
        raise AssertionError("Join graph still has free tables remaining!")
    return join_tree
|
|
1254
|
+
|
|
1255
|
+
def _binary_join_optimization(
    self, query: ImplicitSqlQuery, join_graph: JoinGraph
) -> LogicalJoinTree:
    """Specialized optimization algorithm for queries with just a single join.

    Even with only one join, the algorithm is still meaningful to pick the inner and outer relation.
    Inspired by UES, the table with the smaller base table estimate becomes the outer relation. This routine
    is also used for smaller partitions of queries with cross products.

    Parameters
    ----------
    query : ImplicitSqlQuery
        The query to optimize
    join_graph : JoinGraph
        The join graph of the query. This structure is mutated during the algorithm.

    Returns
    -------
    LogicalJoinTree
        The resulting join tree
    """
    table1, table2 = query.tables()
    estimates = self.stats_container.base_table_estimates
    if estimates[table1] < estimates[table2]:
        small_table, large_table = table1, table2
    else:
        small_table, large_table = table2, table1

    join_predicate = query.predicates().joins_between(large_table, small_table)
    join_bound = self.join_estimation.estimate_for(join_predicate, join_graph)

    # Scan the larger table first; the smaller one is joined on top.
    join_tree = LogicalJoinTree.scan(large_table, annotation=estimates[large_table])
    return join_tree.join_with(
        small_table,
        annotation=join_bound,
        partner_annotation=estimates[small_table],
    )
|
|
1297
|
+
|
|
1298
|
+
def _star_query_optimizer(
    self, query: ImplicitSqlQuery, join_graph: JoinGraph
) -> LogicalJoinTree:
    """Join ordering algorithm for star queries (i.e. queries consisting purely of primary key/foreign key joins).

    Inspired by UES, the algorithm greedily inserts the table that guarantees the smallest upper bound next.

    Parameters
    ----------
    query : ImplicitSqlQuery
        The query to optimize
    join_graph : JoinGraph
        The join graph of the input query. This structure is mutated during the algorithm.

    Returns
    -------
    LogicalJoinTree
        The resulting join tree
    """
    self._log_information("Using star query optimizer")

    def cheapest_available_join():
        # Scan all currently available join paths and return the one with the smallest bound.
        best_bound = np.inf
        best_join = None
        for join_path in join_graph.available_join_paths():
            path_bound = self.join_estimation.estimate_for(
                join_path.join_condition, join_graph
            )
            if path_bound < best_bound:
                best_bound = path_bound
                best_join = join_path
        return best_join, best_bound

    # Initial table / join selection: seed the tree with the start table of the cheapest join.
    initial_join, initial_bound = cheapest_available_join()
    start_table = initial_join.start_table
    start_card = self.stats_container.base_table_estimates[start_table]
    join_tree = LogicalJoinTree.scan(start_table, annotation=start_card)
    join_graph.mark_joined(start_table)
    join_tree = self._apply_pk_fk_join(
        query,
        initial_join,
        join_bound=initial_bound,
        join_graph=join_graph,
        current_join_tree=join_tree,
    )

    # Join partner selection: greedily pull in the cheapest remaining join until all tables are consumed.
    while join_graph.contains_free_tables():
        next_join, next_bound = cheapest_available_join()
        join_tree = self._apply_pk_fk_join(
            query,
            next_join,
            join_bound=next_bound,
            join_graph=join_graph,
            current_join_tree=join_tree,
        )

    return join_tree
|
|
1362
|
+
|
|
1363
|
+
def _table_base_cardinality_ordering(
    self, table: TableReference, join_edge: dict
) -> int:
    """Sort key used by the join graph to order multiple primary key tables of a foreign key join.

    Primary key tables are processed in ascending order of their base table estimates.

    Parameters
    ----------
    table : TableReference
        The table for which the cardinality should be retrieved.
    join_edge : dict
        The join graph edge describing the current join. Ignored by the calculation; it is only part of the
        signature because the join graph's callback interface requires it.

    Returns
    -------
    int
        An order index based on the cardinality estimate of the table

    See Also
    --------
    joingraph.JoinGraph.available_deep_pk_join_paths_for
    """
    return self.stats_container.base_table_estimates[table]
|
|
1389
|
+
|
|
1390
|
+
def _apply_pk_fk_join(
|
|
1391
|
+
self,
|
|
1392
|
+
query: SqlQuery,
|
|
1393
|
+
pk_fk_join: JoinPath,
|
|
1394
|
+
*,
|
|
1395
|
+
join_bound: int,
|
|
1396
|
+
join_graph: JoinGraph,
|
|
1397
|
+
current_join_tree: LogicalJoinTree,
|
|
1398
|
+
) -> LogicalJoinTree:
|
|
1399
|
+
"""Includes a specific pk/fk join into a join tree, taking care of all necessary updates.
|
|
1400
|
+
|
|
1401
|
+
Parameters
|
|
1402
|
+
----------
|
|
1403
|
+
query : SqlQuery
|
|
1404
|
+
The query that is being optimized
|
|
1405
|
+
pk_fk_join : JoinPath
|
|
1406
|
+
The actual join that should be performed
|
|
1407
|
+
join_bound : int
|
|
1408
|
+
The calculated upper bound of the join
|
|
1409
|
+
join_graph : JoinGraph
|
|
1410
|
+
The join graph of the query. This structure is mutated as part of the update
|
|
1411
|
+
current_join_tree : LogicalJoinTree
|
|
1412
|
+
The join order that has been determined so far
|
|
1413
|
+
|
|
1414
|
+
Returns
|
|
1415
|
+
-------
|
|
1416
|
+
LogicalJoinTree
|
|
1417
|
+
An updated join tree that includes the given join as the last (i.e. top-most) join.
|
|
1418
|
+
"""
|
|
1419
|
+
target_table = pk_fk_join.target_table
|
|
1420
|
+
target_cardinality = self.stats_container.base_table_estimates[target_table]
|
|
1421
|
+
updated_join_tree = current_join_tree.join_with(
|
|
1422
|
+
target_table, annotation=join_bound, partner_annotation=target_cardinality
|
|
1423
|
+
)
|
|
1424
|
+
join_graph.mark_joined(target_table, pk_fk_join.join_condition)
|
|
1425
|
+
self.stats_container.upper_bounds[updated_join_tree] = join_bound
|
|
1426
|
+
return updated_join_tree
|
|
1427
|
+
|
|
1428
|
+
def _insert_pk_joins(
|
|
1429
|
+
self,
|
|
1430
|
+
query: SqlQuery,
|
|
1431
|
+
pk_joins: Iterable[JoinPath],
|
|
1432
|
+
join_tree: LogicalJoinTree,
|
|
1433
|
+
join_graph: JoinGraph,
|
|
1434
|
+
) -> LogicalJoinTree:
|
|
1435
|
+
"""Generalization of `_apply_pk_fk_join` to multiple join paths.
|
|
1436
|
+
|
|
1437
|
+
Parameters
|
|
1438
|
+
----------
|
|
1439
|
+
query : SqlQuery
|
|
1440
|
+
The query that is being optimized
|
|
1441
|
+
pk_joins : Iterable[joingraph.JoinPath]
|
|
1442
|
+
The joins that should be included in the join tree, in the order in which they are inserted
|
|
1443
|
+
join_tree : jointree.LogicalJoinTree
|
|
1444
|
+
The join order that has been determined so far
|
|
1445
|
+
join_graph : joingraph.JoinGraph
|
|
1446
|
+
The join graph of the query. This structure is mutated as part of the update
|
|
1447
|
+
|
|
1448
|
+
Returns
|
|
1449
|
+
-------
|
|
1450
|
+
jointree.LogicalJoinTree
|
|
1451
|
+
An updated join tree that includes all of the join paths. Join paths that appear earlier in the iterable are
|
|
1452
|
+
inserted deeper within the tree.
|
|
1453
|
+
"""
|
|
1454
|
+
# TODO: refactor in terms of _apply_pk_fk_join
|
|
1455
|
+
for pk_join in pk_joins:
|
|
1456
|
+
pk_table = pk_join.target_table
|
|
1457
|
+
if not join_graph.is_free_table(pk_table):
|
|
1458
|
+
continue
|
|
1459
|
+
pk_join_bound = self.join_estimation.estimate_for(
|
|
1460
|
+
pk_join.join_condition, join_graph
|
|
1461
|
+
)
|
|
1462
|
+
pk_base_cardinality = self.stats_container.base_table_estimates[pk_table]
|
|
1463
|
+
join_tree = join_tree.join_with(
|
|
1464
|
+
pk_table,
|
|
1465
|
+
annotation=pk_join_bound,
|
|
1466
|
+
partner_annotation=pk_base_cardinality,
|
|
1467
|
+
)
|
|
1468
|
+
join_graph.mark_joined(pk_table, pk_join.join_condition)
|
|
1469
|
+
self.stats_container.upper_bounds[join_tree] = pk_join_bound
|
|
1470
|
+
return join_tree
|
|
1471
|
+
|
|
1472
|
+
def _clone(self) -> UESJoinOrderOptimizer:
    """Creates an independent optimizer instance that mirrors this optimizer's configuration.

    Returns
    -------
    UESJoinOrderOptimizer
        The cloned optimizer. All configurable components are shallow-copied, only the database connection is
        shared with this instance.
    """
    cloned_components = {
        "base_table_estimation": copy.copy(self.base_table_estimation),
        "join_estimation": copy.copy(self.join_estimation),
        "subquery_policy": copy.copy(self.subquery_policy),
        "stats_container": copy.copy(self.stats_container),
        "database": self.database,
    }
    return UESJoinOrderOptimizer(**cloned_components)
|
|
1487
|
+
|
|
1488
|
+
def _log_information(self, info: str) -> None:
|
|
1489
|
+
"""Displays arbitrary information.
|
|
1490
|
+
|
|
1491
|
+
The current implementation of this methods writes to *stdout* directly. If logging is disabled, no information is
|
|
1492
|
+
printed.
|
|
1493
|
+
|
|
1494
|
+
Parameters
|
|
1495
|
+
----------
|
|
1496
|
+
info : str
|
|
1497
|
+
The information to display
|
|
1498
|
+
"""
|
|
1499
|
+
if self._logging_enabled:
|
|
1500
|
+
print(info)
|
|
1501
|
+
|
|
1502
|
+
def _log_optimization_progress(
|
|
1503
|
+
self,
|
|
1504
|
+
phase: str,
|
|
1505
|
+
candidate_table: TableReference,
|
|
1506
|
+
pk_joins: Iterable[JoinPath],
|
|
1507
|
+
*,
|
|
1508
|
+
join_condition: AbstractPredicate | None = None,
|
|
1509
|
+
subquery_join: bool | None = None,
|
|
1510
|
+
) -> None:
|
|
1511
|
+
"""Displays the current optimizer state.
|
|
1512
|
+
|
|
1513
|
+
The current implementation of this method writes to *stdout* directly. If logging is disabled, no information is
|
|
1514
|
+
printed.
|
|
1515
|
+
|
|
1516
|
+
Parameters
|
|
1517
|
+
----------
|
|
1518
|
+
phase : str
|
|
1519
|
+
The phase of the UES algorithm, e.g. initial table selection of n:m join execution
|
|
1520
|
+
candidate_table : TableReference
|
|
1521
|
+
The table that is considered as the next join partner
|
|
1522
|
+
pk_joins : Iterable[JoinPath]
|
|
1523
|
+
Primary key joins that should be applied to the candidate table
|
|
1524
|
+
join_condition : AbstractPredicate | None, optional
|
|
1525
|
+
The join condition that was used to find the candidate table. Can be ``None`` to omit this information, e.g. when
|
|
1526
|
+
it is not applicable for the current phase.
|
|
1527
|
+
subquery_join : bool | None, optional
|
|
1528
|
+
Whether the primary key tables should be joined before the actual n:m join. Can be ``None`` to omit this
|
|
1529
|
+
information, e.g. when it is not applicable for the current phase.
|
|
1530
|
+
"""
|
|
1531
|
+
# TODO: use proper logging instead of print() calls
|
|
1532
|
+
if not self._logging_enabled:
|
|
1533
|
+
return
|
|
1534
|
+
log_components = [
|
|
1535
|
+
phase,
|
|
1536
|
+
"::",
|
|
1537
|
+
str(candidate_table),
|
|
1538
|
+
"with PK joins",
|
|
1539
|
+
str(pk_joins),
|
|
1540
|
+
]
|
|
1541
|
+
if join_condition:
|
|
1542
|
+
log_components.extend(["on condition", str(join_condition)])
|
|
1543
|
+
if subquery_join is not None:
|
|
1544
|
+
log_components.append(
|
|
1545
|
+
"with subquery" if subquery_join else "without subquery"
|
|
1546
|
+
)
|
|
1547
|
+
log_message = " ".join(log_components)
|
|
1548
|
+
print(log_message)
|
|
1549
|
+
|
|
1550
|
+
|
|
1551
|
+
class UESOperatorSelection(PhysicalOperatorSelection):
    """Operator selection strategy used by the UES algorithm.

    Join ordering is the sole focus of UES, so no sophisticated operator selection takes place. The only action
    performed by this strategy is to disable nested-loop joins, which can typically lead to performance degradation.
    As a consequence, the vast majority of joins in a typical database system will be executed as hash joins, which
    provide the most robust behavior.

    Parameters
    ----------
    database : Database
        The target database on which the optimized query should be executed. Knowing the database allows a graceful
        fallback if it does not support a nested-loop join in the first place. If this situation occurs, nothing is
        disabled.

    Notes
    -----
    Even though the UES join order optimizer only produces logical join trees and never physical query plans, this
    selection algorithm handles physical plans gracefully: all former operator assignments that do not contradict the
    no-nested-loop-join rule are retained.
    """

    def __init__(self, database: Database) -> None:
        super().__init__()
        self.database = database

    def select_physical_operators(
        self, query: SqlQuery, join_order: Optional[JoinTree]
    ) -> PhysicalOperatorAssignment:
        assignment = PhysicalOperatorAssignment()
        nlj_supported = self.database.hinting().supports_hint(JoinOperator.NestedLoopJoin)
        if nlj_supported:
            # globally forbid NLJ, discarding any fine-grained choices that would re-enable it
            assignment.set_operator_enabled_globally(
                JoinOperator.NestedLoopJoin,
                False,
                overwrite_fine_grained_selection=True,
            )
        return assignment

    def describe(self) -> dict:
        return {"name": "ues"}
|
|
1590
|
+
|
|
1591
|
+
|
|
1592
|
+
UESOptimizationPreCheck = merge_checks(
    ImplicitQueryPreCheck(),
    EquiJoinPreCheck(),
    DependentSubqueryPreCheck(),
    SetOperationsPreCheck(),
    VirtualTablesPreCheck(),
)
"""Check for all query features that UES does (not) support.

This check asserts that the following criteria are met:

- the input query is an *implicit* SQL query (see qal for details)
- all join predicates are binary equi joins
- there are no dependent subqueries
- there are no set operations (e.g. ``UNION`` or ``INTERSECT``)
- there are no virtual tables, including no CTEs
"""
|