postbound-0.19.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- postbound/__init__.py +211 -0
- postbound/_base.py +6 -0
- postbound/_bench.py +1012 -0
- postbound/_core.py +1153 -0
- postbound/_hints.py +1373 -0
- postbound/_jointree.py +1079 -0
- postbound/_pipelines.py +1121 -0
- postbound/_qep.py +1986 -0
- postbound/_stages.py +876 -0
- postbound/_validation.py +734 -0
- postbound/db/__init__.py +72 -0
- postbound/db/_db.py +2348 -0
- postbound/db/_duckdb.py +785 -0
- postbound/db/mysql.py +1195 -0
- postbound/db/postgres.py +4216 -0
- postbound/experiments/__init__.py +12 -0
- postbound/experiments/analysis.py +674 -0
- postbound/experiments/benchmarking.py +54 -0
- postbound/experiments/ceb.py +877 -0
- postbound/experiments/interactive.py +105 -0
- postbound/experiments/querygen.py +334 -0
- postbound/experiments/workloads.py +980 -0
- postbound/optimizer/__init__.py +92 -0
- postbound/optimizer/__init__.pyi +73 -0
- postbound/optimizer/_cardinalities.py +369 -0
- postbound/optimizer/_joingraph.py +1150 -0
- postbound/optimizer/dynprog.py +1825 -0
- postbound/optimizer/enumeration.py +432 -0
- postbound/optimizer/native.py +539 -0
- postbound/optimizer/noopt.py +54 -0
- postbound/optimizer/presets.py +147 -0
- postbound/optimizer/randomized.py +650 -0
- postbound/optimizer/tonic.py +1479 -0
- postbound/optimizer/ues.py +1607 -0
- postbound/qal/__init__.py +343 -0
- postbound/qal/_qal.py +9678 -0
- postbound/qal/formatter.py +1089 -0
- postbound/qal/parser.py +2344 -0
- postbound/qal/relalg.py +4257 -0
- postbound/qal/transform.py +2184 -0
- postbound/shortcuts.py +70 -0
- postbound/util/__init__.py +46 -0
- postbound/util/_errors.py +33 -0
- postbound/util/collections.py +490 -0
- postbound/util/dataframe.py +71 -0
- postbound/util/dicts.py +330 -0
- postbound/util/jsonize.py +68 -0
- postbound/util/logging.py +106 -0
- postbound/util/misc.py +168 -0
- postbound/util/networkx.py +401 -0
- postbound/util/numbers.py +438 -0
- postbound/util/proc.py +107 -0
- postbound/util/stats.py +37 -0
- postbound/util/system.py +48 -0
- postbound/util/typing.py +35 -0
- postbound/vis/__init__.py +5 -0
- postbound/vis/fdl.py +69 -0
- postbound/vis/graphs.py +48 -0
- postbound/vis/optimizer.py +538 -0
- postbound/vis/plots.py +84 -0
- postbound/vis/tonic.py +70 -0
- postbound/vis/trees.py +105 -0
- postbound-0.19.0.dist-info/METADATA +355 -0
- postbound-0.19.0.dist-info/RECORD +67 -0
- postbound-0.19.0.dist-info/WHEEL +5 -0
- postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
- postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/experiments/__init__.py
@@ -0,0 +1,12 @@
+"""Contains utilities to conveniently execute individual queries or entire workloads and to evaluate their results.
+
+This module provides direct access to some frequently-used functionality, mostly related to workload modelling and execution.
+Other modules need to be imported explicitly.
+
+Specifically, this package provides the following modules:
+
+- `analysis` provides a loose collection of utilities and formulas somewhat related to query optimization
+- `querygen` provides a simple random query generator
+- `ceb` provides an implementation of the Cardinality Estimation Benchmark workload generator
+- `interactive` contains a simple interactive join order optimizer
+"""
postbound/experiments/analysis.py
@@ -0,0 +1,674 @@
+"""Provides a collection of utilities related to query optimization."""
+
+from __future__ import annotations
+
+import collections
+import itertools
+import math
+from collections.abc import Collection
+from dataclasses import dataclass
+from typing import Any, Literal, Optional
+
+import Levenshtein
+
+from .. import util
+from .._core import ColumnReference, PhysicalOperator, TableReference
+from .._jointree import JoinTree
+from .._qep import QueryPlan
+from ..db._db import Database, DatabasePool
+from ..qal import parser, transform
+from ..qal._qal import (
+    AbstractPredicate,
+    BinaryPredicate,
+    ColumnExpression,
+    CompoundPredicate,
+    SqlQuery,
+    StaticValueExpression,
+    Where,
+)
+from ..util import StateError
+
+
+def possible_plans_bound(
+    query: SqlQuery,
+    *,
+    join_operators: set[str] = {"nested-loop join", "hash join", "sort-merge join"},
+    scan_operators: set[str] = {"sequential scan", "index scan"},
+) -> int:
+    """Computes a quick upper bound on the maximum number of possible query execution plans for a given query.
+
+    This upper bound is a very coarse one, based on three assumptions:
+
+    1. any join sequence (even involving cross-products) of any form (i.e. right-deep, bushy, ...) is allowed
+    2. the choice of scan operators and join operators can be varied freely
+    3. each table can be scanned using arbitrary operators
+
+    The number of real-world query execution plans will typically be much smaller, because cross-products are only
+    used if really necessary and the selected join operator influences the scan operators and vice-versa.
+
+    Parameters
+    ----------
+    query : SqlQuery
+        The query for which the bound should be computed
+    join_operators : set[str], optional
+        The allowed join operators, by default {"nested-loop join", "hash join", "sort-merge join"}
+    scan_operators : set[str], optional
+        The allowed scan operators, by default {"sequential scan", "index scan"}
+
+    Returns
+    -------
+    int
+        An upper bound on the number of possible query execution plans
+    """
+    n_tables = len(query.tables())
+
+    join_orders = util.stats.catalan_number(n_tables)
+    # each of the (n_tables - 1) joins and each of the n_tables scans picks its operator independently
+    joins = len(join_operators) ** (n_tables - 1)
+    scans = len(scan_operators) ** n_tables
+
+    return join_orders * joins * scans
+
+
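Note: a quick sanity check of the bound for a three-table query, assuming `util.stats.catalan_number(3)` evaluates to the Catalan number C_3 = 5:

    join_orders = 5         # catalan_number(3), assumed
    joins = 3 ** (3 - 1)    # 9 ways to pick operators for the two joins
    scans = 2 ** 3          # 8 ways to pick operators for the three scans
    print(join_orders * joins * scans)  # 360 plans at most
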
+def actual_plan_cost(
+    query: SqlQuery, analyze_plan: QueryPlan, *, database: Optional[Database] = None
+) -> float:
+    """Utility to compute the true cost of a query plan based on the actual cardinalities.
+
+    Parameters
+    ----------
+    query : SqlQuery
+        The query to analyze
+    analyze_plan : QueryPlan
+        The executed query plan which also contains the true cardinalities
+    database : Optional[Database], optional
+        The database providing the cost model. If omitted, the database is inferred from the database pool.
+
+    Returns
+    -------
+    float
+        The cost of the plan according to the database's cost model, evaluated on the actual cardinalities
+    """
+    if not analyze_plan.is_analyze():
+        raise ValueError("The provided plan is not an ANALYZE plan")
+    database = (
+        database
+        if database is not None
+        else DatabasePool().get_instance().current_database()
+    )
+    hinted_query = database.hinting().generate_hints(
+        query, analyze_plan.with_actual_card()
+    )
+    return database.optimizer().cost_estimate(hinted_query)
+
+
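Note: a minimal usage sketch. The call producing the ANALYZE plan is an assumption for illustration, not an API defined in this file:

    # db = DatabasePool().get_instance().current_database()  # previously registered database
    # plan = db.optimizer().query_plan(query)  # hypothetical: must yield an ANALYZE plan
    # true_cost = actual_plan_cost(query, plan, database=db)
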
+def text_diff(left: str, right: str, *, sep: str = " | ") -> str:
+    """Merges two text snippets to allow for a comparison on a per-line basis.
+
+    The two snippets are split into their individual lines and then merged back together. If one snippet contains
+    more lines than the other, the missing lines are treated as empty.
+
+    Parameters
+    ----------
+    left : str
+        The text snippet to display on the left-hand side.
+    right : str
+        The text snippet to display on the right-hand side.
+    sep : str, optional
+        The separator to use between the left and right text snippets, by default `` | ``.
+
+    Returns
+    -------
+    str
+        The combined text snippet
+    """
+    left_lines = left.splitlines()
+    right_lines = right.splitlines()
+
+    max_left_len = max((len(line) for line in left_lines), default=0)
+
+    # zip_longest ensures that no lines are silently dropped if one snippet is longer than the other
+    merged_lines = [
+        f"{left_line.ljust(max_left_len)}{sep}{right_line}"
+        for left_line, right_line in itertools.zip_longest(
+            left_lines, right_lines, fillvalue=""
+        )
+    ]
+    return "\n".join(merged_lines)
+
+
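Note: a short example of the merged output. The left column is padded to the longest left-hand line, and the unmatched line is paired with an empty string:

    print(text_diff("foo\nbar baz", "qux"))
    # foo     | qux
    # bar baz |
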
+def star_query_cardinality(
+    query: SqlQuery,
+    fact_table_pk_column: ColumnReference,
+    *,
+    database: Optional[Database] = None,
+    verbose: bool = False,
+) -> int:
+    """Utility function to manually compute the cardinality of a star query.
+
+    This function is intended for situations where the database is unable to compute the cardinality because the intermediates
+    involved in the query become too large or the query plans are simply too bad. It operates by manually computing the number
+    of output tuples for each of the entries in the fact table by sequentially joining the fact table with each dimension
+    table.
+
+    Parameters
+    ----------
+    query : SqlQuery
+        The query to compute the cardinality for. This is assumed to be a **SELECT \\*** query and the actual **SELECT** clause
+        is ignored completely.
+    fact_table_pk_column : ColumnReference
+        The fact table's primary key column. All dimension tables must perform an equi-join on this column.
+    database : Optional[Database], optional
+        The actual database. If this is omitted, the current database from the database pool is used.
+    verbose : bool, optional
+        Whether progress information should be printed during the computation. If this is enabled, the function will report
+        every 1000th value processed.
+
+    Returns
+    -------
+    int
+        The cardinality (i.e. number of output tuples) of the query
+
+    Warnings
+    --------
+    Currently, this function works well for simple SPJ-based queries, more complicated features might lead to wrong results.
+    Similarly, only pure star queries are supported, i.e. there has to be one central fact table and each dimension table
+    performs exactly one equi-join with the fact table's primary key. There may not be additional joins on the dimension
+    tables. If such additional dimension joins exist, they have to be pre-processed (e.g. by introducing materialized views)
+    and the query has to be rewritten to operate on the views instead.
+    It is the user's responsibility to ensure that the query is well-formed in these regards.
+    """
+    logger = util.make_logger(verbose, prefix=util.timestamp)
+    database = (
+        DatabasePool().get_instance().current_database()
+        if database is None
+        else database
+    )
+    fact_table = (
+        fact_table_pk_column.table
+        if fact_table_pk_column.is_bound()
+        else database.schema().lookup_column(fact_table_pk_column, query.tables())
+    )
+    if fact_table is None:
+        raise ValueError(
+            f"Cannot infer fact table from column '{fact_table_pk_column}'"
+        )
+    fact_table_pk_column = fact_table_pk_column.bind_to(fact_table)
+
+    id_vals_query = parser.parse_query(f"""
+        SELECT {fact_table_pk_column}, COUNT(*) AS card
+        FROM {fact_table}
+        GROUP BY {fact_table_pk_column}""")
+    if query.predicates().filters_for(fact_table):
+        filter_clause = Where(query.predicates().filters_for(fact_table))
+        id_vals_query = transform.add_clause(id_vals_query, filter_clause)
+    id_vals: list[tuple[Any, int]] = database.execute_query(id_vals_query)
+
+    base_query_fragments: dict[AbstractPredicate, SqlQuery] = {}
+    for join_pred in query.predicates().joins_for(fact_table):
+        join_partner = join_pred.join_partners_of(fact_table)
+        if len(join_partner) != 1:
+            raise ValueError("Currently only singular joins are supported")
+
+        partner_table: TableReference = util.simplify(join_partner).table
+        query_fragment = transform.extract_query_fragment(
+            query, [fact_table, partner_table]
+        )
+        base_query_fragments[join_pred] = transform.as_count_star_query(query_fragment)
+
+    total_cardinality = 0
+    total_ids = len(id_vals)
+    for value_idx, (id_value, current_card) in enumerate(id_vals):
+        if value_idx % 1000 == 0:
+            logger("--", value_idx, "out of", total_ids, "values processed")
+
+        id_filter = BinaryPredicate.equal(
+            ColumnExpression(fact_table_pk_column),
+            StaticValueExpression(id_value),
+        )
+
+        for join_pred, base_query in base_query_fragments.items():
+            if current_card == 0:
+                break
+
+            expanded_predicate = CompoundPredicate.create_and(
+                [base_query.where_clause.predicate, id_filter]
+            )
+            expanded_where_clause = Where(expanded_predicate)
+
+            dimension_query = transform.replace_clause(
+                base_query, expanded_where_clause
+            )
+            dimension_card = database.execute_query(dimension_query)
+
+            current_card *= dimension_card
+
+        total_cardinality += current_card
+
+    return total_cardinality
+
+
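Note: a hypothetical invocation sketch for a star schema with fact table `sales` and dimensions `customers` and `products`; the schema and query text are made up for illustration:

    # query = parser.parse_query(
    #     "SELECT * FROM sales s"
    #     " JOIN customers c ON s.id = c.sale_id"
    #     " JOIN products p ON s.id = p.sale_id"
    # )
    # pk_column is the ColumnReference for s.id
    # card = star_query_cardinality(query, pk_column, verbose=True)
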
+def jointree_similarity_topdown(
+    a: JoinTree, b: JoinTree, *, symmetric: bool = False, gamma: float = 1.1
+) -> float:
+    """Computes the similarity of two join trees using a top-down approach.
+
+    Parameters
+    ----------
+    a : JoinTree
+        The first join tree
+    b : JoinTree
+        The second join tree
+    symmetric : bool, optional
+        Whether the calculation should be symmetric. If true, the occurrence of joins in different branches is not
+        penalized. See Notes for details.
+    gamma : float, optional
+        The reinforcement factor to prioritize similarity of earlier (i.e. deeper) joins. The higher the value, the
+        stronger the amplification, by default 1.1
+
+    Returns
+    -------
+    float
+        An artificial similarity score. Higher values indicate larger similarity.
+
+    Notes
+    -----
+    TODO: add discussion of the algorithm
+    """
+    tables_a, tables_b = a.tables(), b.tables()
+    total_n_tables = len(tables_a | tables_b)
+    normalization_factor = 1 / total_n_tables
+
+    # similarity between two leaf nodes
+    if len(tables_a) == 1 and len(tables_b) == 1:
+        return 1 if tables_a == tables_b else 0
+
+    # similarity between leaf node and intermediate node
+    if len(tables_a) == 1 or len(tables_b) == 1:
+        leaf_tree = a if len(tables_a) == 1 else b
+        intermediate_tree = b if leaf_tree is a else a
+
+        inner_score = util.jaccard(
+            leaf_tree.tables(), intermediate_tree.inner_child.tables()
+        )
+        outer_score = util.jaccard(
+            leaf_tree.tables(), intermediate_tree.outer_child.tables()
+        )
+
+        return normalization_factor * max(inner_score, outer_score)
+
+    # similarity between two intermediate nodes
+    a_inner, a_outer = a.inner_child, a.outer_child
+    b_inner, b_outer = b.inner_child, b.outer_child
+
+    symmetric_score = util.jaccard(a_inner.tables(), b_inner.tables()) + util.jaccard(
+        a_outer.tables(), b_outer.tables()
+    )
+    crossover_score = (
+        util.jaccard(a_inner.tables(), b_outer.tables())
+        + util.jaccard(a_outer.tables(), b_inner.tables())
+        if symmetric
+        else 0
+    )
+    node_score = normalization_factor * max(symmetric_score, crossover_score)
+
+    if symmetric and crossover_score > symmetric_score:
+        child_score = jointree_similarity_topdown(
+            a_inner, b_outer, symmetric=symmetric, gamma=gamma
+        ) + jointree_similarity_topdown(
+            a_outer, b_inner, symmetric=symmetric, gamma=gamma
+        )
+    else:
+        child_score = jointree_similarity_topdown(
+            a_inner, b_inner, symmetric=symmetric, gamma=gamma
+        ) + jointree_similarity_topdown(
+            a_outer, b_outer, symmetric=symmetric, gamma=gamma
+        )
+
+    return node_score + gamma * child_score
+
+
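Note: the score is not normalized to [0, 1]: the gamma term compounds across recursion levels, so agreement on deeper (earlier) joins dominates. Tracing the code above for two identical left-deep trees over tables A, B, C with the default gamma=1.1:

    # subtree (A x B): node = 1/2 * (1 + 1) = 1.0, leaves contribute 1 + 1
    #                  -> 1.0 + 1.1 * 2 = 3.2
    # leaf C:          1.0
    # root:            node = 1/3 * (1 + 1) ~ 0.667
    #                  -> 0.667 + 1.1 * (3.2 + 1.0) ~ 5.29
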
+def jointree_similarity_bottomup(a: JoinTree, b: JoinTree) -> float:
+    """Computes the similarity of two join trees based on a bottom-up approach.
+
+    Parameters
+    ----------
+    a : JoinTree
+        The first join tree to compare
+    b : JoinTree
+        The second join tree to compare
+
+    Returns
+    -------
+    float
+        An artificial similarity score in [0, 1]. Higher values indicate larger similarity.
+
+    Notes
+    -----
+    TODO: add discussion of the algorithm
+    """
+    a_subtrees = {join.tables() for join in a.iterjoins()}
+    b_subtrees = {join.tables() for join in b.iterjoins()}
+    return util.jaccard(a_subtrees, b_subtrees)
+
+
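Note: a worked example of the bottom-up score. For a = ((A x B) x C) and b = ((A x C) x B), the join signatures are {A,B}, {A,B,C} versus {A,C}, {A,B,C}, so one out of three distinct signatures is shared:

    a_subtrees = {frozenset("AB"), frozenset("ABC")}
    b_subtrees = {frozenset("AC"), frozenset("ABC")}
    print(len(a_subtrees & b_subtrees) / len(a_subtrees | b_subtrees))  # 0.333...
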
+def linearized_levenshtein_distance(a: JoinTree, b: JoinTree) -> int:
+    """Computes the Levenshtein distance of the table sequences of two join trees.
+
+    Parameters
+    ----------
+    a : JoinTree
+        The first join tree to compare
+    b : JoinTree
+        The second join tree to compare
+
+    Returns
+    -------
+    int
+        The distance score. Higher values indicate larger distance.
+
+    References
+    ----------
+
+    .. Levenshtein distance: https://en.wikipedia.org/wiki/Levenshtein_distance
+    """
+    return Levenshtein.distance(a.itertables(), b.itertables())
+
+
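Note: for illustration, assuming `itertables` yields the base tables from left to right, ((A x B) x C) linearizes to the sequence A, B, C and ((A x C) x B) to A, C, B; their Levenshtein distance is 2 (two substitutions):

    import Levenshtein
    print(Levenshtein.distance("ABC", "ACB"))  # 2
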
+_DepthState = collections.namedtuple("_DepthState", ["current_level", "depths"])
+"""Keeps track of the current calculated depths of different base tables."""
+
+
+def _traverse_join_tree_depth(
+    current_node: JoinTree, current_depth: _DepthState
+) -> _DepthState:
+    """Calculates a new depth state for the current join tree node based on the current depth.
+
+    This is the handler method for `join_depth`.
+
+    Depending on the specific node, different calculations are applied:
+
+    - for base tables, a new entry of depth one is inserted into the depth state
+    - for intermediate nodes, the children are visited to integrate their depth states. Afterwards, their depth is
+      increased to incorporate the join
+
+    Parameters
+    ----------
+    current_node : JoinTree
+        The node whose depth information should be integrated
+    current_depth : _DepthState
+        The current depth state
+
+    Returns
+    -------
+    _DepthState
+        The updated depth state
+
+    Raises
+    ------
+    TypeError
+        If the node is neither a base table node, nor an intermediate join node. This indicates that the class
+        hierarchy of join tree nodes was expanded, and this method was not updated properly.
+    """
+    if current_node.is_scan():
+        return _DepthState(1, current_depth.depths | {current_node.base_table: 1})
+
+    if not current_node.is_join():
+        raise TypeError("Unknown current node type: " + str(current_node))
+
+    inner_child, outer_child = current_node.inner_child, current_node.outer_child
+    if current_node.is_base_join():
+        return _DepthState(
+            1,
+            current_depth.depths
+            | {inner_child.base_table: 1, outer_child.base_table: 1},
+        )
+    elif inner_child.is_scan():
+        outer_depth = _traverse_join_tree_depth(outer_child, current_depth)
+        updated_depth = outer_depth.current_level + 1
+        return _DepthState(
+            updated_depth, outer_depth.depths | {inner_child.base_table: updated_depth}
+        )
+    elif outer_child.is_scan():
+        inner_depth = _traverse_join_tree_depth(inner_child, current_depth)
+        updated_depth = inner_depth.current_level + 1
+        return _DepthState(
+            updated_depth, inner_depth.depths | {outer_child.base_table: updated_depth}
+        )
+    else:
+        inner_depth = _traverse_join_tree_depth(inner_child, current_depth)
+        outer_depth = _traverse_join_tree_depth(outer_child, current_depth)
+        updated_depth = max(inner_depth.current_level, outer_depth.current_level) + 1
+        return _DepthState(updated_depth, inner_depth.depths | outer_depth.depths)
+
+
+def join_depth(join_tree: JoinTree) -> dict[TableReference, int]:
+    """Calculates for each base table in a join tree the join index when it was integrated into an intermediate result.
+
+    For joins of two base tables, the depth value is 1. If a table is joined with the intermediate result of the base
+    table join, its depth is 2. Generally speaking, the depth of each table is 1 plus the maximum depth of any table
+    in the intermediate result that the new table is joined with.
+
+    Parameters
+    ----------
+    join_tree : JoinTree
+        The join tree for which the depths should be calculated.
+
+    Returns
+    -------
+    dict[TableReference, int]
+        A mapping from tables to their depth values.
+
+    Examples
+    --------
+    TODO add examples
+    """
+    if join_tree.is_empty():
+        return {}
+    return _traverse_join_tree_depth(join_tree, _DepthState(0, {})).depths
+
+
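Note: a worked example for the left-deep tree ((A x B) x C) x D, following the rules in the docstring above:

    # A x B is the base join                   -> depth(A) = depth(B) = 1
    # C joins that intermediate (level 1 + 1)  -> depth(C) = 2
    # D joins next (level 2 + 1)               -> depth(D) = 3
    # join_depth(tree) == {A: 1, B: 1, C: 2, D: 3}
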
+@dataclass
+class PlanChangeEntry:
+    """Models a single diff between two join trees.
+
+    The compared join trees are referred to as the left tree and the right tree, respectively.
+
+    Attributes
+    ----------
+    change_type : Literal["tree-structure", "join-direction", "physical-op", "card-est", "cost-est", "actual-card"]
+        Describes the precise difference between the trees. *tree-structure* indicates that the two trees are fundamentally
+        different. This occurs when the join orders are not the same. *join-direction* means that albeit the join orders are
+        the same, the roles in a specific join are reversed: the inner relation of one tree acts as the outer relation in the
+        other one and vice-versa. *physical-op* means that two structurally identical nodes (i.e. same join or base table)
+        differ in the assigned physical operator. *card-est* indicates that two structurally identical nodes (i.e. same join
+        or base table) differ in the estimated cardinality, while *cost-est* does the same, just for the estimated cost.
+        Likewise, *actual-card* captures differences in the actual (measured) cardinality.
+    left_state : frozenset[TableReference] | PhysicalOperator | float
+        Depending on the `change_type` this attribute describes the left tree. For example, for different tree structures,
+        these are the tables in the left subtree, for different physical operators, this is the operator assigned to the node
+        in the left tree and so on. For different join directions, this is the entire join node
+    right_state : frozenset[TableReference] | PhysicalOperator | float
+        Equivalent attribute to `left_state`, just for the right tree.
+    context : Optional[frozenset[TableReference]], optional
+        For different physical operators or cardinality estimates, this describes the intermediate that is different. This
+        attribute is unset by default.
+    """
+
+    change_type: Literal[
+        "tree-structure",
+        "join-direction",
+        "physical-op",
+        "card-est",
+        "cost-est",
+        "actual-card",
+    ]
+    left_state: frozenset[TableReference] | PhysicalOperator | float
+    right_state: frozenset[TableReference] | PhysicalOperator | float
+    context: Optional[frozenset[TableReference]] = None
+
+    def inspect(self) -> str:
+        """Provides a human-readable string of the diff.
+
+        Returns
+        -------
+        str
+            The diff
+        """
+        match self.change_type:
+            case "tree-structure":
+                left_str = [tab.identifier() for tab in self.left_state]
+                right_str = [tab.identifier() for tab in self.right_state]
+                return f"Different subtrees: left={left_str} right={right_str}"
+            case "join-direction":
+                left_str = [tab.identifier() for tab in self.left_state]
+                right_str = [tab.identifier() for tab in self.right_state]
+                return f"Swapped join direction: left={left_str} right={right_str}"
+            case "physical-op":
+                return f"Different physical operators on node {self.context}: left={self.left_state} right={self.right_state}"
+            case "card-est":
+                return (
+                    f"Different cardinality estimates on node {self.context}: "
+                    f"left={self.left_state} right={self.right_state}"
+                )
+            case "cost-est":
+                return (
+                    f"Different cost estimates on node {self.context}: "
+                    f"left={self.left_state} right={self.right_state}"
+                )
+            case "actual-card":
+                return (
+                    f"Different actual cardinality on node {self.context}: "
+                    f"left={self.left_state} right={self.right_state}"
+                )
+            case _:
+                raise StateError(f"Unknown change type '{self.change_type}'")
+
+
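Note: a small example of how a single entry renders via `inspect` (here `tabs` stands in for a frozenset of `TableReference`s):

    entry = PlanChangeEntry(
        "card-est", left_state=1000.0, right_state=250.0, context=tabs
    )
    print(entry.inspect())
    # Different cardinality estimates on node frozenset({...}): left=1000.0 right=250.0
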
+@dataclass
+class PlanChangeset:
+    """Captures an arbitrary number of join tree diffs.
+
+    Attributes
+    ----------
+    changes : Collection[PlanChangeEntry]
+        The diffs
+    """
+
+    changes: Collection[PlanChangeEntry]
+
+    def inspect(self) -> str:
+        """Provides a human-readable string of the entire diff.
+
+        The diff will typically contain newlines to separate individual entries.
+
+        Returns
+        -------
+        str
+            The diff
+        """
+        return "\n".join(entry.inspect() for entry in self.changes)
+
+
+def compare_query_plans(left: QueryPlan, right: QueryPlan) -> PlanChangeset:
+    """Computes differences between two query execution plans.
+
+    Parameters
+    ----------
+    left : QueryPlan
+        The first plan to compare
+    right : QueryPlan
+        The second plan to compare
+
+    Returns
+    -------
+    PlanChangeset
+        A diff between the two query plans
+    """
+    # FIXME: query plans might contain auxiliary nodes that are currently not handled/recognized
+    if left.find_first_node(lambda node: node.is_auxiliary()) or right.find_first_node(
+        lambda node: node.is_auxiliary()
+    ):
+        raise ValueError(
+            "Comparison of query plans with auxiliary (i.e. non-join and non-scan) operators "
+            "is currently not supported"
+        )
+
+    if left.tables() != right.tables():
+        changeset = [
+            PlanChangeEntry(
+                "tree-structure", left_state=left.tables(), right_state=right.tables()
+            )
+        ]
+        return PlanChangeset(changeset)
+
+    changes: list[PlanChangeEntry] = []
+
+    left_card_est, right_card_est = (
+        left.estimated_cardinality,
+        right.estimated_cardinality,
+    )
+    left_card_actual, right_card_actual = (
+        left.actual_cardinality,
+        right.actual_cardinality,
+    )
+    left_cost, right_cost = left.estimated_cost, right.estimated_cost
+    if left_card_est != right_card_est and not (
+        math.isnan(left_card_est) and math.isnan(right_card_est)
+    ):
+        changes.append(
+            PlanChangeEntry(
+                "card-est",
+                left_state=left_card_est,
+                right_state=right_card_est,
+                context=left.tables(),
+            )
+        )
+    if left_card_actual != right_card_actual and not (
+        math.isnan(left_card_actual) and math.isnan(right_card_actual)
+    ):
+        changes.append(
+            PlanChangeEntry(
+                "actual-card",
+                left_state=left_card_actual,
+                right_state=right_card_actual,
+                context=left.tables(),
+            )
+        )
+    if left_cost != right_cost and not (
+        math.isnan(left_cost) and math.isnan(right_cost)
+    ):
+        changes.append(
+            PlanChangeEntry(
+                "cost-est",
+                left_state=left_cost,
+                right_state=right_cost,
+                context=left.tables(),
+            )
+        )
+
+    left_op, right_op = left.node_type, right.node_type
+    if left_op != right_op:
+        changes.append(
+            PlanChangeEntry(
+                "physical-op",
+                left_state=left_op,
+                right_state=right_op,
+                context=left.tables(),
+            )
+        )
+
+    if left.is_join():
+        # we can also assume that right is an intermediate node since we know both nodes have the same tables and the
+        # left tree is an intermediate node
+
+        join_direction_swap = left.inner_child.tables() == right.outer_child.tables()
+        if join_direction_swap:
+            changes.append(
+                PlanChangeEntry("join-direction", left_state=left, right_state=right)
+            )
+            changes.extend(
+                compare_query_plans(left.inner_child, right.outer_child).changes
+            )
+            changes.extend(
+                compare_query_plans(left.outer_child, right.inner_child).changes
+            )
+        else:
+            changes.extend(
+                compare_query_plans(left.inner_child, right.inner_child).changes
+            )
+            changes.extend(
+                compare_query_plans(left.outer_child, right.outer_child).changes
+            )
+
+    return PlanChangeset(changes)
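Note: a hypothetical end-to-end sketch that diffs two plans for the same query; the calls producing the plans are assumptions for illustration:

    # native_plan = db.optimizer().query_plan(query)          # hypothetical
    # hinted_plan = db.optimizer().query_plan(hinted_query)   # hypothetical
    # changeset = compare_query_plans(native_plan, hinted_plan)
    # print(changeset.inspect())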