PostBOUND 0.19.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- postbound/__init__.py +211 -0
- postbound/_base.py +6 -0
- postbound/_bench.py +1012 -0
- postbound/_core.py +1153 -0
- postbound/_hints.py +1373 -0
- postbound/_jointree.py +1079 -0
- postbound/_pipelines.py +1121 -0
- postbound/_qep.py +1986 -0
- postbound/_stages.py +876 -0
- postbound/_validation.py +734 -0
- postbound/db/__init__.py +72 -0
- postbound/db/_db.py +2348 -0
- postbound/db/_duckdb.py +785 -0
- postbound/db/mysql.py +1195 -0
- postbound/db/postgres.py +4216 -0
- postbound/experiments/__init__.py +12 -0
- postbound/experiments/analysis.py +674 -0
- postbound/experiments/benchmarking.py +54 -0
- postbound/experiments/ceb.py +877 -0
- postbound/experiments/interactive.py +105 -0
- postbound/experiments/querygen.py +334 -0
- postbound/experiments/workloads.py +980 -0
- postbound/optimizer/__init__.py +92 -0
- postbound/optimizer/__init__.pyi +73 -0
- postbound/optimizer/_cardinalities.py +369 -0
- postbound/optimizer/_joingraph.py +1150 -0
- postbound/optimizer/dynprog.py +1825 -0
- postbound/optimizer/enumeration.py +432 -0
- postbound/optimizer/native.py +539 -0
- postbound/optimizer/noopt.py +54 -0
- postbound/optimizer/presets.py +147 -0
- postbound/optimizer/randomized.py +650 -0
- postbound/optimizer/tonic.py +1479 -0
- postbound/optimizer/ues.py +1607 -0
- postbound/qal/__init__.py +343 -0
- postbound/qal/_qal.py +9678 -0
- postbound/qal/formatter.py +1089 -0
- postbound/qal/parser.py +2344 -0
- postbound/qal/relalg.py +4257 -0
- postbound/qal/transform.py +2184 -0
- postbound/shortcuts.py +70 -0
- postbound/util/__init__.py +46 -0
- postbound/util/_errors.py +33 -0
- postbound/util/collections.py +490 -0
- postbound/util/dataframe.py +71 -0
- postbound/util/dicts.py +330 -0
- postbound/util/jsonize.py +68 -0
- postbound/util/logging.py +106 -0
- postbound/util/misc.py +168 -0
- postbound/util/networkx.py +401 -0
- postbound/util/numbers.py +438 -0
- postbound/util/proc.py +107 -0
- postbound/util/stats.py +37 -0
- postbound/util/system.py +48 -0
- postbound/util/typing.py +35 -0
- postbound/vis/__init__.py +5 -0
- postbound/vis/fdl.py +69 -0
- postbound/vis/graphs.py +48 -0
- postbound/vis/optimizer.py +538 -0
- postbound/vis/plots.py +84 -0
- postbound/vis/tonic.py +70 -0
- postbound/vis/trees.py +105 -0
- postbound-0.19.0.dist-info/METADATA +355 -0
- postbound-0.19.0.dist-info/RECORD +67 -0
- postbound-0.19.0.dist-info/WHEEL +5 -0
- postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
- postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/_qep.py
ADDED
@@ -0,0 +1,1986 @@
from __future__ import annotations

import collections
import copy
import math
from collections.abc import Callable, Iterable, Iterator, Sequence
from dataclasses import dataclass
from numbers import Number
from typing import Any, Literal, Optional

from . import util
from ._core import (
    Cardinality,
    ColumnReference,
    Cost,
    JoinOperator,
    PhysicalOperator,
    ScanOperator,
    TableReference,
)
from .qal._qal import AbstractPredicate, ColumnExpression, SqlExpression
from .util import StateError, jsondict

JoinDirection = Literal["inner", "outer"]


class SortKey:
    """Sort keys describe how the tuples in a relation are sorted.

    Each sort key contains a set of columns that describe the equivalence class of the sort key, i.e. the column values in each
    row are all equal to one another. Therefore, the relation can be treated as being sorted by any of them.

    Most commonly, relations will only be sorted by a single column (which can be checked by calling *len()* on the sort key,
    or by checking the `equivalence_class` directly). In this case, the `column` property can be used to retrieve the
    corresponding expression that forms the column.

    To check whether two sort keys are equivalent, the `is_compatible_with` method can be used. For more idiomatic access,
    ``column in sort_key`` is also supported.

    To create a new equivalence class, the `for_equivalence_class(columns)` method is available to create a new sort key from
    scratch. To combine two existing sort keys, `merge_with` can be used.

    Parameters
    ----------
    columns : Iterable[SqlExpression]
        The column(s) used to sort the tuples. This will usually contain plain column references (`ColumnExpression`),
        but can also use more complex expressions.
    ascending : bool
        Whether the sorting is ascending or descending. Defaults to ascending.
    """

    @staticmethod
    def of(
        column: SqlExpression | ColumnReference, *, ascending: bool = True
    ) -> SortKey:
        """Creates a new sort key for a single column.

        Parameters
        ----------
        column : SqlExpression | ColumnReference
            The column that is used to sort the tuples. Can be a plain column reference, which will be wrapped in a
            `ColumnExpression` automatically.
        ascending : bool, optional
            Whether the sorting is ascending or descending. Defaults to ascending.

        Returns
        -------
        SortKey
            The sort key with an equivalence class for the single column.
        """
        if isinstance(column, ColumnReference):
            column = ColumnExpression(column)
        return SortKey([column], ascending=ascending)

    @staticmethod
    def for_equivalence_class(
        members: Iterable[SqlExpression | ColumnReference], *, ascending: bool = True
    ) -> SortKey:
        """Creates a new sort key for an equivalence class of columns.

        This is just a more expressive alias for calling the constructor directly. This method assumes that the values for
        all columns in the equivalence class are equal to one another. The client is responsible for ensuring and checking
        that this is actually the case.

        Parameters
        ----------
        members : Iterable[SqlExpression | ColumnReference]
            The columns that describe the sorting of the relation. This can contain just a single item, in which case the
            method is pretty much the same as `of`. Any passed `ColumnReference` will be wrapped in a `ColumnExpression`.
        ascending : bool, optional
            Whether the sorting is ascending or descending. Defaults to ascending.

        Returns
        -------
        SortKey
            The sort key with an equivalence class for the columns.
        """
        members = [
            ColumnExpression(mem) if isinstance(mem, ColumnReference) else mem
            for mem in members
        ]
        return SortKey(members, ascending=ascending)

    def __init__(
        self, columns: Iterable[SqlExpression], *, ascending: bool = True
    ) -> None:
        self._members = frozenset(columns)
        if not self._members:
            raise ValueError("Sort key must contain at least one column")
        self._ascending = ascending

    __match_args__ = ("equivalence_class", "ascending")

    @property
    def column(self) -> SqlExpression:
        """For single-column sort keys, get this column."""
        if len(self._members) != 1:
            raise StateError("Sort key is not a single column reference")
        return next(iter(self._members))

    @property
    def equivalence_class(self) -> frozenset[SqlExpression]:
        """Get all columns that are part of the equivalence class. This will be 1 or more columns."""
        return self._members

    @property
    def ascending(self) -> bool:
        """Get the sort direction of this key."""
        return self._ascending

    def is_compatible_with(self, other: SortKey | ColumnReference) -> bool:
        """Checks whether two keys are sorted the same way.

        For single column references, this essentially checks whether the column is part of the key's equivalence class.
        """
        if isinstance(other, ColumnReference):
            return other in self._members

        if self.ascending != other.ascending:
            return False
        return len(self._members & other._members) > 0

    def merge_with(self, other: SortKey) -> SortKey:
        """Merges the equivalence classes of two sort keys."""
        if self.ascending != other.ascending:
            raise ValueError("Cannot merge sort keys with different sort orders")
        return SortKey(self._members | other._members, ascending=self.ascending)

    def __json__(self) -> jsondict:
        return {"equivalence_class": self._members, "ascending": self._ascending}

    def __len__(self) -> int:
        return len(self._members)

    def __contains__(self, item: object) -> bool:
        return (
            self.is_compatible_with(item)
            if isinstance(item, (ColumnReference, SortKey))
            else False
        )

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, type(self))
            and self._members == other._members
            and self._ascending == other._ascending
        )

    def __hash__(self) -> int:
        return hash((self._members, self._ascending))

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        suffix = "" if self.ascending else " DESC"
        if len(self._members) == 1:
            member = str(self.column)
        else:
            members = ", ".join(str(m) for m in self._members)
            member = f"{{{members}}}"
        return f"{member}{suffix}"
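
# Usage sketch (illustrative comments, not part of the packaged file). It exercises the SortKey
# API defined above; `col_a` and `col_b` stand for two distinct ColumnReference instances
# obtained elsewhere (e.g. from a parsed query) and are assumptions of this sketch.
#
#     key_a = SortKey.of(col_a)                # single-column key, ascending by default
#     key_b = SortKey.of(col_b)
#     merged = key_a.merge_with(key_b)         # equivalence class containing both columns
#     merged.is_compatible_with(key_a)         # True: same direction, overlapping members
#     len(merged)                              # size of the equivalence class (here 2)
#     str(SortKey.of(col_a, ascending=False))  # renders with a trailing " DESC"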


class PlanParams:
    """Plan parameters contain additional "structural" metadata about the operators in a query plan.

    This information is mostly concerned with how the operator should function, e.g. which table it should scan, which index
    to use, or how the tuples will be sorted.

    In addition to the pre-defined attributes, users can attach arbitrary metadata using a dict-like access into the
    parameters, e.g. ``params["custom"] = 42``.

    Parameters
    ----------
    base_table : Optional[TableReference], optional
        For scan nodes, this is the table being scanned. For all other nodes this should be *None*.
    filter_predicate : Optional[AbstractPredicate], optional
        An arbitrary predicate to restrict the allowed tuples in the output of a relation. This should be mostly used for
        join nodes and scans.
    sort_keys : Optional[Sequence[SortKey]], optional
        How the tuples in the output of a relation are sorted. Absence of a specific sort order can be indicated either
        through an empty list or by setting this parameter to *None*. In this case, tuples are assumed to be in some random
        order.
    parallel_workers : Optional[int], optional
        The number of parallel workers that should be used to execute the operator. The underlying processing model assumes
        that there exists some sort of main operator process which spawns additional worker processes. The worker processes
        will compute the output relation together with the main process. Hence, if some relation should be processed by two
        processes in parallel, the proper value for this parameter would be 1 (the main process and one additional worker).
        It is up to the actual execution engine to decide whether a lower number of workers has to be used.
    index : Optional[str], optional
        The name of the index that should be used to scan the table. This is only relevant for scan nodes and should be
        *None* for all other nodes.
    lookup_key : Optional[SqlExpression], optional
        The expression that is used to look up tuples in some indexing structure. For scans, this could actually be the
        physical index. For intermediate operators such as hash tables or memoize nodes, this could be the expression that is
        used to build the table or to structure the memo.
    **kwargs
        Additional metadata that should be attached to the plan parameters.
    """

    def __init__(
        self,
        *,
        base_table: Optional[TableReference] = None,
        filter_predicate: Optional[AbstractPredicate] = None,
        sort_keys: Optional[Sequence[SortKey]] = None,
        parallel_workers: Optional[int] = None,
        index: Optional[str] = None,
        lookup_key: Optional[SqlExpression] = None,
        **kwargs,
    ) -> None:
        self._params: dict[str, Any] = {
            "base_table": base_table,
            "filter_predicate": filter_predicate,
            "sort_keys": tuple(sort_keys) if sort_keys else tuple(),
            "parallel_workers": parallel_workers if parallel_workers else 0,
            "index": index if index else "",
            "lookup_key": lookup_key if lookup_key else None,
            **kwargs,
        }

    @property
    def base_table(self) -> Optional[TableReference]:
        """Get the base table that is being scanned. For non-scan nodes, this is *None*."""
        return self._params["base_table"]

    @property
    def filter_predicate(self) -> Optional[AbstractPredicate]:
        """Get the filter predicate that is used to restrict the tuples in the output of a relation.

        For join nodes this would be the join condition and for scan nodes this would be the filter conditions from the
        WHERE clause. However, if the optimizer decides to delay the evaluation of some filter, or some filters need to be
        evaluated multiple times (e.g. recheck conditions in Postgres), this predicate can be more complex.
        """
        return self._params["filter_predicate"]

    @property
    def sort_keys(self) -> Sequence[SortKey]:
        """Get the sort keys describing the ordering of tuples in the output relation.

        Absence of a specific sort order is indicated by an empty sequence.
        """
        return self._params["sort_keys"]

    @property
    def parallel_workers(self) -> int:
        """Get the number of parallel workers that should be used to execute the operator.

        The underlying processing model assumes that there exists some sort of main operator process which spawns additional
        worker processes. The worker processes will compute the output relation together with the main process. Hence, if some
        relation should be processed by two processes in parallel, the proper value for this parameter would be 1 (the main
        process and one additional worker).

        It is up to the actual execution engine to decide whether a lower number of workers has to be used.

        Absence of parallelism is indicated by 0.
        """
        return self._params["parallel_workers"]

    @property
    def index(self) -> str:
        """Get the name of the index that should be used to scan the table.

        Absence of an index is indicated by an empty string.
        """
        return self._params["index"]

    @property
    def lookup_key(self) -> Optional[SqlExpression]:
        """Get the expression that is used to look up tuples in some indexing structure.

        For scans, this could actually be the physical index. In this case, the lookup expression should be the one that is
        used to build the index, e.g., the primary key column. For intermediate operators such as hash tables or memoize
        nodes, this could be the expression that is used to build the table or to structure the memo.
        """
        return self._params["lookup_key"]

    def tables(self) -> set[TableReference]:
        """Provide all tables that are referenced at some point in the plan parameters.

        This includes only the well-defined properties available to all parameterizations, i.e. `base_table` and
        `filter_predicate`. If users decide to store additional metadata with table information in the parameters, these are
        not retained here.

        Returns
        -------
        set[TableReference]
            The tables
        """
        tables = set()
        if self.base_table:
            tables.add(self.base_table)
        if self.filter_predicate:
            tables |= self.filter_predicate.tables()
        if self.lookup_key:
            tables |= self.lookup_key.tables()
        return tables

    def columns(self) -> set[ColumnReference]:
        """Provides all columns that are referenced at some point in the plan parameters.

        This includes only the well-defined properties available to all parameterizations, i.e. just the `filter_predicate`. If
        users decide to store additional metadata with column information in the parameters, these are not retained here.

        Returns
        -------
        set[ColumnReference]
            The columns
        """
        return self.filter_predicate.columns() if self.filter_predicate else set()

    def get(self, key: str, default: Any = None) -> Any:
        """Retrieves the value of a specific key from the parameters.

        This is similar to the *dict.get* method. An important distinction is that we never raise an error if there is no
        parameter with the given key. Instead, we return the default value, which is *None* by default.

        Parameters
        ----------
        key : str
            The parameter name
        default : Any, optional
            The default value to return if the parameter is not found. Defaults to *None*.

        Returns
        -------
        Any
            The parameter value if it exists, otherwise the default value.
        """
        value = self._params.get(key, default)
        if isinstance(value, float) and math.isnan(value):
            return default
        return value

    def items(self) -> Iterable[tuple[str, Any]]:
        """Provides all metadata that is currently stored in the parameters as key-value pairs, similar to *dict.items*."""
        return self._params.items()

    def clone(self, *, deep: bool = False) -> PlanParams:
        """Creates a copy of the current plan parameters.

        Parameters
        ----------
        deep : bool, optional
            Whether to create a deep copy of all parameters. Defaults to *False*.

        Returns
        -------
        PlanParams
            The copied parameters.
        """
        return self.__deepcopy__({}) if deep else self.__copy__()

    def __json__(self) -> jsondict:
        return self._params

    def __copy__(self) -> PlanParams:
        return PlanParams(**self._params)

    def __deepcopy__(self, memo: dict[int, object] = {}) -> PlanParams:
        params = copy.deepcopy(self._params, memo)
        return PlanParams(**params)

    def __contains__(self, key: object) -> bool:
        params = object.__getattribute__(self, "_params")
        return key in params

    def __getattribute__(self, name: str) -> Any:
        params = object.__getattribute__(self, "_params")
        if name == "_params":
            return params
        if name in params:
            return params[name]
        return object.__getattribute__(self, name)

    def __setattr__(self, name, value) -> None:
        if name == "_params":
            return object.__setattr__(self, name, value)
        params = object.__getattribute__(self, "_params")
        params[name] = value

    def __getitem__(self, key: str) -> Any:
        return self._params[key]

    def __setitem__(self, key: str, value: Any) -> None:
        self._params[key] = value

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return str(self._params)
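
# Usage sketch (illustrative comments, not part of the packaged file). PlanParams doubles as a
# small metadata bag, so custom entries live next to the well-known attributes shown above; the
# key and value names below are made up for the example.
#
#     params = PlanParams(parallel_workers=4, index="title_pkey")
#     params["custom_hint"] = 42            # arbitrary metadata via __setitem__
#     params.custom_hint                    # attribute access resolves against the same dict
#     params.get("missing", "fallback")     # never raises; returns the default instead
#     "custom_hint" in params               # True, via __contains__
#     snapshot = params.clone(deep=True)    # deep copy of all stored entries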


class PlanEstimates:
    """Plan estimates provide the optimizer's view on a specific (sub-)plan.

    This includes the estimated cardinality and cost of the plan. The cardinality is the number of tuples that are expected to
    be produced by the operator, while the cost is a measure of the resources that are consumed by the operator.
    Costs do not have a specific unit and it is the user's obligation to ensure that they are used in a sound way. Most
    importantly, this means that only costs from the same source should be compared since most database systems interpret costs
    in a different way.

    In addition to the pre-defined attributes, users can attach arbitrary metadata using a dict-like access into the
    parameters, e.g. ``estimates["custom"] = 42``.

    Parameters
    ----------
    cardinality : Cardinality, optional
        The estimated number of tuples that are produced by the operator. If no estimate is available, *NaN* can be used.
    cost : Cost, optional
        The approximate amount of abstract "work" that needs to be done to compute the result set of the operator. If no
        estimate is available, *NaN* can be used.
    **kwargs
        Additional metadata that should be attached to the plan estimates.

    Notes
    -----
    In case of parallel execution, all measures should be thought of as "meaningful totals", i.e. the cardinality
    numbers are the total number of tuples produced by all workers. The execution time should denote the wall time it
    took to execute the entire operator (which just happened to include parallel processing), **not** an average of the
    worker execution time or some other measure.
    """

    def __init__(
        self,
        *,
        cardinality: Cardinality = Cardinality.unknown(),
        cost: Cost = math.nan,
        **kwargs,
    ) -> None:
        cardinality = (
            cardinality
            if isinstance(cardinality, Cardinality)
            else Cardinality(cardinality)
        )
        self._params = {"cardinality": cardinality, "cost": cost, **kwargs}

    @property
    def cardinality(self) -> Cardinality:
        """Get the estimated cardinality of the operator. Can be *NaN* if no estimate is available."""
        return self._params["cardinality"]

    @property
    def cost(self) -> Cost:
        """Get the estimated cost of the operator. Can be *NaN* if no estimate is available."""
        return self._params["cost"]

    def get(self, key: str, default: Any = None) -> Any:
        """Retrieves the value of a specific key from the estimates.

        This is similar to the *dict.get* method. An important distinction is that we never raise an error if there is no
        parameter with the given key. Instead, we return the default value, which is *None* by default.

        Parameters
        ----------
        key : str
            The parameter name
        default : Any, optional
            The default value to return if the parameter is not found. Defaults to *None*.

        Returns
        -------
        Any
            The parameter value if it exists, otherwise the default value.
        """
        value = self._params.get(key, default)
        if isinstance(value, float) and math.isnan(value):
            return default
        return value

    def items(self) -> Iterable[tuple[str, Any]]:
        """Provides all estimates as key-value pairs, similar to the *dict.items* method."""
        return self._params.items()

    def clone(self, *, deep: bool = False) -> PlanEstimates:
        """Creates a copy of the current plan estimates.

        Parameters
        ----------
        deep : bool, optional
            Whether to create a deep copy of all estimates. Defaults to *False*.

        Returns
        -------
        PlanEstimates
            The copied estimates.
        """
        return self.__deepcopy__({}) if deep else self.__copy__()

    def __json__(self) -> jsondict:
        return self._params

    def __copy__(self) -> PlanEstimates:
        return PlanEstimates(**self._params)

    def __deepcopy__(self, memo: dict[int, object] = {}) -> PlanEstimates:
        params = copy.deepcopy(self._params, memo)
        return PlanEstimates(**params)

    def __contains__(self, key: object) -> bool:
        params = object.__getattribute__(self, "_params")
        return key in params

    def __getattribute__(self, name: str) -> Any:
        params = object.__getattribute__(self, "_params")
        if name == "_params":
            return params
        if name in params:
            return params[name]
        return object.__getattribute__(self, name)

    def __setattr__(self, name, value) -> None:
        if name == "_params":
            return object.__setattr__(self, name, value)
        params = object.__getattribute__(self, "_params")
        params[name] = value

    def __getitem__(self, key: str) -> Any:
        return self._params[key]

    def __setitem__(self, key: str, value: Any) -> None:
        self._params[key] = value

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return str(self._params)


class PlanMeasures:
    """Plan measures provide actual execution statistics of a specific (sub-)plan.

    Typically, this includes the actual cardinality of the result set as well as the execution time of the operator.
    Additionally, information about cache hits and misses for the shared buffer can be provided.

    Other than the pre-defined attributes, users can attach arbitrary metadata using a dict-like access into the parameters,
    e.g. ``measures["custom"] = 42``.

    Parameters
    ----------
    cardinality : Cardinality, optional
        The actual number of tuples that are produced by the operator. If no measurement is available, *NaN* can be used.
    execution_time : float, optional
        The total time (in seconds) that was spent to compute the result set of the operator. If no measurement is available,
        *NaN* can be used.
    cache_hits : Optional[int], optional
        The number of page reads that were satisfied by the shared buffer. If no measurement is available, *None* can be
        used.
    cache_misses : Optional[int], optional
        The number of page reads that had to be delegated to the disk and could not be satisfied by the shared buffer. If no
        measurement is available, *None* can be used.
    **kwargs
        Additional metadata that should be attached to the plan measures.

    Notes
    -----
    In case of parallel execution, all measures should be thought of as "meaningful totals", i.e. the cardinality
    numbers are the total number of tuples produced by all workers. The execution time should denote the wall time it
    took to execute the entire operator (which just happened to include parallel processing), **not** an average of the
    worker execution time or some other measure.
    """

    def __init__(
        self,
        *,
        cardinality: Cardinality = Cardinality.unknown(),
        execution_time: float = math.nan,
        cache_hits: Optional[int] = None,
        cache_misses: Optional[int] = None,
        **kwargs,
    ) -> None:
        cardinality = (
            cardinality
            if isinstance(cardinality, Cardinality)
            else Cardinality(cardinality)
        )
        self._params = {
            "cardinality": cardinality,
            "execution_time": execution_time,
            "cache_hits": cache_hits,
            "cache_misses": cache_misses,
            **kwargs,
        }

    @property
    def cardinality(self) -> Cardinality:
        """Get the actual cardinality of the operator. Can be *NaN* if no measurement is available."""
        return self._params["cardinality"]

    @property
    def execution_time(self) -> float:
        """Get the actual execution time of the operator. Can be *NaN* if no measurement is available."""
        return self._params["execution_time"]

    @property
    def cache_hits(self) -> Optional[int]:
        """Get the number of page reads that were satisfied by the shared buffer.

        If no measurement is available, *None* is returned.
        """
        return self._params["cache_hits"]

    @property
    def cache_misses(self) -> Optional[int]:
        """Get the number of page reads that had to be delegated to the disk and could not be satisfied by the shared buffer.

        If no measurement is available, *None* is returned.
        """
        return self._params["cache_misses"]

    def get(self, key: str, default: Any = None) -> Any:
        """Retrieves the value of a specific key from the measures.

        This is similar to the *dict.get* method. An important distinction is that we never raise an error if there is no
        parameter with the given key. Instead, we return the default value, which is *None* by default.

        Parameters
        ----------
        key : str
            The parameter name
        default : Any, optional
            The default value to return if the parameter is not found. Defaults to *None*.

        Returns
        -------
        Any
            The parameter value if it exists, otherwise the default value.
        """
        value = self._params.get(key, default)
        if isinstance(value, float) and math.isnan(value):
            return default
        return value

    def items(self) -> Iterable[tuple[str, Any]]:
        """Provides all measures as key-value pairs, similar to the *dict.items* method."""
        return self._params.items()

    def clone(self, *, deep: bool = False) -> PlanMeasures:
        """Creates a copy of the current plan measures.

        Parameters
        ----------
        deep : bool, optional
            Whether to create a deep copy of all measures. Defaults to *False*.

        Returns
        -------
        PlanMeasures
            The copied measures.
        """
        return self.__deepcopy__({}) if deep else self.__copy__()

    def __json__(self) -> jsondict:
        return self._params

    def __copy__(self) -> PlanMeasures:
        return PlanMeasures(**self._params)

    def __deepcopy__(self, memo: dict[int, object] = {}) -> PlanMeasures:
        params = copy.deepcopy(self._params, memo)
        return PlanMeasures(**params)

    def __contains__(self, key: object) -> bool:
        params = object.__getattribute__(self, "_params")
        return key in params

    def __getattribute__(self, name: str) -> Any:
        params = object.__getattribute__(self, "_params")
        if name == "_params":
            return params
        if name in params:
            return params[name]
        return object.__getattribute__(self, name)

    def __setattr__(self, name, value) -> None:
        if name == "_params":
            return object.__setattr__(self, name, value)
        params = object.__getattribute__(self, "_params")
        params[name] = value

    def __getitem__(self, key: str) -> Any:
        return self._params[key]

    def __setitem__(self, key: str, value: Any) -> None:
        self._params[key] = value

    def __bool__(self) -> bool:
        return any(
            not math.isnan(v) if isinstance(v, Number) else (v is not None)
            for v in self._params.values()
        )

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return str(self._params)
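
# Usage sketch (illustrative comments, not part of the packaged file): recording runtime feedback
# for an operator. Plain numbers passed as `cardinality` are wrapped into a Cardinality instance
# by the constructor above; the extra "buffers_written" key is an invented example entry.
#
#     measures = PlanMeasures(cardinality=1_000, execution_time=0.42, cache_hits=350)
#     measures.cardinality               # wrapped into a Cardinality by __init__
#     measures.execution_time            # 0.42 (wall-clock seconds)
#     measures.cache_misses              # None -- not recorded
#     measures["buffers_written"] = 12   # arbitrary extra metadata, as with PlanParams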


@dataclass(frozen=True)
class Subplan:
    """Subplans are used to model subqueries whose results are used while processing another operator in the main query.

    Typical examples are correlated/dependent subqueries that are used in some predicate and need to be evaluated for each
    tuple of the outer relation (unless some algebraic optimization has been applied beforehand).

    Attributes
    ----------
    root : QueryPlan
        The root operator of the subplan
    target_name : str
        The name of the target table that the subplan should produce
    """

    root: QueryPlan
    target_name: str = ""

    def tables(self) -> set[TableReference]:
        """Provide all tables that are referenced at some point in the subplan.

        Returns
        -------
        set[TableReference]
            The tables. This set includes the target table that the subplan produces as a virtual table.
        """
        if not self.target_name:
            return self.root.tables()
        target_table = TableReference.create_virtual(self.target_name)
        return self.root.tables() | {target_table}

    def clone(self, *, deep: bool = False) -> Subplan:
        """Creates a copy of the current subplan.

        Parameters
        ----------
        deep : bool, optional
            Whether to create a deep copy of all contained plans. Defaults to *False*.

        Returns
        -------
        Subplan
            The copied subplan.
        """
        return self.__deepcopy__({}) if deep else self.__copy__()

    def __json__(self) -> jsondict:
        return {"root": self.root, "target_name": self.target_name}

    def __copy__(self) -> Subplan:
        return Subplan(self.root.clone(deep=False), self.target_name)

    def __deepcopy__(self, memo: dict[int, object] = {}) -> Subplan:
        return Subplan(self.root.clone(deep=True), self.target_name)
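
# Usage sketch (illustrative comments, not part of the packaged file): wrapping an already
# constructed plan as a dependent subquery. `inner_plan` is an assumed QueryPlan built elsewhere,
# and "subquery_1" is an arbitrary target name.
#
#     sub = Subplan(root=inner_plan, target_name="subquery_1")
#     sub.tables()           # tables of inner_plan plus a virtual table named "subquery_1"
#     sub.clone(deep=True)   # also deep-copies the wrapped plan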


class QueryPlan:
    """Models the structure of a query execution plan (QEP).

    Query plans are constructed as a tree of operators. Each operator represents an entire query plan by itself. Hence, we
    use *QueryPlan* to refer to the actual nodes in a hierarchical structure. Each node has a potentially large amount of
    metadata attached to it, e.g. regarding the table being scanned for scan nodes, the estimated cost of the operator or the
    actual cardinality of the result set. The different types of metadata are structured into three separate classes:

    - `PlanParams` contain all structural metadata about the operator, e.g. the table being scanned or the filter predicate.
    - `PlanEstimates` contain the optimizer's view on the operator, e.g. the estimated cardinality and cost.
    - `PlanMeasures` contain the actual execution statistics of the operator, e.g. the actual cardinality and execution time.

    Users are free to attach additional metadata to each of the containers to support their specific use cases. However, these
    additional fields are typically not considered by the standard methods available on query plans. For example, if users
    store additional tables in the node, these are not considered in the `tables` method.

    Each query plan can contain an arbitrary number of child nodes. This is true even for scans, to accommodate bitmap scans
    that combine an arbitrary number of index lookups with a final scan. If just a single child is present, it can be set more
    expressively using the `input_node` property.

    PostBOUND uses QEPs in two different ways: first, they can be used as the output of the optimization process (i.e. the
    optimization pipelines), being constructed by the different optimization stages. Second, they can also be extracted from
    an actual database system to encode the QEP that this system used to execute a specific query. This dichotomy leads to
    different granularities of query plans: actual database systems often have much more detailed QEPs. For example, Postgres
    represents a hash join as a hash join operator, whose inner child is a hash operator that constructs the hash table.
    The optimizer stages will typically not worry about such fine-grained details and simply demand a join to be executed as
    a hash join. To mitigate these issues, the query plans can be normalized by using the `canonical` method. This method
    removes all unnecessary details and only retains the join and scan operators.

    When constructing a query plan, the metadata can be provided in two ways: either as instances of the corresponding metadata
    objects, or explicitly as keyword arguments to enable a more convenient usage. Note, however, that these two ways cannot
    be mixed: either all metadata of a specific type is provided as a wrapper instance, or all metadata is provided as keyword
    arguments. Mixing is only allowed across different metadata types, e.g. providing the estimates as a `PlanEstimates` object
    and the measurements as keyword arguments.

    In addition to the pre-defined metadata types, you can also add additional metadata as part of the *kwargs*. These will
    be added to the plan parameters (using the same mixing rules as the pre-defined types).
    Each query plan provides dict-like access to the plan parameters, estimates and measures, e.g. ``plan["custom"] = 42``,
    ``plan.get("custom", default)``, or ``"custom" in plan``.

    Query plans provide rather extensive support methods to check their shape (e.g. `is_linear()` or `is_bushy()`), to aid with
    traversal (e.g. `find_first_node()` or `find_all_nodes()`) or to extract specific information (e.g. `tables()` or
    `qerror()`).

    To convert between different optimization artifacts, a number of methods are available. For example, `to_query_plan` can
    be used to construct a query plan from a join order and a set of operators. Likewise, `explode_query_plan` converts the
    query plan back into join order, operators and parameters.

    Query plans support *len()* (providing the plan depth without subplans) and *iter()* (providing all contained nodes
    including subplans).

    Parameters
    ----------
    node_type : str | PhysicalOperator
        The name of the operator. If this is supplied as a physical operator, the name is inferred from it.
    operator : Optional[PhysicalOperator], optional
        The actual operator that is used to compute the result set. This can be empty if there is no specific operator
        corresponding to the current node (e.g. for transient hash tables).
    children : Optional[QueryPlan | Iterable[QueryPlan]], optional
        The input nodes of the current operator. For nodes without an input (e.g. most scans), this can simply be *None* or
        an empty list. Nodes with exactly one input node (e.g. most aggregations) can supply their input either directly as
        a plan object, or as a singleton list. Nodes with two input nodes (e.g. joins) should supply them as an ordered
        iterable with the outer child first.
    plan_params : Optional[PlanParams], optional
        Structural metadata (e.g. parallel workers or accessed indexes) of the operator. If this is provided, no other
        plan parameters can be supplied as keyword arguments, including kwargs.
    subplan : Optional[Subplan], optional
        A subquery that has to be executed as part of this node. If this is provided, no other subplan components can be
        supplied as keyword arguments.
    estimates : Optional[PlanEstimates], optional
        The optimizer's view on the operator (e.g. estimated cardinality and cost). If this is provided, no other estimates
        can be supplied as keyword arguments.
    measures : Optional[PlanMeasures], optional
        The actual execution statistics of the operator (e.g. actual cardinality and execution time). If this is provided, no
        other measures can be supplied as keyword arguments.
    base_table : Optional[TableReference], optional
        The table that is being scanned. This is only relevant for scan nodes and should be *None* for all other nodes.
        If this argument is used, no other plan parameters can be supplied in the `plan_params` argument.
    filter_predicate : Optional[AbstractPredicate], optional
        An arbitrary predicate to restrict the allowed tuples in the output of a relation. This should be mostly used for
        join nodes and scans. If this argument is used, no other plan parameters can be supplied in the `plan_params` argument.
    parallel_workers : Optional[int], optional
        The number of parallel workers that should be used to execute the operator. If this argument is used, no other plan
        parameters can be supplied in the `plan_params` argument.
    index : Optional[str], optional
        The name of the index that should be used to scan the table. This is mostly relevant for scan nodes and should be
        *None* for all other nodes. If this argument is used, no other plan parameters can be supplied in the `plan_params`
        argument.
    lookup_key : Optional[SqlExpression], optional
        The expression that is used to look up tuples in some indexing structure. For scans, this could actually be the
        physical index. For intermediate operators such as hash tables or memoize nodes, this could be the expression that is
        used to build the table or to structure the memo. If this argument is used, no other plan parameters can be supplied
        in the `plan_params` argument.
    sort_keys : Optional[Sequence[SortKey]], optional
        How the tuples in the output of a relation are sorted. Absence of a specific sort order can be indicated either
        through an empty list or by setting this parameter to *None*. In this case, tuples are assumed to be in some random
        order. If this argument is used, no other plan parameters can be supplied in the `plan_params` argument.
    estimated_cardinality : Cardinality, optional
        The estimated number of tuples that are produced by the operator. If no estimate is available, *NaN* can be used.
        If this argument is used, no other estimates can be supplied in the `estimates` argument.
    estimated_cost : Cost, optional
        The approximate amount of abstract "work" that needs to be done to compute the result set of the operator. If no
        estimate is available, *NaN* can be used. If this argument is used, no other estimates can be supplied in the
        `estimates` argument.
    actual_cardinality : Cardinality, optional
        The actual number of tuples that are produced by the operator. If no measurement is available, *NaN* can be used.
        If this argument is used, no other measures can be supplied in the `measures` argument.
    execution_time : float, optional
        The total time (in seconds) that was spent to compute the result set of the operator. If no measurement is available,
        *NaN* can be used. If this argument is used, no other measures can be supplied in the `measures` argument.
    cache_hits : Optional[int], optional
        The number of page reads that were satisfied by the shared buffer. If no measurement is available, *None* can be
        used. If this argument is used, no other measures can be supplied in the `measures` argument.
    cache_misses : Optional[int], optional
        The number of page reads that had to be delegated to the disk and could not be satisfied by the shared buffer. If no
        measurement is available, *None* can be used. If this argument is used, no other measures can be supplied in the
        `measures` argument.
    subplan_root : Optional[QueryPlan], optional
        The root operator of the subplan. If this argument is used, no other subplan components can be supplied in the
        `subplan` argument.
    subplan_target_name : str, optional
        The name of the target table that the subplan should produce. If this argument is used, no other subplan components
        can be supplied in the `subplan` argument.
    **kwargs
        Additional metadata that should be attached to the plan parameters. If this is used, no other plan parameters can be
        supplied in the `plan_params` argument.

    See Also
    --------
    to_query_plan
    explode_query_plan
    OptimizerInterface.query_plan
    OptimizationPipeline.query_execution_plan
    """

    def __init__(
        self,
        node_type: str | PhysicalOperator,
        *,
        operator: Optional[PhysicalOperator] = None,
        children: Optional[QueryPlan | Iterable[QueryPlan]] = None,
        plan_params: Optional[PlanParams] = None,
        subplan: Optional[Subplan] = None,
        estimates: Optional[PlanEstimates] = None,
        measures: Optional[PlanMeasures] = None,
        base_table: Optional[TableReference] = None,
        filter_predicate: Optional[AbstractPredicate] = None,
        parallel_workers: Optional[int] = None,
        index: Optional[str] = None,
        sort_keys: Optional[Sequence[SortKey]] = None,
        lookup_key: Optional[SqlExpression] = None,
        estimated_cardinality: Cardinality = Cardinality.unknown(),
        estimated_cost: Cost = math.nan,
        actual_cardinality: Cardinality = Cardinality.unknown(),
        execution_time: float = math.nan,
        cache_hits: Optional[int] = None,
        cache_misses: Optional[int] = None,
        subplan_root: Optional[QueryPlan] = None,
        subplan_target_name: str = "",
        **kwargs,
    ) -> None:
        if not node_type:
            raise ValueError("Node type must be provided")

        custom_params = (
            base_table,
            filter_predicate,
            parallel_workers,
            index,
            sort_keys,
            lookup_key,
        )
        has_custom_params = any(v is not None for v in custom_params) or bool(kwargs)
        if plan_params is not None and has_custom_params:
            raise ValueError(
                "PlanParams and individual parameters/kwargs cannot be provided at the same time"
            )
        if plan_params is None:
            plan_params = PlanParams(
                base_table=base_table,
                filter_predicate=filter_predicate,
                sort_keys=sort_keys,
                lookup_key=lookup_key,
                parallel_workers=parallel_workers,
                index=index,
                **kwargs,
            )

        if estimates is not None and any(
            not math.isnan(v) for v in (estimated_cardinality, estimated_cost)
        ):
            raise ValueError(
                "PlanEstimates and individual estimates cannot be provided at the same time"
            )
        if estimates is None:
            estimates = PlanEstimates(
                cardinality=estimated_cardinality, cost=estimated_cost
            )

        has_custom_measures = any(
            v is not None and not math.isnan(v)
            for v in (execution_time, cache_hits, cache_misses)
        )
        if measures is not None and has_custom_measures:
            raise ValueError(
                "PlanMeasures and individual measures cannot be provided at the same time"
            )
        if measures is None:
            measures = PlanMeasures(
                execution_time=execution_time,
                cardinality=actual_cardinality,
                cache_hits=cache_hits,
                cache_misses=cache_misses,
            )

        if subplan is not None and (subplan_root is not None or subplan_target_name):
            raise ValueError(
                "Subplan and individual subplan components cannot be provided at the same time"
            )
        if subplan is None and (subplan_root is not None or subplan_target_name):
            subplan = Subplan(subplan_root, subplan_target_name)

        children = [] if children is None else util.enlist(children)

        if isinstance(node_type, PhysicalOperator):
            operator = node_type
            node_type = operator.name

        self._node_type = node_type
        self._operator = operator

        if len(children) == 1:
            self._input_node = children[0]
        else:
            self._input_node = None

        self._children: tuple[QueryPlan] = tuple(children) if children else ()
        self._plan_params = plan_params
        self._estimates = estimates
        self._measures = measures
        self._subplan = subplan
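
    # Usage sketch (illustrative comments, not part of the packaged file): assembling a small
    # two-way join plan with the keyword-argument style accepted by __init__ above. `t_a`, `t_b`
    # (TableReference) and `join_pred` (AbstractPredicate) are assumed to come from elsewhere,
    # e.g. the query parser, and the node type strings are arbitrary labels.
    #
    #     scan_a = QueryPlan("Seq Scan", base_table=t_a, estimated_cardinality=1_000)
    #     scan_b = QueryPlan("Index Scan", base_table=t_b, index="t_b_pkey", estimated_cardinality=50)
    #     join = QueryPlan(
    #         "Nested Loop",
    #         children=[scan_a, scan_b],      # outer child first, inner child second
    #         filter_predicate=join_pred,
    #         estimated_cardinality=500,      # plain numbers are wrapped into Cardinality
    #     )
    #     join.outer_child is scan_a          # True
    #     join.inner_child is scan_b          # True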

    @property
    def node_type(self) -> str:
        """Get the name of the operator."""
        return self._node_type

    @property
    def operator(self) -> Optional[PhysicalOperator]:
        """Get the actual operator that is used to compute the result set.

        For transient operators (e.g. hash tables), this can be *None*.
        """
        return self._operator

    @property
    def input_node(self) -> Optional[QueryPlan]:
        """Get the input node of the current operator.

        For nodes without an input (e.g. most scans), or nodes with multiple inputs (e.g. joins), this is *None*.
        """
        return self._input_node

    @property
    def children(self) -> Sequence[QueryPlan]:
        """Get the input nodes of the current operator.

        For nodes without an input (e.g. most scans), this is an empty list. For nodes with exactly one input (e.g. most
        aggregations), this is a singleton list. For nodes with two input nodes (e.g. joins), this is an ordered iterable
        with the outer child first.
        """
        return self._children

    @property
    def outer_child(self) -> Optional[QueryPlan]:
        """Get the outer input of the current operator.

        For nodes that do not have exactly two inputs, this is *None*.
        """
        if len(self._children) == 2:
            return self._children[0]
        return None

    @property
    def inner_child(self) -> Optional[QueryPlan]:
        """Get the inner input of the current operator.

        For nodes that do not have exactly two inputs, this is *None*.
        """
        if len(self._children) == 2:
            return self._children[1]
        return None

    @property
    def params(self) -> PlanParams:
        """Get the structural metadata of the operator."""
        return self._plan_params

    @property
    def base_table(self) -> Optional[TableReference]:
        """Get the table that is being scanned. For non-scan nodes, this will usually be *None*.

        This is just a shorthand for accessing the plan parameters manually.

        See Also
        --------
        PlanParams.base_table
        """
        return self._plan_params.base_table

    @property
    def filter_predicate(self) -> Optional[AbstractPredicate]:
        """Get the filter predicate that is used to restrict the tuples in the output of a relation.

        This is just a shorthand for accessing the plan parameters manually.

        See Also
        --------
        PlanParams.filter_predicate
        """
        return self._plan_params.filter_predicate

    @property
    def sort_keys(self) -> Sequence[SortKey]:
        """Get the sort keys describing the ordering of tuples in the output relation.

        Absence of a specific sort order is indicated by an empty sequence.

        This is just a shorthand for accessing the plan parameters manually.

        See Also
        --------
        PlanParams.sort_keys
        """
        return self._plan_params.sort_keys

    @property
    def lookup_key(self) -> Optional[SqlExpression]:
        """Get the expression that is used to look up tuples in some indexing structure.

        This is just a shorthand for accessing the plan parameters manually.

        See Also
        --------
        PlanParams.lookup_key
        """
        return self._plan_params.lookup_key

    @property
    def parallel_workers(self) -> int:
        """Get the number of parallel workers that should be used to execute the operator.

        Absence of parallel execution is indicated by 0.

        This is just a shorthand for accessing the plan parameters manually.

        See Also
        --------
        PlanParams.parallel_workers
        """
        return self._plan_params.parallel_workers

    @property
    def estimates(self) -> PlanEstimates:
        """Get the optimizer's view on the operator."""
        return self._estimates

    @property
    def estimated_cardinality(self) -> Cardinality:
        """Get the cardinality estimate of the optimizer.

        This is just a shorthand for accessing the estimates manually.

        See Also
        --------
        PlanEstimates.cardinality
        """
        return self._estimates.cardinality

    @property
    def estimated_cost(self) -> Cost:
        """Get the cost estimate of the optimizer.

        This is just a shorthand for accessing the estimates manually.

        See Also
        --------
        PlanEstimates.cost
        """
        return self._estimates.cost

    @property
    def measures(self) -> PlanMeasures:
        """Get the actual execution statistics of the operator."""
        return self._measures

    @property
    def actual_cardinality(self) -> Cardinality:
        """Get the actual cardinality of the operator.

        This is just a shorthand for accessing the measures manually.

        See Also
        --------
        PlanMeasures.cardinality
        """
        return self._measures.cardinality

    @property
    def execution_time(self) -> float:
        """Get the actual execution time of the operator.

        This is just a shorthand for accessing the measures manually.

        See Also
        --------
        PlanMeasures.execution_time
        """
        return self._measures.execution_time

    @property
    def subplan(self) -> Optional[Subplan]:
        """Get the subplan that has to be executed as part of this node."""
        return self._subplan

    def get(self, key: str, default: Any = None) -> Any:
        """Retrieves a specific parameter from the plan.

        The lookup is performed in the following order:
|
|
1208
|
+
|
|
1209
|
+
1. Plan parameters
|
|
1210
|
+
2. Plan estimates
|
|
1211
|
+
3. Plan measures
|
|
1212
|
+
|
|
1213
|
+
If none of these containers contains the requested key, the default value is returned.
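​
Examples
--------
A minimal usage sketch (``plan`` stands for any existing plan node and ``"loops"`` is a purely hypothetical
metadata key)::

    >>> plan.get("loops", default=1)  # doctest: +SKIP
    1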
|
|
1214
|
+
"""
|
|
1215
|
+
value = self._plan_params.get(key)
|
|
1216
|
+
if value is not None:
|
|
1217
|
+
return value
|
|
1218
|
+
value = self._estimates.get(key)
|
|
1219
|
+
if value is not None:
|
|
1220
|
+
return value
|
|
1221
|
+
value = self._measures.get(key)
|
|
1222
|
+
if value is not None:
|
|
1223
|
+
return value
|
|
1224
|
+
return default
|
|
1225
|
+
|
|
1226
|
+
def is_join(self) -> bool:
|
|
1227
|
+
"""Checks, whether the current node is a join operator."""
|
|
1228
|
+
return self._operator is not None and self._operator in JoinOperator
|
|
1229
|
+
|
|
1230
|
+
def is_scan(self) -> bool:
|
|
1231
|
+
"""Checks, whether the current node is a scan operator."""
|
|
1232
|
+
return self._operator is not None and self._operator in ScanOperator
|
|
1233
|
+
|
|
1234
|
+
def is_auxiliary(self) -> bool:
|
|
1235
|
+
"""Checks, whether the current node is an arbitrary intermediate operator (i.e. not a join nor a scan)."""
|
|
1236
|
+
return not self.is_join() and not self.is_scan()
|
|
1237
|
+
|
|
1238
|
+
def is_analyze(self) -> bool:
|
|
1239
|
+
"""Checks, whether the plan was executed in ANALYZE mode, i.e. whether runtime measurements are available."""
|
|
1240
|
+
return bool(self._measures)
|
|
1241
|
+
|
|
1242
|
+
def is_ordered(self) -> bool:
|
|
1243
|
+
"""Checks, whether the plan guarantees a specific order of the result tuples."""
|
|
1244
|
+
return bool(self._plan_params.sort_keys)
|
|
1245
|
+
|
|
1246
|
+
def is_linear(self) -> bool:
|
|
1247
|
+
"""Checks, whether the plan performs all joins in a linear order.
|
|
1248
|
+
|
|
1249
|
+
This is the case if all join nodes compute their result by joining at least one base table (no matter whether it
|
|
1250
|
+
is the inner or outer child) with another relation (base relation or intermediate).
|
|
1251
|
+
|
|
1252
|
+
As a special case, scan nodes are considered to be linear as well.
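​
Examples
--------
An illustrative sketch, assuming ``plan`` is an already-constructed plan of the shape
*HashJoin(NestLoop(SeqScan(R), IdxScan(S)), SeqScan(T))*::

    >>> plan.is_linear()  # doctest: +SKIP
    True
    >>> plan.is_bushy()   # doctest: +SKIP
    False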
|
|
1253
|
+
"""
|
|
1254
|
+
if self.is_scan():
|
|
1255
|
+
return True
|
|
1256
|
+
outer_join = self.outer_child.find_first_node(QueryPlan.is_join)
|
|
1257
|
+
inner_join = self.inner_child.find_first_node(QueryPlan.is_join)
|
|
1258
|
+
return outer_join is None or inner_join is None
|
|
1259
|
+
|
|
1260
|
+
def is_bushy(self) -> bool:
|
|
1261
|
+
"""Checks, whether the plan performs joins in a bushy order.
|
|
1262
|
+
|
|
1263
|
+
This is the case if at least one join node joins two intermediates that are themselves the result of a join.
|
|
1264
|
+
"""
|
|
1265
|
+
if self.is_scan():
|
|
1266
|
+
return False
|
|
1267
|
+
outer_join = self.outer_child.find_first_node(QueryPlan.is_join)
|
|
1268
|
+
inner_join = self.inner_child.find_first_node(QueryPlan.is_join)
|
|
1269
|
+
return outer_join is not None and inner_join is not None
|
|
1270
|
+
|
|
1271
|
+
def is_left_deep(self) -> bool:
|
|
1272
|
+
"""Checks, whether the plan performs all joins in a left-deep order.
|
|
1273
|
+
|
|
1274
|
+
Left deep order means that the plan is linear and all joins are performed with the base table as the inner relation.
|
|
1275
|
+
As a special case, scan nodes are considered to be left-deep as well.
|
|
1276
|
+
"""
|
|
1277
|
+
if self.is_scan():
|
|
1278
|
+
return True
|
|
1279
|
+
inner_join = self.inner_child.find_first_node(QueryPlan.is_join)
|
|
1280
|
+
return inner_join is None
|
|
1281
|
+
|
|
1282
|
+
def is_right_deep(self) -> bool:
|
|
1283
|
+
"""Checks, whether the plan performs all joins in a right-deep order.
|
|
1284
|
+
|
|
1285
|
+
Right deep order means that the plan is linear and all joins are performed with the base table as the outer relation.
|
|
1286
|
+
As a special case, scan nodes are considered to be right-deep as well.
|
|
1287
|
+
"""
|
|
1288
|
+
if self.is_scan():
|
|
1289
|
+
return True
|
|
1290
|
+
outer_join = self.outer_child.find_first_node(QueryPlan.is_join)
|
|
1291
|
+
return outer_join is None
|
|
1292
|
+
|
|
1293
|
+
def is_zigzag(self) -> bool:
|
|
1294
|
+
"""Checks, whether the plan performs all joins in a zigzag order.
|
|
1295
|
+
|
|
1296
|
+
Zig-zag order means that the plan is linear, but neither left-deep nor right-deep. Therefore, at least one join has
|
|
1297
|
+
to be performed with the base table as the outer relation and another join with the base table as the inner relation.
|
|
1298
|
+
As a special case, scan nodes are considered to be zig-zag as well.
|
|
1299
|
+
"""
|
|
1300
|
+
if self.is_scan():
|
|
1301
|
+
return True
|
|
1302
|
+
return self.is_linear() and not self.is_left_deep() and not self.is_right_deep()
|
|
1303
|
+
|
|
1304
|
+
def is_scan_branch(self) -> bool:
|
|
1305
|
+
"""Checks, whether the current node directly leads to a scan node.
|
|
1306
|
+
|
|
1307
|
+
For example, the plan *Hash(SeqScan(R))* is a scan branch, because the input of the hash node is a scan node.
|
|
1308
|
+
Likewise, the plan *Aggregate(Sort(R))* is a scan branch, because the input of the aggregate node is just a sort
|
|
1309
|
+
node which in turn contains a scan node. On the other hand, the plan *NestLoop(SeqScan(R), IdxScan(S))* is not a
|
|
1310
|
+
scan branch, because the nested-loop join contains two input nodes that are both scans.
|
|
1311
|
+
|
|
1312
|
+
If a plan is a scan branch, `fetch_base_table()` can be used to directly retrieve the base table that is being scanned.
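​
Examples
--------
An illustrative sketch, assuming ``plan`` is the plan *Hash(SeqScan(R))* from above::

    >>> plan.is_scan_branch()    # doctest: +SKIP
    True
    >>> plan.fetch_base_table()  # doctest: +SKIP
    R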
|
|
1313
|
+
"""
|
|
1314
|
+
return self.is_scan() or (self.input_node is not None and self.input_node.is_scan_branch())
|
|
1315
|
+
|
|
1316
|
+
def is_base_join(self) -> bool:
|
|
1317
|
+
"""Checks, whether the current node is a join node that joins two base tables.
|
|
1318
|
+
|
|
1319
|
+
The base tables do not need to be direct children of the join, but both at least have to be scan branches, as in the
|
|
1320
|
+
case of *MergeJoin(Sort(SeqScan(R)), IdxScan(S))*.
|
|
1321
|
+
|
|
1322
|
+
See Also
|
|
1323
|
+
--------
|
|
1324
|
+
is_scan_branch
|
|
1325
|
+
"""
|
|
1326
|
+
if not self.is_join():
|
|
1327
|
+
return False
|
|
1328
|
+
return all(child.is_scan_branch() for child in self.children)
|
|
1329
|
+
|
|
1330
|
+
def plan_depth(self) -> int:
|
|
1331
|
+
"""Calculates the depth of the query plan.
|
|
1332
|
+
|
|
1333
|
+
The depth of a query plan is the length of the longest path from the root to a leaf node. The leaf node is included in
|
|
1334
|
+
the calculation, i.e. the depth of the plan *SeqScan(R)* is 1.
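​
For example, assuming ``plan`` is the plan *NestLoop(SeqScan(R), IdxScan(S))*::

    >>> plan.plan_depth()  # doctest: +SKIP
    2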
|
|
1335
|
+
"""
|
|
1336
|
+
return 1 + max((child.plan_depth() for child in self.children), default=0)
|
|
1337
|
+
|
|
1338
|
+
def fetch_base_table(self) -> Optional[TableReference]:
|
|
1339
|
+
"""Retrieves the base table that is being scanned by the plan.
|
|
1340
|
+
|
|
1341
|
+
The base table is only specified for plans that directly lead to a scan node, as defined by `is_scan_branch()`.
|
|
1342
|
+
"""
|
|
1343
|
+
if self.is_scan():
|
|
1344
|
+
return self.base_table
|
|
1345
|
+
elif self.is_join():
|
|
1346
|
+
return None
|
|
1347
|
+
|
|
1348
|
+
if len(self.children) == 1:
|
|
1349
|
+
return self.children[0].fetch_base_table()
|
|
1350
|
+
return None
|
|
1351
|
+
|
|
1352
|
+
def outermost_scan(self) -> Optional[QueryPlan]:
|
|
1353
|
+
"""Retrieves the scan node that is furthest to the "left", i.e. on the outer-most position in the plan."""
|
|
1354
|
+
if self.is_scan():
|
|
1355
|
+
return self
|
|
1356
|
+
elif self.is_join():
|
|
1357
|
+
return self.outer_child.outermost_scan()
|
|
1358
|
+
|
|
1359
|
+
assert self.input_node is not None
|
|
1360
|
+
return self.input_node.outermost_scan()
|
|
1361
|
+
|
|
1362
|
+
def tables(self) -> set[TableReference]:
|
|
1363
|
+
"""Provides all tables that are accessed at some point in the plan.
|
|
1364
|
+
|
|
1365
|
+
Notice that tables that are only accessed as part of user-specific metadata are not considered.
|
|
1366
|
+
"""
|
|
1367
|
+
subplan_tabs: set[TableReference] = (
|
|
1368
|
+
self._subplan.tables() if self._subplan else set()
|
|
1369
|
+
)
|
|
1370
|
+
return (
|
|
1371
|
+
self._plan_params.tables()
|
|
1372
|
+
| util.set_union(c.tables() for c in self._children)
|
|
1373
|
+
| subplan_tabs
|
|
1374
|
+
)
|
|
1375
|
+
|
|
1376
|
+
def columns(self) -> set[ColumnReference]:
|
|
1377
|
+
"""Provides all columns that are accessed at some point in the plan.
|
|
1378
|
+
|
|
1379
|
+
Notice that columns that are only accessed as part of user-specific metadata are not considered.
|
|
1380
|
+
"""
|
|
1381
|
+
subplan_cols = self._subplan.root.columns() if self._subplan else set()
|
|
1382
|
+
return (
|
|
1383
|
+
self._plan_params.columns()
|
|
1384
|
+
| util.set_union(c.columns() for c in self._children)
|
|
1385
|
+
| subplan_cols
|
|
1386
|
+
)
|
|
1387
|
+
|
|
1388
|
+
def iternodes(self) -> Iterable[QueryPlan]:
|
|
1389
|
+
"""Provides all nodes that are contained in the plan in depth-first order, prioritizing outer child nodes."""
|
|
1390
|
+
return util.flatten(child.iternodes() for child in self._children) + [self]
|
|
1391
|
+
|
|
1392
|
+
def lookup(
|
|
1393
|
+
self, tables: TableReference | Iterable[TableReference]
|
|
1394
|
+
) -> Optional[QueryPlan]:
|
|
1395
|
+
"""Traverse the plan to find a specific intermediate node.
|
|
1396
|
+
|
|
1397
|
+
If two nodes compute the same intermediate (i.e. provide the same tables), the node that is higher up in the plan is
|
|
1398
|
+
returned. If both appear on the same level, the outer child is preferred.
|
|
1399
|
+
|
|
1400
|
+
Parameters
|
|
1401
|
+
----------
|
|
1402
|
+
tables : TableReference | Iterable[TableReference]
|
|
1403
|
+
The tables that should be contained in the intermediate. If a single table is provided (either as-is or as a
|
|
1404
|
+
singleton iterable), the corresponding scan node will be returned. If multiple tables are provided, the highest
|
|
1405
|
+
node that provides all of them *exactly* is returned.
|
|
1406
|
+
|
|
1407
|
+
Returns
|
|
1408
|
+
-------
|
|
1409
|
+
Optional[QueryPlan]
|
|
1410
|
+
The join tree node that contains the specified tables. If no such node exists, *None* is returned.
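​
Examples
--------
A rough sketch for a plan of the shape *HashJoin(SeqScan(R), SeqScan(S))* (``plan``, ``R`` and ``S`` are
assumed to exist already)::

    >>> plan.lookup([R, S]) is plan  # doctest: +SKIP
    True
    >>> plan.lookup(R)               # doctest: +SKIP
    SeqScan(R)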
|
|
1411
|
+
"""
|
|
1412
|
+
needle: set[TableReference] = set(util.enlist(tables))
|
|
1413
|
+
candidates = self.tables()
|
|
1414
|
+
|
|
1415
|
+
if needle == candidates:
|
|
1416
|
+
return self
|
|
1417
|
+
if not needle.issubset(candidates):
|
|
1418
|
+
return None
|
|
1419
|
+
|
|
1420
|
+
for child in self.children:
|
|
1421
|
+
result = child.lookup(needle)
|
|
1422
|
+
if result is not None:
|
|
1423
|
+
return result
|
|
1424
|
+
|
|
1425
|
+
return None
|
|
1426
|
+
|
|
1427
|
+
def find_first_node(
|
|
1428
|
+
self,
|
|
1429
|
+
predicate: Callable[[QueryPlan], bool],
|
|
1430
|
+
*args,
|
|
1431
|
+
direction: JoinDirection = "outer",
|
|
1432
|
+
**kwargs,
|
|
1433
|
+
) -> Optional[QueryPlan]:
|
|
1434
|
+
"""Recursively searches for the first node that matches a specific predicate.
|
|
1435
|
+
|
|
1436
|
+
Parameters
|
|
1437
|
+
----------
|
|
1438
|
+
predicate : Callable[[QueryPlan], bool]
|
|
1439
|
+
The predicate to check. The predicate is called on each node in the tree and should return a *True-ish* value if
|
|
1440
|
+
the node matches the desired search criteria.
|
|
1441
|
+
direction : JoinDirection, optional
|
|
1442
|
+
The traversal strategy to use. *Outer* (the default) indicates that the outer child should be traversed first if
|
|
1443
|
+
the check on the parent node fails. *Inner* indicates the opposite.
|
|
1444
|
+
args
|
|
1445
|
+
Additional positional arguments that are passed to the predicate *after* the current node.
|
|
1446
|
+
kwargs
|
|
1447
|
+
Additional keyword arguments that are passed to the predicate.
|
|
1448
|
+
|
|
1449
|
+
Returns
|
|
1450
|
+
-------
|
|
1451
|
+
Optional[QueryPlan]
|
|
1452
|
+
The first node that matches the predicate. If no such node exists, *None* is returned.
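​
Examples
--------
A minimal sketch that locates the first join below an (assumed) ``plan`` node, mirroring how `is_linear`
and related checks use this method::

    >>> plan.find_first_node(QueryPlan.is_join)  # doctest: +SKIP
    HashJoin(SeqScan(R), SeqScan(S))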
|
|
1453
|
+
"""
|
|
1454
|
+
if predicate(self, *args, **kwargs):
|
|
1455
|
+
return self
|
|
1456
|
+
if not self.children:
|
|
1457
|
+
return None
|
|
1458
|
+
|
|
1459
|
+
if len(self.children) == 1:
|
|
1460
|
+
return self.input_node.find_first_node(
|
|
1461
|
+
predicate, *args, direction=direction, **kwargs
|
|
1462
|
+
)
|
|
1463
|
+
|
|
1464
|
+
first_candidate, second_candidate = (
|
|
1465
|
+
(self.outer_child, self.inner_child)
|
|
1466
|
+
if direction == "outer"
|
|
1467
|
+
else (self.inner_child, self.outer_child)
|
|
1468
|
+
)
|
|
1469
|
+
first_match = first_candidate.find_first_node(
|
|
1470
|
+
predicate, *args, direction=direction, **kwargs
|
|
1471
|
+
)
|
|
1472
|
+
if first_match:
|
|
1473
|
+
return first_match
|
|
1474
|
+
|
|
1475
|
+
second_match = second_candidate.find_first_node(
|
|
1476
|
+
predicate, *args, direction=direction, **kwargs
|
|
1477
|
+
)
|
|
1478
|
+
if second_match:
|
|
1479
|
+
return second_match
|
|
1480
|
+
|
|
1481
|
+
return (
|
|
1482
|
+
self._subplan.root.find_first_node(
|
|
1483
|
+
predicate, *args, direction=direction, **kwargs
|
|
1484
|
+
)
|
|
1485
|
+
if self._subplan
|
|
1486
|
+
else None
|
|
1487
|
+
)
|
|
1488
|
+
|
|
1489
|
+
def find_all_nodes(
|
|
1490
|
+
self, predicate: Callable[[QueryPlan], bool], *args, **kwargs
|
|
1491
|
+
) -> Iterable[QueryPlan]:
|
|
1492
|
+
"""Recursively searches for all nodes that match a specific predicate.
|
|
1493
|
+
|
|
1494
|
+
The order in which the matching nodes appear is an implementation detail and should not be relied upon.
|
|
1495
|
+
|
|
1496
|
+
Parameters
|
|
1497
|
+
----------
|
|
1498
|
+
predicate : Callable[[QueryPlan], bool]
|
|
1499
|
+
The predicate to check. The predicate is called on each node in the tree and should return a *True-ish* value if
|
|
1500
|
+
the node matches the desired search criteria.
|
|
1501
|
+
args
|
|
1502
|
+
Additional positional arguments that are passed to the predicate *after* the current node.
|
|
1503
|
+
kwargs
|
|
1504
|
+
Additional keyword arguments that are passed to the predicate.
|
|
1505
|
+
|
|
1506
|
+
Returns
|
|
1507
|
+
-------
|
|
1508
|
+
Iterable[QueryPlan]
|
|
1509
|
+
All nodes that match the predicate. If no such nodes exist, an empty iterable is returned.
|
|
1510
|
+
"""
|
|
1511
|
+
matches: list[QueryPlan] = [self] if predicate(self, *args, **kwargs) else []
|
|
1512
|
+
for child in self._children:
|
|
1513
|
+
matches.extend(child.find_all_nodes(predicate, *args, **kwargs))
|
|
1514
|
+
if self._subplan:
|
|
1515
|
+
matches.extend(
|
|
1516
|
+
self._subplan.root.find_all_nodes(predicate, *args, **kwargs)
|
|
1517
|
+
)
|
|
1518
|
+
return matches
|
|
1519
|
+
|
|
1520
|
+
def cout(self, *, include_auxiliaries: bool = True) -> float:
|
|
1521
|
+
"""Computes the *C-out* value of the operator.
|
|
1522
|
+
|
|
1523
|
+
The *C-out* value is the sum of the actual cardinalities of the current operator and all operators below it.
|
|
1524
|
+
|
|
1525
|
+
If the plan does not contain a measurement of the actual cardinality, the *C-out* value is undefined (indicated as
|
|
1526
|
+
*NaN*).
|
|
1527
|
+
|
|
1528
|
+
Parameters
|
|
1529
|
+
----------
|
|
1530
|
+
include_auxiliaries : bool, optional
|
|
1531
|
+
Whether to include auxiliary nodes in the computation (which is the default). If disabled, only the actual
|
|
1532
|
+
cardinality of join and scan nodes is considered.
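​
Examples
--------
An illustrative calculation: for an ANALYZE plan *HashJoin(SeqScan(R), SeqScan(S))* with actual cardinalities
of 1000 (join), 100 (scan of *R*) and 50 (scan of *S*), the *C-out* value is 1000 + 100 + 50 = 1150.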
|
|
1533
|
+
"""
|
|
1534
|
+
if not self.is_analyze():
|
|
1535
|
+
return math.nan
|
|
1536
|
+
own_card = (
|
|
1537
|
+
self.actual_cardinality
|
|
1538
|
+
if include_auxiliaries or not self.is_auxiliary()
|
|
1539
|
+
else 0
|
|
1540
|
+
)
|
|
1541
|
+
return own_card + sum(
|
|
1542
|
+
c.cout(include_auxiliaries=include_auxiliaries) for c in self.children
|
|
1543
|
+
)
|
|
1544
|
+
|
|
1545
|
+
def qerror(self) -> float:
|
|
1546
|
+
"""Computes the *Q-error* of the operator.
|
|
1547
|
+
|
|
1548
|
+
If the plan does not contain a measurement of the actual cardinality, the *Q-error* value is undefined (indicated as *NaN*).
|
|
1549
|
+
|
|
1550
|
+
Notes
|
|
1551
|
+
-----
|
|
1552
|
+
We use a slight deviation from the standard definition:
|
|
1553
|
+
|
|
1554
|
+
.. math ::
|
|
1555
|
+
qerror(e, a) = \\frac{max(e, a) + 1}{min(e, a) + 1}
|
|
1556
|
+
|
|
1557
|
+
where *e* is the estimated cardinality of the node and *a* is the actual cardinality. Notice that we add 1 to both the
|
|
1558
|
+
numerator as well as the denominator to prevent infinity errors for nodes that do not process any rows (e.g. due to
|
|
1559
|
+
pruning).
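​
As an illustrative calculation, an estimate of 99 rows against an actual cardinality of 399 rows yields a
Q-error of (399 + 1) / (99 + 1) = 4.0.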
|
|
1560
|
+
"""
|
|
1561
|
+
if not self.is_analyze():
|
|
1562
|
+
return math.nan
|
|
1563
|
+
|
|
1564
|
+
larger = max(self.estimated_cardinality, self.actual_cardinality) + 1
|
|
1565
|
+
smaller = min(self.estimated_cardinality, self.actual_cardinality) + 1
|
|
1566
|
+
return larger / smaller
|
|
1567
|
+
|
|
1568
|
+
def parallelize(self, workers: int) -> QueryPlan:
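"""Creates a copy of this node with the number of parallel workers set to `workers`.

All other plan parameters as well as the estimates, measures, children and subplan are carried over
unchanged. The current plan is not modified.
"""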
|
|
1569
|
+
plan_params = self._plan_params.clone()
|
|
1570
|
+
plan_params.parallel_workers = workers
|
|
1571
|
+
return QueryPlan(
|
|
1572
|
+
self.node_type,
|
|
1573
|
+
operator=self.operator,
|
|
1574
|
+
children=self.children,
|
|
1575
|
+
plan_params=plan_params,
|
|
1576
|
+
estimates=self._estimates,
|
|
1577
|
+
measures=self._measures,
|
|
1578
|
+
subplan=self.subplan,
|
|
1579
|
+
)
|
|
1580
|
+
|
|
1581
|
+
def with_estimates(
|
|
1582
|
+
self,
|
|
1583
|
+
*,
|
|
1584
|
+
cardinality: Optional[Cardinality] = None,
|
|
1585
|
+
cost: Optional[Cost] = None,
|
|
1586
|
+
keep_measures: bool = False,
|
|
1587
|
+
) -> QueryPlan:
|
|
1588
|
+
"""Replaces the current estimates of the operator with new ones.
|
|
1589
|
+
|
|
1590
|
+
Parameters
|
|
1591
|
+
----------
|
|
1592
|
+
cardinality : Optional[Cardinality], optional
|
|
1593
|
+
The new estimated cardinality of the operator. If the estimate should be dropped, *NaN* can be used. If the current
|
|
1594
|
+
cardinality should be kept, *None* can be passed (which is the default).
|
|
1595
|
+
cost : Optional[Cost], optional
|
|
1596
|
+
The new estimated cost of the operator. If the estimate should be dropped, *NaN* can be used. If the current cost
|
|
1597
|
+
should be kept, *None* can be passed (which is the default).
|
|
1598
|
+
keep_measures : bool, optional
|
|
1599
|
+
Whether to keep the actual measurements of the operator. If this is set to *False*, the actual cardinality and
|
|
1600
|
+
execution time are dropped. Measures are dropped by default because they usually depend on the estimates (which
|
|
1601
|
+
are now changed).
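​
Examples
--------
A minimal sketch (``plan`` is assumed to be an existing plan node; ``Cardinality(42)`` stands in for however
cardinality values are constructed in the surrounding code). Note that measures are dropped by default::

    >>> updated = plan.with_estimates(cardinality=Cardinality(42))  # doctest: +SKIP
    >>> updated.is_analyze()  # doctest: +SKIP
    False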
|
|
1602
|
+
"""
|
|
1603
|
+
cardinality = self.estimated_cardinality if cardinality is None else cardinality
|
|
1604
|
+
cost = self.estimated_cost if cost is None else cost
|
|
1605
|
+
updated_estimates = PlanEstimates(cardinality=cardinality, cost=cost)
|
|
1606
|
+
updated_measures = self._measures if keep_measures else None
|
|
1607
|
+
return QueryPlan(
|
|
1608
|
+
self.node_type,
|
|
1609
|
+
operator=self.operator,
|
|
1610
|
+
children=self.children,
|
|
1611
|
+
plan_params=self.params,
|
|
1612
|
+
estimates=updated_estimates,
|
|
1613
|
+
measures=updated_measures,
|
|
1614
|
+
subplan=self.subplan,
|
|
1615
|
+
)
|
|
1616
|
+
|
|
1617
|
+
def with_actual_card(
|
|
1618
|
+
self,
|
|
1619
|
+
*,
|
|
1620
|
+
cost_estimator: Optional[Callable[[QueryPlan, Cardinality], Cost]] = None,
|
|
1621
|
+
ignore_nan: bool = True,
|
|
1622
|
+
) -> QueryPlan:
|
|
1623
|
+
"""Replaces the current estimates of the operator with the actual measurements.
|
|
1624
|
+
|
|
1625
|
+
The updated plan will no longer contain any measurements and the costs will be set to *NaN* unless an explicit cost
|
|
1626
|
+
estimator is provided.
|
|
1627
|
+
|
|
1628
|
+
Parameters
|
|
1629
|
+
----------
|
|
1630
|
+
cost_estimator : Optional[Callable[[QueryPlan, Cardinality], Cost]], optional
|
|
1631
|
+
An optional cost function to compute new cost estimates based on the new cardinalities. If no cost estimator is provided,
|
|
1632
|
+
the cost is set to *NaN*. The estimator receives the old plan node along with the new cardinality estimate as input
|
|
1633
|
+
and should return the new cost estimate.
|
|
1634
|
+
ignore_nan : bool, optional
|
|
1635
|
+
Whether *NaN* cardinalities should also be swapped. By default, this is set to *True*, which only replaces the
|
|
1636
|
+
estimated cardinality if the actual cardinality is a meaningful value.
|
|
1637
|
+
|
|
1638
|
+
Returns
|
|
1639
|
+
-------
|
|
1640
|
+
QueryPlan
|
|
1641
|
+
A new query plan with the actual cardinality as the estimated cardinality. The estimated cost is recomputed via the
|
|
1642
|
+
given `cost_estimator` or set to *NaN* if no estimator is provided. The current plan is not changed.
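​
Examples
--------
A rough sketch (``analyzed_plan`` is assumed to be an ANALYZE plan with meaningful measurements)::

    >>> replayed = analyzed_plan.with_actual_card()  # doctest: +SKIP
    >>> replayed.estimated_cardinality == analyzed_plan.actual_cardinality  # doctest: +SKIP
    True
    >>> replayed.is_analyze()  # doctest: +SKIP
    False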
|
|
1643
|
+
"""
|
|
1644
|
+
if self.actual_cardinality:
|
|
1645
|
+
updated_cardinality = (
|
|
1646
|
+
self.estimated_cardinality
|
|
1647
|
+
if ignore_nan and self.actual_cardinality.isnan()
|
|
1648
|
+
else self.actual_cardinality
|
|
1649
|
+
)
|
|
1650
|
+
updated_cost = (
|
|
1651
|
+
cost_estimator(self, updated_cardinality)
|
|
1652
|
+
if cost_estimator
|
|
1653
|
+
else math.nan
|
|
1654
|
+
)
|
|
1655
|
+
updated_estimates = PlanEstimates(
|
|
1656
|
+
cardinality=updated_cardinality, cost=updated_cost
|
|
1657
|
+
)
|
|
1658
|
+
updated_measures = None
|
|
1659
|
+
else:
|
|
1660
|
+
updated_estimates = self._estimates
|
|
1661
|
+
updated_measures = None
|
|
1662
|
+
|
|
1663
|
+
updated_children = [
|
|
1664
|
+
child.with_actual_card(cost_estimator=cost_estimator, ignore_nan=ignore_nan)
|
|
1665
|
+
for child in self.children
|
|
1666
|
+
]
|
|
1667
|
+
|
|
1668
|
+
if self.subplan:
|
|
1669
|
+
updated_subplan_root = self.subplan.root.with_actual_card(
|
|
1670
|
+
cost_estimator=cost_estimator, ignore_nan=ignore_nan
|
|
1671
|
+
)
|
|
1672
|
+
updated_subplan = Subplan(updated_subplan_root, self.subplan.target_name)
|
|
1673
|
+
else:
|
|
1674
|
+
updated_subplan = None
|
|
1675
|
+
|
|
1676
|
+
return QueryPlan(
|
|
1677
|
+
self.node_type,
|
|
1678
|
+
operator=self.operator,
|
|
1679
|
+
children=updated_children,
|
|
1680
|
+
plan_params=self.params,
|
|
1681
|
+
estimates=updated_estimates,
|
|
1682
|
+
measures=updated_measures,
|
|
1683
|
+
subplan=updated_subplan,
|
|
1684
|
+
)
|
|
1685
|
+
|
|
1686
|
+
def canonical(self) -> QueryPlan:
|
|
1687
|
+
"""Creates a normalized version of the query plan.
|
|
1688
|
+
|
|
1689
|
+
This normalized version will only contain scan and join nodes, without any auxiliary nodes. Estimates and measurements
|
|
1690
|
+
of these nodes are kept as they are.
|
|
1691
|
+
|
|
1692
|
+
This method is mostly intended to remove system-specific elements of the QEP and provide a more stable representation.
|
|
1693
|
+
For example, Postgres uses a combination of a Hash Join node and a Hash node to represent an actual hash join. Likewise,
|
|
1694
|
+
bitmap scans are represented as a bitmap heap scan with a number of bitmap index scans (and optional bitmap ANDs and
|
|
1695
|
+
ORs) as child nodes. With `canonical` all of these "implementation details" are removed and only the core of the query
|
|
1696
|
+
plan is kept.
|
|
1697
|
+
|
|
1698
|
+
Notice that aggregations and groupings are also auxiliary nodes and will not be available after canonicalization.
|
|
1699
|
+
Therefore, the cost of the canonical query plan might be less than the cost of the original plan.
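​
Examples
--------
As a schematic illustration of the intent, a Postgres-style plan *HashJoin(SeqScan(R), Hash(SeqScan(S)))* is
reduced to *HashJoin(SeqScan(R), SeqScan(S))*::

    >>> plan.canonical()  # doctest: +SKIP
    HashJoin(SeqScan(R), SeqScan(S))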
|
|
1700
|
+
"""
|
|
1701
|
+
if self.subplan:
|
|
1702
|
+
updated_subplan_root = self.subplan.root.canonical()
|
|
1703
|
+
updated_subplan = Subplan(updated_subplan_root, self.subplan.target_name)
|
|
1704
|
+
else:
|
|
1705
|
+
updated_subplan = None
|
|
1706
|
+
|
|
1707
|
+
if self.is_scan():
|
|
1708
|
+
# we remove all child nodes from scans to prevent any bitmap-scan shenanigans
|
|
1709
|
+
return QueryPlan(
|
|
1710
|
+
self.node_type,
|
|
1711
|
+
operator=self.operator,
|
|
1712
|
+
children=[],
|
|
1713
|
+
plan_params=self.params, # params include the base table
|
|
1714
|
+
estimates=self.estimates,
|
|
1715
|
+
measures=self.measures,
|
|
1716
|
+
subplan=updated_subplan,
|
|
1717
|
+
)
|
|
1718
|
+
|
|
1719
|
+
if not self.is_scan() and not self.is_join():
|
|
1720
|
+
# skip over auxiliary nodes
|
|
1721
|
+
return self.input_node.canonical()
|
|
1722
|
+
|
|
1723
|
+
children = [child.canonical() for child in self.children]
|
|
1724
|
+
return QueryPlan(
|
|
1725
|
+
self.node_type,
|
|
1726
|
+
operator=self.operator,
|
|
1727
|
+
children=children,
|
|
1728
|
+
plan_params=self.params,
|
|
1729
|
+
estimates=self.estimates,
|
|
1730
|
+
measures=self.measures,
|
|
1731
|
+
subplan=updated_subplan,
|
|
1732
|
+
)
|
|
1733
|
+
|
|
1734
|
+
def inspect(self, *, fields: Optional[Iterable[str]] = None) -> str:
|
|
1735
|
+
"""Provides a human-readable representation of the query plan, inspired by Postgre's *EXPLAIN* output.
|
|
1736
|
+
|
|
1737
|
+
By default, the output will contain fields akin to the *EXPLAIN* output of Postgres. For example, this includes the
|
|
1738
|
+
estimated cardinality and the operator cost, or for *ANALYZE* plans also the actual measurements.
|
|
1739
|
+
|
|
1740
|
+
This can be customized by providing a list of fields that should be included in the output. The fields can either
|
|
1741
|
+
reference properties of the plan itself (e.g. ``estimated_cardinality``) or use a redirection to the metadata properties
|
|
1742
|
+
(e.g. ``params.index``). However, the current implementation only supports a single level of indirection, i.e. no
|
|
1743
|
+
``params.custom_property.one_more_level``.
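​
Examples
--------
A minimal sketch of a customized inspection (``plan`` is assumed to be an existing plan node; both fields
reference properties that exist on the plan and its parameters)::

    >>> print(plan.inspect(fields=["estimated_cardinality", "params.base_table"]))  # doctest: +SKIP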
|
|
1744
|
+
"""
|
|
1745
|
+
fields = [] if fields is None else list(fields)
|
|
1746
|
+
return _explainify(self, fields=fields)
|
|
1747
|
+
|
|
1748
|
+
def explain(self) -> str:
|
|
1749
|
+
"""Alias for `inspect`
|
|
1750
|
+
|
|
1751
|
+
See Also
|
|
1752
|
+
--------
|
|
1753
|
+
inspect
|
|
1754
|
+
"""
|
|
1755
|
+
return self.inspect()
|
|
1756
|
+
|
|
1757
|
+
def plan_summary(self) -> dict[str, object]:
|
|
1758
|
+
"""Provides a quick summary of important properties of the query plan, inspired by Panda's *describe* method."""
|
|
1759
|
+
all_nodes = list(self.iternodes())
|
|
1760
|
+
summary: dict[str, object] = {
|
|
1761
|
+
"operator": self.node_type,
|
|
1762
|
+
"intermediate": " ⋈ ".join(str(t) for t in self.tables()),
|
|
1763
|
+
"estimated_card": round(self.estimated_cardinality, 3),
|
|
1764
|
+
"actual_card": round(self.actual_cardinality, 3),
|
|
1765
|
+
"estimated_cost": round(self.estimated_cost, 3),
|
|
1766
|
+
"c_out": self.cout(),
|
|
1767
|
+
"max_qerror": round(max(node.qerror() for node in all_nodes), 3),
|
|
1768
|
+
"avg_qerror": round(
|
|
1769
|
+
sum(node.qerror() for node in all_nodes) / len(all_nodes), 3
|
|
1770
|
+
),
|
|
1771
|
+
"phys_ops": collections.Counter(child.node_type for child in all_nodes),
|
|
1772
|
+
}
|
|
1773
|
+
return summary
|
|
1774
|
+
|
|
1775
|
+
def ast(self) -> str:
|
|
1776
|
+
"""Provides the tree-structure of the plan in a human-readable format."""
|
|
1777
|
+
return _astify(self)
|
|
1778
|
+
|
|
1779
|
+
def clone(self, *, deep: bool = False) -> QueryPlan:
|
|
1780
|
+
return self.__deepcopy__({}) if deep else self.__copy__()
|
|
1781
|
+
|
|
1782
|
+
def __json__(self) -> jsondict:
|
|
1783
|
+
return {
|
|
1784
|
+
"node_type": self.node_type,
|
|
1785
|
+
"operator": self.operator,
|
|
1786
|
+
"children": self.children,
|
|
1787
|
+
"plan_params": self._plan_params,
|
|
1788
|
+
"estimates": self._estimates,
|
|
1789
|
+
"measures": self._measures,
|
|
1790
|
+
"subplan": self._subplan,
|
|
1791
|
+
}
|
|
1792
|
+
|
|
1793
|
+
def __copy__(self) -> QueryPlan:
|
|
1794
|
+
return QueryPlan(
|
|
1795
|
+
self._node_type,
|
|
1796
|
+
operator=self._operator,
|
|
1797
|
+
children=self._children,
|
|
1798
|
+
plan_params=self._plan_params.clone(deep=False),
|
|
1799
|
+
estimates=self._estimates.clone(deep=False),
|
|
1800
|
+
measures=self._measures.clone(deep=False) if self._measures else None,
|
|
1801
|
+
subplan=self._subplan.clone(deep=False) if self._subplan else None,
|
|
1802
|
+
)
|
|
1803
|
+
|
|
1804
|
+
def __deepcopy__(self, memo: dict[int, object] = {}) -> QueryPlan:
|
|
1805
|
+
return QueryPlan(
|
|
1806
|
+
self._node_type,
|
|
1807
|
+
operator=self._operator,
|
|
1808
|
+
children=[child.__deepcopy__(memo) for child in self._children],
|
|
1809
|
+
plan_params=self._plan_params.clone(deep=True),
|
|
1810
|
+
estimates=self._estimates.clone(deep=True),
|
|
1811
|
+
measures=self._measures.clone(deep=True) if self._measures else None,
|
|
1812
|
+
subplan=self._subplan.clone(deep=True) if self._subplan else None,
|
|
1813
|
+
)
|
|
1814
|
+
|
|
1815
|
+
def __len__(self) -> int:
|
|
1816
|
+
return self.plan_depth()
|
|
1817
|
+
|
|
1818
|
+
def __contains__(
|
|
1819
|
+
self, key: str | TableReference | Iterable[TableReference]
|
|
1820
|
+
) -> bool:
|
|
1821
|
+
if isinstance(key, TableReference):
|
|
1822
|
+
return key in self.tables()
|
|
1823
|
+
elif isinstance(key, Iterable):
|
|
1824
|
+
return set(key).issubset(self.tables())
|
|
1825
|
+
|
|
1826
|
+
return (
|
|
1827
|
+
key in self._plan_params
|
|
1828
|
+
or key in self._estimates
|
|
1829
|
+
or (self._measures and key in self._measures)
|
|
1830
|
+
)
|
|
1831
|
+
|
|
1832
|
+
def __getitem__(self, key: str) -> Any:
|
|
1833
|
+
if key in self._plan_params:
|
|
1834
|
+
return self._plan_params[key]
|
|
1835
|
+
if key in self._estimates:
|
|
1836
|
+
return self._estimates[key]
|
|
1837
|
+
if self._measures and key in self._measures:
|
|
1838
|
+
return self._measures[key]
|
|
1839
|
+
raise KeyError(f"'{key}' not found")
|
|
1840
|
+
|
|
1841
|
+
def __iter__(self) -> Iterator[QueryPlan]:
|
|
1842
|
+
yield self
|
|
1843
|
+
for child in self.children:
|
|
1844
|
+
yield from child
|
|
1845
|
+
if self.subplan:
|
|
1846
|
+
yield from self.subplan.root
|
|
1847
|
+
|
|
1848
|
+
def __eq__(self, other: object) -> bool:
|
|
1849
|
+
return (
|
|
1850
|
+
isinstance(other, type(self))
|
|
1851
|
+
and self._node_type == other._node_type
|
|
1852
|
+
and self.base_table == other.base_table
|
|
1853
|
+
and self._children == other._children
|
|
1854
|
+
)
|
|
1855
|
+
|
|
1856
|
+
def __hash__(self) -> int:
|
|
1857
|
+
return hash((self.node_type, self.base_table, self._children))
|
|
1858
|
+
|
|
1859
|
+
def __repr__(self) -> str:
|
|
1860
|
+
return str(self)
|
|
1861
|
+
|
|
1862
|
+
def __str__(self) -> str:
|
|
1863
|
+
normalized_node_type = self.node_type.replace(" ", "")
|
|
1864
|
+
if self.base_table:
|
|
1865
|
+
return f"{normalized_node_type}({self.base_table.identifier()})"
|
|
1866
|
+
child_texts = ", ".join(str(child) for child in self.children)
|
|
1867
|
+
return f"{normalized_node_type}({child_texts})"
|
|
1868
|
+
|
|
1869
|
+
|
|
1870
|
+
_starting_indentation = 0
|
|
1871
|
+
|
|
1872
|
+
|
|
1873
|
+
def _default_explain(plan: QueryPlan, *, padding: str) -> str:
|
|
1874
|
+
"""Generates the Postgres-style *EXPLAIN* output for the current node."""
|
|
1875
|
+
components: list[str] = []
|
|
1876
|
+
metadata_indent = " " if padding else " "
|
|
1877
|
+
|
|
1878
|
+
estimated_card = round(plan.estimated_cardinality, 3)
|
|
1879
|
+
estimated_cost = round(plan.estimated_cost, 3)
|
|
1880
|
+
components.append(
|
|
1881
|
+
f"{padding}{metadata_indent}Estimated Cardinality={estimated_card}, Estimated Cost={estimated_cost}"
|
|
1882
|
+
)
|
|
1883
|
+
|
|
1884
|
+
if plan.is_analyze():
|
|
1885
|
+
actual_card = round(plan.actual_cardinality, 3)
|
|
1886
|
+
exec_time = round(plan.execution_time, 3)
|
|
1887
|
+
components.append(
|
|
1888
|
+
f"{padding}{metadata_indent}Actual Cardinality={actual_card}, Actual Time={exec_time}s"
|
|
1889
|
+
)
|
|
1890
|
+
|
|
1891
|
+
measures = plan.measures
|
|
1892
|
+
if measures.cache_hits is not None or measures.cache_misses is not None:
|
|
1893
|
+
cache_hits = (
|
|
1894
|
+
measures.cache_hits if measures.cache_hits is not None else math.nan
|
|
1895
|
+
)
|
|
1896
|
+
cache_misses = (
|
|
1897
|
+
measures.cache_misses if measures.cache_misses is not None else math.nan
|
|
1898
|
+
)
|
|
1899
|
+
components.append(
|
|
1900
|
+
f"{padding}{metadata_indent}Cache Hits={cache_hits}, Cache Misses={cache_misses}"
|
|
1901
|
+
)
|
|
1902
|
+
|
|
1903
|
+
params = plan.params
|
|
1904
|
+
if params.parallel_workers:
|
|
1905
|
+
components.append(
|
|
1906
|
+
f"{padding}{metadata_indent}Parallel Workers={params.parallel_workers}"
|
|
1907
|
+
)
|
|
1908
|
+
if params.lookup_key:
|
|
1909
|
+
components.append(f"{padding}{metadata_indent}Lookup Key={params.lookup_key}")
|
|
1910
|
+
|
|
1911
|
+
path_props: list[str] = []
|
|
1912
|
+
if params.index:
|
|
1913
|
+
path_props.append(f"Index={params.index}")
|
|
1914
|
+
if params.sort_keys:
|
|
1915
|
+
sort_keys = ", ".join(str(key) for key in params.sort_keys)
|
|
1916
|
+
path_props.append(f"Sort Keys={sort_keys}")
|
|
1917
|
+
if path_props:
|
|
1918
|
+
components.append(f"{padding}{metadata_indent}{', '.join(path_props)}")
|
|
1919
|
+
|
|
1920
|
+
return "\n".join(components)
|
|
1921
|
+
|
|
1922
|
+
|
|
1923
|
+
def _custom_explain(plan: QueryPlan, *, fields: list[str], padding: str) -> str:
|
|
1924
|
+
"""Generates the user-specific *EXPLAIN* output for the current node."""
|
|
1925
|
+
attr_values: dict[str, str] = {}
|
|
1926
|
+
for attr in fields:
|
|
1927
|
+
if "." in attr:
|
|
1928
|
+
container_name, attr_name = attr.split(".")
|
|
1929
|
+
container = getattr(plan, container_name)
|
|
1930
|
+
value = getattr(container, attr_name)
|
|
1931
|
+
else:
|
|
1932
|
+
value = getattr(plan, attr)
|
|
1933
|
+
|
|
1934
|
+
attr_values[attr] = (
|
|
1935
|
+
str(round(value, 3)) if isinstance(value, Number) else str(value)
|
|
1936
|
+
)
|
|
1937
|
+
|
|
1938
|
+
attr_str = " ".join(f"{attr}={val}" for attr, val in attr_values.items())
|
|
1939
|
+
explain_data = f"{padding} [{attr_str}]"
|
|
1940
|
+
return explain_data
|
|
1941
|
+
|
|
1942
|
+
|
|
1943
|
+
def _explainify(
|
|
1944
|
+
plan: QueryPlan, *, fields: list[str], level: int = _starting_indentation
|
|
1945
|
+
) -> str:
|
|
1946
|
+
"""Handler method to generate the *EXPLAIN* output for the current node and its children."""
|
|
1947
|
+
padding = "" if not level else " " + " " * (level - 1)
|
|
1948
|
+
prefix = f"{padding}-> " if padding else ""
|
|
1949
|
+
|
|
1950
|
+
header = (
|
|
1951
|
+
f"{plan.node_type}({plan.base_table})" if plan.is_scan() else plan.node_type
|
|
1952
|
+
)
|
|
1953
|
+
explain_data = (
|
|
1954
|
+
_custom_explain(plan, fields=fields, padding=padding)
|
|
1955
|
+
if fields
|
|
1956
|
+
else _default_explain(plan, padding=padding)
|
|
1957
|
+
)
|
|
1958
|
+
child_explains = "\n".join(
|
|
1959
|
+
f"{_explainify(child, fields=fields, level=level + 1)}"
|
|
1960
|
+
for child in plan.children
|
|
1961
|
+
)
|
|
1962
|
+
subplan_explains = (
|
|
1963
|
+
_explainify(plan.subplan.root, fields=fields, level=level + 1)
|
|
1964
|
+
if plan.subplan
|
|
1965
|
+
else ""
|
|
1966
|
+
)
|
|
1967
|
+
if subplan_explains:
|
|
1968
|
+
child_explains = f"{child_explains}\n{subplan_explains}"
|
|
1969
|
+
|
|
1970
|
+
if not child_explains:
|
|
1971
|
+
return f"{prefix}{header}\n{explain_data}"
|
|
1972
|
+
return f"{prefix}{header}\n{explain_data}\n{child_explains}"
|
|
1973
|
+
|
|
1974
|
+
|
|
1975
|
+
def _astify(plan: QueryPlan, *, indentation: int = _starting_indentation) -> str:
|
|
1976
|
+
"""Handler method to generate a tree-structure of the query plan."""
|
|
1977
|
+
padding = " " * indentation
|
|
1978
|
+
prefix = f"{padding}-> " if padding else ""
|
|
1979
|
+
if plan.is_scan():
|
|
1980
|
+
item_str = f"{prefix}{plan.node_type}({plan.base_table})"
|
|
1981
|
+
else:
|
|
1982
|
+
item_str = f"{prefix}{plan.node_type}"
|
|
1983
|
+
child_str = "\n".join(
|
|
1984
|
+
_astify(child, indentation=indentation + 2) for child in plan.children
|
|
1985
|
+
)
|
|
1986
|
+
return f"{item_str}\n{child_str}" if child_str else item_str
|