PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. postbound/__init__.py +211 -0
  2. postbound/_base.py +6 -0
  3. postbound/_bench.py +1012 -0
  4. postbound/_core.py +1153 -0
  5. postbound/_hints.py +1373 -0
  6. postbound/_jointree.py +1079 -0
  7. postbound/_pipelines.py +1121 -0
  8. postbound/_qep.py +1986 -0
  9. postbound/_stages.py +876 -0
  10. postbound/_validation.py +734 -0
  11. postbound/db/__init__.py +72 -0
  12. postbound/db/_db.py +2348 -0
  13. postbound/db/_duckdb.py +785 -0
  14. postbound/db/mysql.py +1195 -0
  15. postbound/db/postgres.py +4216 -0
  16. postbound/experiments/__init__.py +12 -0
  17. postbound/experiments/analysis.py +674 -0
  18. postbound/experiments/benchmarking.py +54 -0
  19. postbound/experiments/ceb.py +877 -0
  20. postbound/experiments/interactive.py +105 -0
  21. postbound/experiments/querygen.py +334 -0
  22. postbound/experiments/workloads.py +980 -0
  23. postbound/optimizer/__init__.py +92 -0
  24. postbound/optimizer/__init__.pyi +73 -0
  25. postbound/optimizer/_cardinalities.py +369 -0
  26. postbound/optimizer/_joingraph.py +1150 -0
  27. postbound/optimizer/dynprog.py +1825 -0
  28. postbound/optimizer/enumeration.py +432 -0
  29. postbound/optimizer/native.py +539 -0
  30. postbound/optimizer/noopt.py +54 -0
  31. postbound/optimizer/presets.py +147 -0
  32. postbound/optimizer/randomized.py +650 -0
  33. postbound/optimizer/tonic.py +1479 -0
  34. postbound/optimizer/ues.py +1607 -0
  35. postbound/qal/__init__.py +343 -0
  36. postbound/qal/_qal.py +9678 -0
  37. postbound/qal/formatter.py +1089 -0
  38. postbound/qal/parser.py +2344 -0
  39. postbound/qal/relalg.py +4257 -0
  40. postbound/qal/transform.py +2184 -0
  41. postbound/shortcuts.py +70 -0
  42. postbound/util/__init__.py +46 -0
  43. postbound/util/_errors.py +33 -0
  44. postbound/util/collections.py +490 -0
  45. postbound/util/dataframe.py +71 -0
  46. postbound/util/dicts.py +330 -0
  47. postbound/util/jsonize.py +68 -0
  48. postbound/util/logging.py +106 -0
  49. postbound/util/misc.py +168 -0
  50. postbound/util/networkx.py +401 -0
  51. postbound/util/numbers.py +438 -0
  52. postbound/util/proc.py +107 -0
  53. postbound/util/stats.py +37 -0
  54. postbound/util/system.py +48 -0
  55. postbound/util/typing.py +35 -0
  56. postbound/vis/__init__.py +5 -0
  57. postbound/vis/fdl.py +69 -0
  58. postbound/vis/graphs.py +48 -0
  59. postbound/vis/optimizer.py +538 -0
  60. postbound/vis/plots.py +84 -0
  61. postbound/vis/tonic.py +70 -0
  62. postbound/vis/trees.py +105 -0
  63. postbound-0.19.0.dist-info/METADATA +355 -0
  64. postbound-0.19.0.dist-info/RECORD +67 -0
  65. postbound-0.19.0.dist-info/WHEEL +5 -0
  66. postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
  67. postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/_qep.py ADDED
@@ -0,0 +1,1986 @@
1
+ from __future__ import annotations
2
+
3
+ import collections
4
+ import copy
5
+ import math
6
+ from collections.abc import Callable, Iterable, Iterator, Sequence
7
+ from dataclasses import dataclass
8
+ from numbers import Number
9
+ from typing import Any, Literal, Optional
10
+
11
+ from . import util
12
+ from ._core import (
13
+ Cardinality,
14
+ ColumnReference,
15
+ Cost,
16
+ JoinOperator,
17
+ PhysicalOperator,
18
+ ScanOperator,
19
+ TableReference,
20
+ )
21
+ from .qal._qal import AbstractPredicate, ColumnExpression, SqlExpression
22
+ from .util import StateError, jsondict
23
+
24
+ JoinDirection = Literal["inner", "outer"]
25
+
26
+
27
+ class SortKey:
28
+ """Sort keys describe how the tuples in a relation are sorted.
29
+
30
+ Each sort key contains a set of columns that form the sort key's equivalence class, i.e. within each row the values of these
31
+ columns are all equal to one another. Therefore, the relation can be treated as being sorted by any of them.
32
+
33
+ Most commonly, relations will only be sorted by a single column (which can be checked by calling *len()* on the sort key,
34
+ or by checking the `equivalence_class` directly). In this case, the `column` property can be used to retrieve the
35
+ corresponding expression that forms the column.
36
+
37
+ To check whether two sort keys are equivalent, the `is_compatible_with` method can be used. For more idiomatic access,
38
+ ``column in sort_key`` is also supported.
39
+
40
+ To create a sort key for an entire equivalence class from scratch, the `for_equivalence_class(columns)` method is available.
41
+ To combine two existing sort keys, `merge_with` can be used.
42
+
43
+ Parameters
44
+ ----------
45
+ columns : Iterable[SqlExpression]
46
+ The column(s) used to sort the tuples. This will usually contain plain column references (`ColumnExpression`),
47
+ but can also use more complex expressions.
48
+ ascending : bool
49
+ Whether the sorting is ascending or descending. Defaults to ascending.
50
+ """
51
+
52
+ @staticmethod
53
+ def of(
54
+ column: SqlExpression | ColumnReference, *, ascending: bool = True
55
+ ) -> SortKey:
56
+ """Creates a new sort key for a single column.
57
+
58
+ Parameters
59
+ ----------
60
+ column : SqlExpression | ColumnReference
61
+ The column that is used to sort the tuples. Can be a plain column reference, which will be wrapped by a
62
+ `ColumnExpression` automatically.
63
+ ascending : bool, optional
64
+ Whether the sorting is ascending or descending. Defaults to ascending.
65
+
66
+ Returns
67
+ -------
68
+ SortKey
69
+ The sort key with an equivalence class for the single column.
70
+ """
71
+ if isinstance(column, ColumnReference):
72
+ column = ColumnExpression(column)
73
+ return SortKey([column], ascending=ascending)
74
+
75
+ @staticmethod
76
+ def for_equivalence_class(
77
+ members: Iterable[SqlExpression | ColumnReference], *, ascending: bool = True
78
+ ) -> SortKey:
79
+ """Creates a new sort key for an equivalence class of columns.
80
+
81
+ This is just a more expressive alias for calling the constructor directly. This method assumes that the values for
82
+ all columns in the equivalence class are equal to one another. The client is responsible for ensuring and checking
83
+ that this is actually the case.
84
+
85
+ Parameters
86
+ ----------
87
+ members : Iterable[SqlExpression | ColumnReference]
88
+ The columns that describe the sorting of the relation. This can contain just a single item, in which case the
89
+ method is pretty much the same as `of`. Any passed `ColumnReference` will be wrapped in a `ColumnExpression`.
90
+ ascending : bool, optional
91
+ Whether the sorting is ascending or descending. Defaults to ascending.
92
+
93
+ Returns
94
+ -------
95
+ SortKey
96
+ The sort key with an equivalence class for the columns.
97
+ """
98
+ members = [
99
+ ColumnExpression(mem) if isinstance(mem, ColumnReference) else mem
100
+ for mem in members
101
+ ]
102
+ return SortKey(members, ascending=ascending)
103
+
104
+ def __init__(
105
+ self, columns: Iterable[SqlExpression], *, ascending: bool = True
106
+ ) -> None:
107
+ self._members = frozenset(columns)
108
+ if not self._members:
109
+ raise ValueError("Sort key must contain at least one column")
110
+ self._ascending = ascending
111
+
112
+ __match_args__ = ("equivalence_class", "ascending")
113
+
114
+ @property
115
+ def column(self) -> SqlExpression:
116
+ """For single-column sort keys, get this column."""
117
+ if len(self._members) != 1:
118
+ raise StateError("Sort key is not a single column reference")
119
+ return next(iter(self._members))
120
+
121
+ @property
122
+ def equivalence_class(self) -> frozenset[SqlExpression]:
123
+ """Get all columns that are part of the equivalence class. This will be 1 or more columns."""
124
+ return self._members
125
+
126
+ @property
127
+ def ascending(self) -> bool:
128
+ """Get the sort direction of this key."""
129
+ return self._ascending
130
+
131
+ def is_compatible_with(self, other: SortKey | ColumnReference) -> bool:
132
+ """Checks, whether two keys are sorted the same way.
133
+
134
+ For single column references, this essentially checks whether the column is part of the key's equivalence class.
135
+ """
136
+ if isinstance(other, ColumnReference):
137
+ return other in self._members
138
+
139
+ if self.ascending != other.ascending:
140
+ return False
141
+ return len(self._members & other._members) > 0
142
+
143
+ def merge_with(self, other: SortKey) -> SortKey:
144
+ """Merges the equivalence classes of two sort keys."""
145
+ if self.ascending != other.ascending:
146
+ raise ValueError("Cannot merge sort keys with different sort orders")
147
+ return SortKey(self._members | other._members, ascending=self.ascending)
148
+
149
+ def __json__(self) -> jsondict:
150
+ return {"equivalence_class": self._members, "ascending": self._ascending}
151
+
152
+ def __len__(self) -> int:
153
+ return len(self._members)
154
+
155
+ def __contains__(self, item: object) -> bool:
156
+ return (
157
+ self.is_compatible_with(item)
158
+ if isinstance(item, (ColumnReference, SortKey))
159
+ else False
160
+ )
161
+
162
+ def __eq__(self, other: object) -> bool:
163
+ return (
164
+ isinstance(other, type(self))
165
+ and self._members == other._members
166
+ and self._ascending == other._ascending
167
+ )
168
+
169
+ def __hash__(self) -> int:
170
+ return hash((self._members, self._ascending))
171
+
172
+ def __repr__(self) -> str:
173
+ return str(self)
174
+
175
+ def __str__(self) -> str:
176
+ suffix = "" if self.ascending else " DESC"
177
+ if len(self._members) == 1:
178
+ member = str(self.column)
179
+ else:
180
+ members = ", ".join(str(m) for m in self._members)
181
+ member = f"{{{members}}}"
182
+ return f"{member}{suffix}"
183
+
184
+
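As a quick illustration of the SortKey API above, here is a minimal usage sketch. Only methods defined in this file are exercised; the ColumnReference constructor calls and the table/column names are assumptions (the real signature lives in postbound/_core.py, which is not shown here).

    # Hedged sketch: ColumnReference is assumed to accept a column name plus a table.
    from postbound._core import ColumnReference, TableReference
    from postbound._qep import SortKey

    movies = TableReference.create_virtual("m")         # factory method confirmed by Subplan.tables() below
    title = ColumnReference("title", movies)            # assumed signature
    year = ColumnReference("production_year", movies)   # assumed signature

    by_title = SortKey.of(title)                        # single-column key, ascending by default
    by_year = SortKey.of(year, ascending=False)         # rendered with a trailing " DESC"
    print(len(by_title), by_title.ascending, by_year)

    # Membership checks delegate to is_compatible_with():
    print(title in by_title)

    # If two columns are known to hold equal values row by row, their keys can be merged
    # into a single equivalence class (the caller is responsible for that guarantee):
    alt_title = ColumnReference("original_title", movies)   # assumed signature
    merged = by_title.merge_with(SortKey.of(alt_title))
    print(len(merged.equivalence_class))                    # 2 columns in one equivalence class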
185
+ class PlanParams:
186
+ """Plan parameters contain additional "structural" metadata about the operators in a query plan.
187
+
188
+ This information is mostly concerned with how the operator should function, e.g. which table it should scan, or which index
189
+ to use or how the tuples will be sorted.
190
+
191
+ In addition to the pre-defined attributes, users can attach arbitrary metadata using a dict-like access into the
192
+ parameters, e.g. ``params["custom"] = 42``.
193
+
194
+ Parameters
195
+ ----------
196
+ base_table : Optional[TableReference], optional
197
+ For scan nodes, this is the table being scanned. For all other nodes this should be *None*.
198
+ filter_predicate : Optional[AbstractPredicate], optional
199
+ An arbitrary predicate to restrict the allowed tuples in the output of a relation. This should be mostly used for
200
+ join nodes and scans.
201
+ sort_keys : Optional[Sequence[SortKey]], optional
202
+ How the tuples in the output of a relation are sorted. Absence of a specific sort order can be indicated either
203
+ through an empty list or by setting this parameter to *None*. In this case, tuples are assumed to be in some random
204
+ order.
205
+ parallel_workers : Optional[int], optional
206
+ The number of parallel workers that should be used to execute the operator. The underlying processing model assumes
207
+ that there exists some sort of main operator process which spawns additional worker processes. The worker processes
208
+ will compute the output relation together with the main process. Hence, if some relation should be processed by two
209
+ processes in parallel, the proper value for this parameter would be 1 (the main process and one additional worker).
210
+ It is up to the actual execution engine to decide whether a lower number of workers has to be used.
211
+ index : Optional[str], optional
212
+ The name of the index that should be used to scan the table. This is only relevant for scan nodes and should be
213
+ *None* for all other nodes.
214
+ lookup_key : Optional[SqlExpression], optional
215
+ The expression that is used to lookup tuples in some indexing structure. For scans, this could actually be the physical
216
+ index. For intermediate operators such as hash tables or memoize nodes, this could be the expression that is used to
217
+ build the table or to structure the memo.
218
+ **kwargs
219
+ Additional metadata that should be attached to the plan parameters.
220
+ """
221
+
222
+ def __init__(
223
+ self,
224
+ *,
225
+ base_table: Optional[TableReference] = None,
226
+ filter_predicate: Optional[AbstractPredicate] = None,
227
+ sort_keys: Optional[Sequence[SortKey]] = None,
228
+ parallel_workers: Optional[int] = None,
229
+ index: Optional[str] = None,
230
+ lookup_key: Optional[SqlExpression] = None,
231
+ **kwargs,
232
+ ) -> None:
233
+ self._params: dict[str, Any] = {
234
+ "base_table": base_table,
235
+ "filter_predicate": filter_predicate,
236
+ "sort_keys": tuple(sort_keys) if sort_keys else tuple(),
237
+ "parallel_workers": parallel_workers if parallel_workers else 0,
238
+ "index": index if index else "",
239
+ "lookup_key": lookup_key if lookup_key else None,
240
+ **kwargs,
241
+ }
242
+
243
+ @property
244
+ def base_table(self) -> Optional[TableReference]:
245
+ """Get the base table that is being scanned. For non-scan nodes, this is *None*."""
246
+ return self._params["base_table"]
247
+
248
+ @property
249
+ def filter_predicate(self) -> Optional[AbstractPredicate]:
250
+ """Get the filter predicate that is used to restrict the tuples in the output of a relation.
251
+
252
+ For join nodes this would be the join condition and for scan nodes this would be the filter conditions from the
253
+ WHERE clause. However, if the optimizer decides to delay the evaluation of some filter, or some filters need to be
254
+ evaluated multiple times (e.g. recheck conditions in Postgres), this predicate can be more complex.
255
+ """
256
+ return self._params["filter_predicate"]
257
+
258
+ @property
259
+ def sort_keys(self) -> Sequence[SortKey]:
260
+ """Get the sort keys describing the ordering of tuples in the output relation.
261
+
262
+ Absence of a specific sort order is indicated by an empty sequence.
263
+ """
264
+ return self._params["sort_keys"]
265
+
266
+ @property
267
+ def parallel_workers(self) -> int:
268
+ """Get the number of parallel workers that should be used to execute the operator.
269
+
270
+ The underlying processing model assumes that there exists some sort of main operator process which spawns additional
271
+ worker processes. The worker processes will compute the output relation together with the main process. Hence, if some
272
+ relation should be processed by two processes in parallel, the proper value for this parameter would be 1 (the main
273
+ process and one additional worker).
274
+
275
+ It is up to the actual execution engine to decide whether a lower number of workers has to be used.
276
+
277
+ Absence of parallelism is indicated by 0.
278
+ """
279
+ return self._params["parallel_workers"]
280
+
281
+ @property
282
+ def index(self) -> str:
283
+ """Get the name of the index that should be used to scan the table.
284
+
285
+ Absence of an index is indicated by an empty string.
286
+ """
287
+ return self._params["index"]
288
+
289
+ @property
290
+ def lookup_key(self) -> Optional[SqlExpression]:
291
+ """Get the expression that is used to lookup tuples in some indexing structure.
292
+
293
+ For scans, this could actually be the physical index. In this case, the lookup expression should be the one that is
294
+ used to build the index, e.g., the primary key column. For intermediate operators such as hash tables or memoize nodes,
295
+ this could be the expression that is used to build the table or to structure the memo.
296
+ """
297
+ return self._params["lookup_key"]
298
+
299
+ def tables(self) -> set[TableReference]:
300
+ """Provide all tables that are referenced at some point in the plan parameters.
301
+
302
+ This includes only the well-defined properties available to all parameterizations, i.e. `base_table`, `filter_predicate`,
303
+ and `lookup_key`. If users decide to store additional metadata with table information in the parameters, these are
304
+ not retained here.
305
+
306
+ Returns
307
+ -------
308
+ set[TableReference]
309
+ The tables
310
+ """
311
+ tables = set()
312
+ if self.base_table:
313
+ tables.add(self.base_table)
314
+ if self.filter_predicate:
315
+ tables |= self.filter_predicate.tables()
316
+ if self.lookup_key:
317
+ tables |= self.lookup_key.tables()
318
+ return tables
319
+
320
+ def columns(self) -> set[ColumnReference]:
321
+ """Provides all columns that are referenced at some point in the plan parameters.
322
+
323
+ This includes only the well-defined properties available to all parameterizations, i.e. just the `filter_predicate`. If
324
+ users decide to store additional metadata with column information in the parameters, these are not retained here.
325
+
326
+ Returns
327
+ -------
328
+ set[ColumnReference]
329
+ The columns
330
+ """
331
+ return self.filter_predicate.columns() if self.filter_predicate else set()
332
+
333
+ def get(self, key: str, default: Any = None) -> Any:
334
+ """Retrieves the value of a specific key from the parameters.
335
+
336
+ This is similar to the *dict.get* method. An important distinction is that we never raise an error if there is no
337
+ parameter with the given key. Instead, we return the default value, which is *None* by default.
338
+
339
+ Parameters
340
+ ----------
341
+ key : str
342
+ The parameter name
343
+ default : Any, optional
344
+ The default value to return if the parameter is not found. Defaults to *None*.
345
+
346
+ Returns
347
+ -------
348
+ Any
349
+ The parameter value if it exists, otherwise the default value.
350
+ """
351
+ value = self._params.get(key, default)
352
+ if isinstance(value, float) and math.isnan(value):
353
+ return default
354
+ return value
355
+
356
+ def items(self) -> Iterable[tuple[str, Any]]:
357
+ """Provides all metadata that is currently stored in the parameters as key-value pairs, similar to *dict.items*"""
358
+ return self._params.items()
359
+
360
+ def clone(self, *, deep: bool = False) -> PlanParams:
361
+ """Creates a copy of the current plan parameters.
362
+
363
+ Parameters
364
+ ----------
365
+ deep : bool, optional
366
+ Whether to create a deep copy of all parameters. Defaults to *False*.
367
+
368
+ Returns
369
+ -------
370
+ PlanParams
371
+ The copied parameters.
372
+ """
373
+ return self.__deepcopy__({}) if deep else self.__copy__()
374
+
375
+ def __json__(self) -> jsondict:
376
+ return self._params
377
+
378
+ def __copy__(self) -> PlanParams:
379
+ return PlanParams(**self._params)
380
+
381
+ def __deepcopy__(self, memo: dict[int, object] = {}) -> PlanParams:
382
+ params = copy.deepcopy(self._params, memo)
383
+ return PlanParams(**params)
384
+
385
+ def __contains__(self, key: object) -> bool:
386
+ params = object.__getattribute__(self, "_params")
387
+ return key in params
388
+
389
+ def __getattribute__(self, name: str) -> Any:
390
+ params = object.__getattribute__(self, "_params")
391
+ if name == "_params":
392
+ return params
393
+ if name in params:
394
+ return params[name]
395
+ return object.__getattribute__(self, name)
396
+
397
+ def __setattr__(self, name, value) -> None:
398
+ if name == "_params":
399
+ return object.__setattr__(self, name, value)
400
+ params = object.__getattribute__(self, "_params")
401
+ params[name] = value
402
+
403
+ def __getitem__(self, key: str) -> Any:
404
+ return self._params[key]
405
+
406
+ def __setitem__(self, key: str, value: Any) -> None:
407
+ self._params[key] = value
408
+
409
+ def __repr__(self) -> str:
410
+ return str(self)
411
+
412
+ def __str__(self) -> str:
413
+ return str(self._params)
414
+
415
+
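Because every argument of PlanParams is optional, the dict-like access described above can be sketched without any other classes. The "title_idx" index name and the "custom" key are made-up examples:

    from postbound._qep import PlanParams

    params = PlanParams(parallel_workers=2, index="title_idx")
    params["custom"] = 42                   # arbitrary user metadata next to the well-known keys
    print(params.parallel_workers)          # 2 -- attribute access is routed through the parameter dict
    print(params.get("custom"))             # 42
    print(params.get("missing", "n/a"))     # "n/a" -- get() never raises for unknown keys
    print("custom" in params)               # True
    replica = params.clone(deep=True)       # independent deep copy of all parameters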
416
+ class PlanEstimates:
417
+ """Plan estimates provide the optimizer's view on a specific (sub-)plan.
418
+
419
+ This includes the estimated cardinality and cost of the plan. The cardinality is the number of tuples that are expected to
420
+ be produced by the operator, while the cost is a measure of the resources that are consumed by the operator.
421
+ Costs do not have a specific unit and it is the user's obligation to ensure that they are used in a sound way. Most
422
+ importantly, this means that only costs from the same source should be compared since most database systems interpret costs
423
+ in a different way.
424
+
425
+ In addition to the pre-defined attributes, users can attach arbitrary metadata using a dict-like access into the
426
+ parameters, e.g. ``estimates["custom"] = 42``.
427
+
428
+ Parameters
429
+ ----------
430
+ cardinality : Cardinality, optional
431
+ The estimated number of tuples that are produced by the operator. If no estimate is available, *NaN* can be used.
432
+ cost : Cost, optional
433
+ The approximate amount of abstract "work" that needs to be done to compute the result set of the operator. If no
434
+ estimate is available, *NaN* can be used.
435
+ **kwargs
436
+ Additional metadata that should be attached to the plan estimates.
437
+
438
+ Notes
439
+ -----
440
+ In case of parallel execution, all measures should be thought of as "meaningful totals", i.e. the cardinality
441
+ numbers are the total number of tuples produced by all workers. The execution time should denote the wall time it
442
+ took to execute the entire operator (which just happened to include parallel processing), **not** an average of the
443
+ worker execution time or some other measure.
444
+ """
445
+
446
+ def __init__(
447
+ self,
448
+ *,
449
+ cardinality: Cardinality = Cardinality.unknown(),
450
+ cost: Cost = math.nan,
451
+ **kwargs,
452
+ ) -> None:
453
+ cardinality = (
454
+ cardinality
455
+ if isinstance(cardinality, Cardinality)
456
+ else Cardinality(cardinality)
457
+ )
458
+ self._params = {"cardinality": cardinality, "cost": cost, **kwargs}
459
+
460
+ @property
461
+ def cardinality(self) -> Cardinality:
462
+ """Get the estimated cardinality of the operator. Can be *NaN* if no estimate is available."""
463
+ return self._params["cardinality"]
464
+
465
+ @property
466
+ def cost(self) -> Cost:
467
+ """Get the estimated cost of the operator. Can be *NaN* if no estimate is available."""
468
+ return self._params["cost"]
469
+
470
+ def get(self, key: str, default: Any = None) -> Any:
471
+ """Retrieves the value of a specific key from the estimates.
472
+
473
+ This is similar to the *dict.get* method. An important distinction is that we never raise an error if there is no
474
+ parameter with the given key. Instead, we return the default value, which is *None* by default.
475
+
476
+ Parameters
477
+ ----------
478
+ key : str
479
+ The parameter name
480
+ default : Any, optional
481
+ The default value to return if the parameter is not found. Defaults to *None*.
482
+
483
+ Returns
484
+ -------
485
+ Any
486
+ The parameter value if it exists, otherwise the default value.
487
+ """
488
+ value = self._params.get(key, default)
489
+ if isinstance(value, float) and math.isnan(value):
490
+ return default
491
+ return value
492
+
493
+ def items(self) -> Iterable[tuple[str, Any]]:
494
+ """Provides all estimates as key-value pairs, similar to the *dict.items* method."""
495
+ return self._params.items()
496
+
497
+ def clone(self, *, deep: bool = False) -> PlanEstimates:
498
+ """Creates a copy of the current plan estimates.
499
+
500
+ Parameters
501
+ ----------
502
+ deep : bool, optional
503
+ Whether to create a deep copy of all estimates. Defaults to *False*.
504
+
505
+ Returns
506
+ -------
507
+ PlanEstimates
508
+ The copied estimates.
509
+ """
510
+ return self.__deepcopy__({}) if deep else self.__copy__()
511
+
512
+ def __json__(self) -> jsondict:
513
+ return self._params
514
+
515
+ def __copy__(self) -> PlanEstimates:
516
+ return PlanEstimates(**self._params)
517
+
518
+ def __deepcopy__(self, memo: dict[int, object] = {}) -> PlanEstimates:
519
+ params = copy.deepcopy(self._params, memo)
520
+ return PlanEstimates(**params)
521
+
522
+ def __contains__(self, key: object) -> bool:
523
+ params = object.__getattribute__(self, "_params")
524
+ return key in params
525
+
526
+ def __getattribute__(self, name: str) -> Any:
527
+ params = object.__getattribute__(self, "_params")
528
+ if name == "_params":
529
+ return params
530
+ if name in params:
531
+ return params[name]
532
+ return object.__getattribute__(self, name)
533
+
534
+ def __setattr__(self, name, value) -> None:
535
+ if name == "_params":
536
+ return object.__setattr__(self, name, value)
537
+ params = object.__getattribute__(self, "_params")
538
+ params[name] = value
539
+
540
+ def __getitem__(self, key: str) -> Any:
541
+ return self._params[key]
542
+
543
+ def __setitem__(self, key: str, value: Any) -> None:
544
+ self._params[key] = value
545
+
546
+ def __repr__(self) -> str:
547
+ return str(self)
548
+
549
+ def __str__(self) -> str:
550
+ return str(self._params)
551
+
552
+
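A corresponding sketch for PlanEstimates. The Cardinality constructor is fed a plain number, mirroring the wrapping performed in __init__ above; the "selectivity" key is just an example of custom metadata:

    from postbound._core import Cardinality
    from postbound._qep import PlanEstimates

    estimates = PlanEstimates(cardinality=Cardinality(1200), cost=345.7)
    print(estimates.cardinality, estimates.cost)
    estimates["selectivity"] = 0.05             # custom metadata travels alongside the standard estimates
    print(estimates.get("selectivity"))         # 0.05
    print(estimates.get("rows_removed", 0))     # 0 -- unknown keys fall back to the default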
553
+ class PlanMeasures:
554
+ """Plan measures provide actual execution statistics of a specific (sub-)plan.
555
+
556
+ Typically, this includes the actual cardinality of the result set as well as the execution time of the operator.
557
+ Additionally, information about cache hits and misses for the shared buffer can be provided.
558
+
559
+ Other than the pre-defined attributes, users can attach arbitrary metadata using a dict-like access into the parameters,
560
+ e.g. ``measures["custom"] = 42``.
561
+
562
+ Parameters
563
+ ----------
564
+ cardinality : Cardinality, optional
565
+ The actual number of tuples that are produced by the operator. If no measurement is available, *NaN* can be used.
566
+ execution_time : float, optional
567
+ The total time (in seconds) that was spent to compute the result set of the operator. If no measurement is available,
568
+ *NaN* can be used.
569
+ cache_hits : Optional[int], optional
570
+ The number of page reads that were satisfied by the shared buffer. If no measurement is available, *None* can be
571
+ used.
572
+ cache_misses : Optional[int], optional
573
+ The number of page reads that had to be delegated to the disk and could not be satisfied by the shared buffer. If no
574
+ measurement is available, *None* can be used.
575
+ **kwargs
576
+ Additional metadata that should be attached to the plan measures.
577
+
578
+ Notes
579
+ -----
580
+ In case of parallel execution, all measures should be thought of as "meaningful totals", i.e. the cardinality
581
+ numbers are the total number of tuples produced by all workers. The execution time should denote the wall time it
582
+ took to execute the entire operator (which just happened to include parallel processing), **not** an average of the
583
+ worker execution time or some other measure.
584
+ """
585
+
586
+ def __init__(
587
+ self,
588
+ *,
589
+ cardinality: Cardinality = Cardinality.unknown(),
590
+ execution_time: float = math.nan,
591
+ cache_hits: Optional[int] = None,
592
+ cache_misses: Optional[int] = None,
593
+ **kwargs,
594
+ ) -> None:
595
+ cardinality = (
596
+ cardinality
597
+ if isinstance(cardinality, Cardinality)
598
+ else Cardinality(cardinality)
599
+ )
600
+ self._params = {
601
+ "cardinality": cardinality,
602
+ "execution_time": execution_time,
603
+ "cache_hits": cache_hits,
604
+ "cache_misses": cache_misses,
605
+ **kwargs,
606
+ }
607
+
608
+ @property
609
+ def cardinality(self) -> Cardinality:
610
+ """Get the actual cardinality of the operator. Can be *NaN* if no measurement is available."""
611
+ return self._params["cardinality"]
612
+
613
+ @property
614
+ def execution_time(self) -> float:
615
+ """Get the actual execution time of the operator. Can be *NaN* if no measurement is available."""
616
+ return self._params["execution_time"]
617
+
618
+ @property
619
+ def cache_hits(self) -> Optional[int]:
620
+ """Get the number of page reads that were satisfied by the shared buffer.
621
+
622
+ If no measurement is available, *None* is returned.
623
+ """
624
+ return self._params["cache_hits"]
625
+
626
+ @property
627
+ def cache_misses(self) -> Optional[int]:
628
+ """Get the number of page reads that had to be delegated to the disk and could not be satisfied by the shared buffer.
629
+
630
+ If no measurement is available, *None* is returned.
631
+ """
632
+ return self._params["cache_misses"]
633
+
634
+ def get(self, key: str, default: Any = None) -> Any:
635
+ """Retrieves the value of a specific key from the measures.
636
+
637
+ This is similar to the *dict.get* method. An important distinction is that we never raise an error if there is no
638
+ parameter with the given key. Instead, we return the default value, which is *None* by default.
639
+
640
+ Parameters
641
+ ----------
642
+ key : str
643
+ The parameter name
644
+ default : Any, optional
645
+ The default value to return if the parameter is not found. Defaults to *None*.
646
+
647
+ Returns
648
+ -------
649
+ Any
650
+ The parameter value if it exists, otherwise the default value.
651
+ """
652
+ value = self._params.get(key, default)
653
+ if isinstance(value, float) and math.isnan(value):
654
+ return default
655
+ return value
656
+
657
+ def items(self) -> Iterable[tuple[str, Any]]:
658
+ """Provides all measures as key-value pairs, similar to the *dict.items* method."""
659
+ return self._params.items()
660
+
661
+ def clone(self, *, deep: bool = False) -> PlanMeasures:
662
+ """Creates a copy of the current plan measures.
663
+
664
+ Parameters
665
+ ----------
666
+ deep : bool, optional
667
+ Whether to create a deep copy of all measures. Defaults to *False*.
668
+
669
+ Returns
670
+ -------
671
+ PlanMeasures
672
+ The copied measures.
673
+ """
674
+ return self.__deepcopy__({}) if deep else self.__copy__()
675
+
676
+ def __json__(self) -> jsondict:
677
+ return self._params
678
+
679
+ def __copy__(self) -> PlanMeasures:
680
+ return PlanMeasures(**self._params)
681
+
682
+ def __deepcopy__(self, memo: dict[int, object] = {}) -> PlanMeasures:
683
+ params = copy.deepcopy(self._params, memo)
684
+ return PlanMeasures(**params)
685
+
686
+ def __contains__(self, key: object) -> bool:
687
+ params = object.__getattribute__(self, "_params")
688
+ return key in params
689
+
690
+ def __getattribute__(self, name: str) -> Any:
691
+ params = object.__getattribute__(self, "_params")
692
+ if name == "_params":
693
+ return params
694
+ if name in params:
695
+ return params[name]
696
+ return object.__getattribute__(self, name)
697
+
698
+ def __setattr__(self, name, value) -> None:
699
+ if name == "_params":
700
+ return object.__setattr__(self, name, value)
701
+ params = object.__getattribute__(self, "_params")
702
+ params[name] = value
703
+
704
+ def __getitem__(self, key: str) -> Any:
705
+ return self._params[key]
706
+
707
+ def __setitem__(self, key: str, value: Any) -> None:
708
+ self._params[key] = value
709
+
710
+ def __bool__(self) -> bool:
711
+ return any(
712
+ not math.isnan(v) if isinstance(v, Number) else (v is not None)
713
+ for v in self._params.values()
714
+ )
715
+
716
+ def __repr__(self) -> str:
717
+ return str(self)
718
+
719
+ def __str__(self) -> str:
720
+ return str(self._params)
721
+
722
+
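Analogously for PlanMeasures. The truthiness check at the end is what QueryPlan.is_analyze() relies on further below: an object without any recorded measurement is expected to be falsy. All numbers are invented for illustration:

    from postbound._core import Cardinality
    from postbound._qep import PlanMeasures

    measured = PlanMeasures(cardinality=Cardinality(980), execution_time=0.42,
                            cache_hits=1024, cache_misses=17)
    print(measured.execution_time, measured.cache_hits)   # 0.42 1024
    print(bool(measured))                                 # True -- at least one measurement is present
    print(bool(PlanMeasures()))                           # expected to be False: nothing was measured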
723
+ @dataclass(frozen=True)
724
+ class Subplan:
725
+ """Subplans are used to model subqueries whose results are used while processing another operator in the main query.
726
+
727
+ Typical examples are correlated/dependent subqueries that are used in some predicate and need to be evaluated for each
728
+ tuple of the outer relation (unless some algebraic optimization has been applied beforehand).
729
+
730
+ Attributes
731
+ ----------
732
+ root : QueryPlan
733
+ The root operator of the subplan
734
+ target_name : str
735
+ The name of the target table that the subplan should produce
736
+ """
737
+
738
+ root: QueryPlan
739
+ target_name: str = ""
740
+
741
+ def tables(self) -> set[TableReference]:
742
+ """Provide all tables that are referenced at some point in the subplan.
743
+
744
+ Returns
745
+ -------
746
+ set[TableReference]
747
+ The tables. This set includes the target table that the subplan produces as a virtual table.
748
+ """
749
+ if not self.target_name:
750
+ return self.root.tables()
751
+ target_table = TableReference.create_virtual(self.target_name)
752
+ return self.root.tables() | {target_table}
753
+
754
+ def clone(self, *, deep: bool = False) -> Subplan:
755
+ """Creates a copy of the current subplan.
756
+
757
+ Parameters
758
+ ----------
759
+ deep : bool, optional
760
+ Whether to create a deep copy of all contained plans. Defaults to *False*.
761
+
762
+ Returns
763
+ -------
764
+ Subplan
765
+ The copied subplan.
766
+ """
767
+ return self.__deepcopy__({}) if deep else self.__copy__()
768
+
769
+ def __json__(self) -> jsondict:
770
+ return {"root": self.root, "target_name": self.target_name}
771
+
772
+ def __copy__(self) -> Subplan:
773
+ return Subplan(self.root.clone(deep=False), self.target_name)
774
+
775
+ def __deepcopy__(self, memo: dict[int, object] = {}) -> Subplan:
776
+ return Subplan(self.root.clone(deep=True), self.target_name)
777
+
778
+
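To show how Subplan ties into the QueryPlan class defined next, here is a small sketch that wraps a single scan node. Only constructors visible in this file are used; the node type string, the table name "mc", and the target name "company_ids" are made up, and create_virtual() stands in for a regular table reference:

    from postbound._core import Cardinality, TableReference
    from postbound._qep import QueryPlan, Subplan

    scan = QueryPlan(
        "Seq Scan",                                       # node_type as a plain string, no PhysicalOperator attached
        base_table=TableReference.create_virtual("mc"),   # create_virtual() is the factory used by Subplan.tables()
        estimated_cardinality=Cardinality(50_000),
    )
    sub = Subplan(root=scan, target_name="company_ids")
    print(sub.tables())   # the scanned table plus the virtual "company_ids" target table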
779
+ class QueryPlan:
780
+ """Models the structure of a query execution plan (QEP).
781
+
782
+ Query plans are constructed as a tree of operators. Each operator represents an entire query plan by itself. Hence, we
783
+ use *QueryPlan* objects to refer to the individual nodes of this hierarchical structure. Each node has a potentially large amount of
784
+ metadata attached to it, e.g. regarding the table being scanned for scan nodes, the estimated cost of the operator or the
785
+ actual cardinality of the result set. The different types of metadata are structured into three separate classes:
786
+
787
+ - `PlanParams` contain all structural metadata about the operator, e.g. the table being scanned or the filter predicate.
788
+ - `PlanEstimates` contain the optimizer's view on the operator, e.g. the estimated cardinality and cost.
789
+ - `PlanMeasures` contain the actual execution statistics of the operator, e.g. the actual cardinality and execution time.
790
+
791
+ Users are free to attach additional metadata to each of the containers to support their specific use cases. However, these
792
+ additional fields are typically not considered by the standard methods available on query plans. For example, if users
793
+ store additional tables in the node, these are not considered in the `tables` method.
794
+
795
+ Each query plan can contain an arbitrary number of child nodes. This is true even for scans, to accommodate bitmap scans
796
+ that combine an arbitrary number of index lookups with a final scan. If just a single child is present, it can be accessed more
797
+ expressively using the `input_node` property.
798
+
799
+ PostBOUND uses QEPs in two different ways: first, they can be used as the output of the optimization process (i.e. the
800
+ optimization pipelines), being constructed by the different optimization stages. Second, they can also be extracted from
801
+ an actual database system to encode the QEP that this system used to execute a specific query. This dichotomy leads to
802
+ different granularities of query plans: actual database systems often have much more detailed QEPs. For example, Postgres
803
+ represents a hash join as a hash join operator, whose inner child is a hash operator that constructs the hash table.
804
+ The optimizer stages will typically not worry about such fine-grained details and simply demand a join to be executed as
805
+ a hash join. To mitigate these issues, the query plans can be normalized by using the `canonical` method. This method
806
+ removes all unnecessary details and only retains the join and scan operators.
807
+
808
+ When constructing a query plan, the metadata can be provided in two ways: either as instances of the corresponding metadata
809
+ objects, or explicitly as keyword arguments to enable more convenient usage. Notice, however, that these two ways cannot
810
+ be mixed: either all metadata of a specific type is provided as wrapper instance, or all metadata is provided as keyword
811
+ arguments. Mixing is only allowed across different metadata types, e.g. providing the estimates as a `PlanEstimates` object
812
+ and the measurements as keyword arguments.
813
+
814
+ In addition to the pre-defined metadata types, you can also add additional metadata as part of the *kwargs*. These will
815
+ be added to the plan parameters (using the same mixing rules as the pre-defined types).
816
+ Each query plan provides dict-like access to the plan parameters, estimates and measures, e.g. ``plan["custom"] = 42``,
817
+ ``plan.get("custom", default)``, or ``"custom" in plan``.
818
+
819
+ Query plans provide rather extensive support methods to check their shape (e.g. `is_linear()` or `is_bushy()`), to aid with
820
+ traversal (e.g. `find_first_node()` or `find_all_nodes()`) or to extract specific information (e.g. `tables()` or
821
+ `qerror()`).
822
+
823
+ To convert between different optimization artifacts, a number of methods are available. For example, `to_query_plan` can
824
+ be used to construct a query plan from a join order and a set of operators. Likewise, `explode_query_plan` converts the
825
+ query plan back into join order, operators and parameters.
826
+
827
+ Query plans support *len()* (providing the plan depth without subplans) and *iter()* (providing all contained nodes
828
+ including subplans).
829
+
830
+ Parameters
831
+ ----------
832
+ node_type : str | PhysicalOperator
833
+ The name of the operator. If this is supplied as a physical operator, the name is inferred from it.
834
+ operator : Optional[PhysicalOperator], optional
835
+ The actual operator that is used to compute the result set. This can be empty if there is no specific operator
836
+ corresponding to the current node (e.g. for transient hash tables).
837
+ children : Optional[QueryPlan | Iterable[QueryPlan]], optional
838
+ The input nodes of the current operator. For nodes without an input (e.g. most scans), this can simply be *None* or
839
+ an empty list. Nodes with exactly one input node (e.g. most aggregations) can supply their input either directly as
840
+ a plan object, or as a singleton list. Nodes with two input nodes (e.g. joins) should supply them as an ordered
841
+ iterable with the outer child first.
842
+ plan_params : Optional[PlanParams], optional
843
+ Structural metadata (e.g. parallel workers or accessed indexes) of the operator. If this is provided, no other
844
+ plan parameters can be supplied as keyword arguments, including kwargs.
845
+ subplan : Optional[Subplan], optional
846
+ A subquery that has to be executed as part of this node. If this is provided, no other subplan components can be
847
+ supplied as keyword arguments.
848
+ estimates : Optional[PlanEstimates], optional
849
+ The optimizer's view on the operator (e.g. estimated cardinality and cost). If this is provided, no other estimates
850
+ can be supplied as keyword arguments.
851
+ measures : Optional[PlanMeasures], optional
852
+ The actual execution statistics of the operator (e.g. actual cardinality and execution time). If this is provided, no
853
+ other measures can be supplied as keyword arguments.
854
+ base_table : Optional[TableReference], optional
855
+ The table that is being scanned. This is only relevant for scan nodes and should be *None* for all other nodes.
856
+ If this argument is used, no other plan parameters can be supplied in the `plan_params` argument.
857
+ filter_predicate : Optional[AbstractPredicate], optional
858
+ An arbitrary predicate to restrict the allowed tuples in the output of a relation. This should be mostly used for
859
+ join nodes and scans. If this argument is used, no other plan parameters can be supplied in the `plan_params` argument.
860
+ parallel_workers : Optional[int], optional
861
+ The number of parallel workers that should be used to execute the operator. If this argument is used, no other plan
862
+ parameters can be supplied in the `plan_params` argument.
863
+ index : Optional[str], optional
864
+ The name of the index that should be used to scan the table. This is mostly relevant for scan nodes and should be
865
+ *None* for all other nodes. If this argument is used, no other plan parameters can be supplied in the `plan_params`
866
+ argument.
867
+ lookup_key : Optional[SqlExpression], optional
868
+ The expression that is used to lookup tuples in some indexing structure. For scans, this could actually be the
869
+ physical index. For intermediate operators such as hash tables or memoize nodes, this could be the expression that is
870
+ used to build the table or to structure the memo. If this argument is used, no other plan parameters can be supplied
871
+ in the `plan_params` argument.
872
+ sort_keys : Optional[Sequence[SortKey]], optional
873
+ How the tuples in the output of a relation are sorted. Absence of a specific sort order can be indicated either
874
+ through an empty list or by setting this parameter to *None*. In this case, tuples are assumed to be in some random
875
+ order. If this argument is used, no other plan parameters can be supplied in the `plan_params` argument.
876
+ estimated_cardinality : Cardinality, optional
877
+ The estimated number of tuples that are produced by the operator. If no estimate is available, *NaN* can be used.
878
+ If this argument is used, no other estimates can be supplied in the `estimates` argument.
879
+ estimated_cost : Cost, optional
880
+ The approximate amount of abstract "work" that needs to be done to compute the result set of the operator. If no
881
+ estimate is available, *NaN* can be used. If this argument is used, no other estimates can be supplied in the
882
+ `estimates` argument.
883
+ actual_cardinality : Cardinality, optional
884
+ The actual number of tuples that are produced by the operator. If no measurement is available, *NaN* can be used.
885
+ If this argument is used, no other measures can be supplied in the `measures` argument.
886
+ execution_time : float, optional
887
+ The total time (in seconds) that was spent to compute the result set of the operator. If no measurement is available,
888
+ *NaN* can be used. If this argument is used, no other measures can be supplied in the `measures` argument.
889
+ cache_hits : Optional[int], optional
890
+ The number of page reads that were satisfied by the shared buffer. If no measurement is available, *None* can be
891
+ used. If this argument is used, no other measures can be supplied in the `measures` argument.
892
+ cache_misses : Optional[int], optional
893
+ The number of page reads that had to be delegated to the disk and could not be satisfied by the shared buffer. If no
894
+ measurement is available, *None* can be used. If this argument is used, no other measures can be supplied in the
895
+ `measures` argument.
896
+ subplan_root : Optional[QueryPlan], optional
897
+ The root operator of the subplan. If this argument is used, no other subplan components can be supplied in the
898
+ `subplan` argument.
899
+ subplan_target_name : str, optional
900
+ The name of the target table that the subplan should produce. If this argument is used, no other subplan components
901
+ can be supplied in the `subplan` argument.
902
+ **kwargs
903
+ Additional metadata that should be attached to the plan parameters. If this is used, no other plan parameters can be
904
+ supplied in the `plan_params` argument.
905
+
906
+ See Also
907
+ --------
908
+ to_query_plan
909
+ explode_query_plan
910
+ OptimizerInterface.query_plan
911
+ OptimizationPipeline.query_execution_plan
912
+ """
913
+
914
+ def __init__(
915
+ self,
916
+ node_type: str | PhysicalOperator,
917
+ *,
918
+ operator: Optional[PhysicalOperator] = None,
919
+ children: Optional[QueryPlan | Iterable[QueryPlan]] = None,
920
+ plan_params: Optional[PlanParams] = None,
921
+ subplan: Optional[Subplan] = None,
922
+ estimates: Optional[PlanEstimates] = None,
923
+ measures: Optional[PlanMeasures] = None,
924
+ base_table: Optional[TableReference] = None,
925
+ filter_predicate: Optional[AbstractPredicate] = None,
926
+ parallel_workers: Optional[int] = None,
927
+ index: Optional[str] = None,
928
+ sort_keys: Optional[Sequence[SortKey]] = None,
929
+ lookup_key: Optional[SqlExpression] = None,
930
+ estimated_cardinality: Cardinality = Cardinality.unknown(),
931
+ estimated_cost: Cost = math.nan,
932
+ actual_cardinality: Cardinality = Cardinality.unknown(),
933
+ execution_time: float = math.nan,
934
+ cache_hits: Optional[int] = None,
935
+ cache_misses: Optional[int] = None,
936
+ subplan_root: Optional[QueryPlan] = None,
937
+ subplan_target_name: str = "",
938
+ **kwargs,
939
+ ) -> None:
940
+ if not node_type:
941
+ raise ValueError("Node type must be provided")
942
+
943
+ custom_params = (
944
+ base_table,
945
+ filter_predicate,
946
+ parallel_workers,
947
+ index,
948
+ sort_keys,
949
+ lookup_key,
950
+ )
951
+ has_custom_params = any(v is not None for v in custom_params) or bool(kwargs)
952
+ if plan_params is not None and has_custom_params:
953
+ raise ValueError(
954
+ "PlanParams and individual parameters/kwargs cannot be provided at the same time"
955
+ )
956
+ if plan_params is None:
957
+ plan_params = PlanParams(
958
+ base_table=base_table,
959
+ filter_predicate=filter_predicate,
960
+ sort_keys=sort_keys,
961
+ lookup_key=lookup_key,
962
+ parallel_workers=parallel_workers,
963
+ index=index,
964
+ **kwargs,
965
+ )
966
+
967
+ if estimates is not None and any(
968
+ not math.isnan(v) for v in (estimated_cardinality, estimated_cost)
969
+ ):
970
+ raise ValueError(
971
+ "PlanEstimates and individual estimates cannot be provided at the same time"
972
+ )
973
+ if estimates is None:
974
+ estimates = PlanEstimates(
975
+ cardinality=estimated_cardinality, cost=estimated_cost
976
+ )
977
+
978
+ has_custom_measures = any(
979
+ v is not None and not math.isnan(v)
980
+ for v in (actual_cardinality, execution_time, cache_hits, cache_misses)
981
+ )
982
+ if measures is not None and has_custom_measures:
983
+ raise ValueError(
984
+ "PlanMeasures and individual measures cannot be provided at the same time"
985
+ )
986
+ if measures is None:
987
+ measures = PlanMeasures(
988
+ execution_time=execution_time,
989
+ cardinality=actual_cardinality,
990
+ cache_hits=cache_hits,
991
+ cache_misses=cache_misses,
992
+ )
993
+
994
+ if subplan is not None and (subplan_root is not None or subplan_target_name):
995
+ raise ValueError(
996
+ "Subplan and individual subplan components cannot be provided at the same time"
997
+ )
998
+ if subplan is None and (subplan_root is not None or subplan_target_name):
999
+ subplan = Subplan(subplan_root, subplan_target_name)
1000
+
1001
+ children = [] if children is None else util.enlist(children)
1002
+
1003
+ if isinstance(node_type, PhysicalOperator):
1004
+ operator = node_type
1005
+ node_type = operator.name
1006
+
1007
+ self._node_type = node_type
1008
+ self._operator = operator
1009
+
1010
+ if len(children) == 1:
1011
+ self._input_node = children[0]
1012
+ else:
1013
+ self._input_node = None
1014
+
1015
+ self._children: tuple[QueryPlan, ...] = tuple(children) if children else ()
1016
+ self._plan_params = plan_params
1017
+ self._estimates = estimates
1018
+ self._measures = measures
1019
+ self._subplan = subplan
1020
+
1021
+ @property
1022
+ def node_type(self) -> str:
1023
+ """Get the name of the operator."""
1024
+ return self._node_type
1025
+
1026
+ @property
1027
+ def operator(self) -> Optional[PhysicalOperator]:
1028
+ """Get the actual operator that is used to compute the result set.
1029
+
1030
+ For transient operators (e.g. hash tables), this can be *None*.
1031
+ """
1032
+ return self._operator
1033
+
1034
+ @property
1035
+ def input_node(self) -> Optional[QueryPlan]:
1036
+ """Get the input node of the current operator.
1037
+
1038
+ For nodes without an input (e.g. most scans), or nodes with multiple inputs (e.g. joins), this is *None*.
1039
+ """
1040
+ return self._input_node
1041
+
1042
+ @property
1043
+ def children(self) -> Sequence[QueryPlan]:
1044
+ """Get the input nodes of the current operator.
1045
+
1046
+ For nodes without an input (e.g. most scans), this is an empty list. For nodes with exactly one input (e.g. most
1047
+ aggregations), this is a singleton list. For nodes with two input nodes (e.g. joins), this is an ordered iterable
1048
+ with the outer child first.
1049
+ """
1050
+ return self._children
1051
+
1052
+ @property
1053
+ def outer_child(self) -> Optional[QueryPlan]:
1054
+ """Get the outer input of the current operator.
1055
+
1056
+ For nodes that do not have exactly two inputs, this is *None*.
1057
+ """
1058
+ if len(self._children) == 2:
1059
+ return self._children[0]
1060
+ return None
1061
+
1062
+ @property
1063
+ def inner_child(self) -> Optional[QueryPlan]:
1064
+ """Get the inner input of the current operator.
1065
+
1066
+ For nodes that do not have exactly two inputs, this is *None*.
1067
+ """
1068
+ if len(self._children) == 2:
1069
+ return self._children[1]
1070
+ return None
1071
+
1072
+ @property
1073
+ def params(self) -> PlanParams:
1074
+ """Get the structural metadata of the operator."""
1075
+ return self._plan_params
1076
+
1077
+ @property
1078
+ def base_table(self) -> Optional[TableReference]:
1079
+ """Get the table that is being scanned. For non-scan nodes, this will probably is *None*.
1080
+
1081
+ This is just a shorthand for accessing the plan parameters manually.
1082
+
1083
+ See Also
1084
+ --------
1085
+ PlanParams.base_table
1086
+ """
1087
+ return self._plan_params.base_table
1088
+
1089
+ @property
1090
+ def filter_predicate(self) -> Optional[AbstractPredicate]:
1091
+ """Get the filter predicate that is used to restrict the tuples in the output of a relation.
1092
+
1093
+ This is just a shorthand for accessing the plan parameters manually.
1094
+
1095
+ See Also
1096
+ --------
1097
+ PlanParams.filter_predicate
1098
+ """
1099
+ return self._plan_params.filter_predicate
1100
+
1101
+ @property
1102
+ def sort_keys(self) -> Sequence[SortKey]:
1103
+ """Get the sort keys describing the ordering of tuples in the output relation.
1104
+
1105
+ Absence of a specific sort order is indicated by an empty sequence.
1106
+
1107
+ This is just a shorthand for accessing the plan parameters manually.
1108
+
1109
+ See Also
1110
+ --------
1111
+ PlanParams.sort_keys
1112
+ """
1113
+ return self._plan_params.sort_keys
1114
+
1115
+ @property
1116
+ def lookup_key(self) -> Optional[SqlExpression]:
1117
+ """Get the expression that is used to lookup tuples in some indexing structure.
1118
+
1119
+ This is just a shorthand for accessing the plan parameters manually.
1120
+
1121
+ See Also
1122
+ --------
1123
+ PlanParams.lookup_key
1124
+ """
1125
+ return self._plan_params.lookup_key
1126
+
1127
+ @property
1128
+ def parallel_workers(self) -> int:
1129
+ """Get the number of parallel workers that should be used to execute the operator.
1130
+
1131
+ Absence of parallel execution is indicated by 0.
1132
+
1133
+ This is just a shorthand for accessing the plan parameters manually.
1134
+
1135
+ See Also
1136
+ --------
1137
+ PlanParams.parallel_workers
1138
+ """
1139
+ return self._plan_params.parallel_workers
1140
+
1141
+ @property
1142
+ def estimates(self) -> PlanEstimates:
1143
+ """Get the optimizer's view on the operator."""
1144
+ return self._estimates
1145
+
1146
+ @property
1147
+ def estimated_cardinality(self) -> Cardinality:
1148
+ """Get the cardinality estimate of the optimizer.
1149
+
1150
+ This is just a shorthand for accessing the estimates manually.
1151
+
1152
+ See Also
1153
+ --------
1154
+ PlanEstimates.cardinality
1155
+ """
1156
+ return self._estimates.cardinality
1157
+
1158
+ @property
1159
+ def estimated_cost(self) -> Cost:
1160
+ """Get the cost estimate of the optimizer.
1161
+
1162
+ This is just a shorthand for accessing the estimates manually.
1163
+
1164
+ See Also
1165
+ --------
1166
+ PlanEstimates.cost
1167
+ """
1168
+ return self._estimates.cost
1169
+
1170
+ @property
1171
+ def measures(self) -> PlanMeasures:
1172
+ """Get the actual execution statistics of the operator."""
1173
+ return self._measures
1174
+
1175
+ @property
1176
+ def actual_cardinality(self) -> Cardinality:
1177
+ """Get the actual cardinality of the operator.
1178
+
1179
+ This is just a shorthand for accessing the measures manually.
1180
+
1181
+ See Also
1182
+ --------
1183
+ PlanMeasures.cardinality
1184
+ """
1185
+ return self._measures.cardinality
1186
+
1187
+ @property
1188
+ def execution_time(self) -> float:
1189
+ """Get the actual execution time of the operator.
1190
+
1191
+ This is just a shorthand for accessing the measures manually.
1192
+
1193
+ See Also
1194
+ --------
1195
+ PlanMeasures.execution_time
1196
+ """
1197
+ return self._measures.execution_time
1198
+
1199
+ @property
1200
+ def subplan(self) -> Optional[Subplan]:
1201
+ """Get the subplan that has to be executed as part of this node."""
1202
+ return self._subplan
1203
+
1204
+ def get(self, key: str, default: Any = None) -> Any:
1205
+ """Retrieves a specific parameter from the plan.
1206
+
1207
+ The lookup is performed in the following order:
1208
+
1209
+ 1. Plan parameters
1210
+ 2. Plan estimates
1211
+ 3. Plan measures
1212
+
1213
+ If none of these containers contains the requested key, the default value is returned.
1214
+ """
1215
+ value = self._plan_params.get(key)
1216
+ if value is not None:
1217
+ return value
1218
+ value = self._estimates.get(key)
1219
+ if value is not None:
1220
+ return value
1221
+ value = self._measures.get(key)
1222
+ if value is not None:
1223
+ return value
1224
+ return default
1225
+
1226
+ def is_join(self) -> bool:
1227
+ """Checks, whether the current node is a join operator."""
1228
+ return self._operator is not None and self._operator in JoinOperator
1229
+
1230
+ def is_scan(self) -> bool:
1231
+ """Checks, whether the current node is a scan operator."""
1232
+ return self._operator is not None and self._operator in ScanOperator
1233
+
1234
+ def is_auxiliary(self) -> bool:
1235
+ """Checks, whether the current node is an arbitrary intermediate operator (i.e. not a join nor a scan)."""
1236
+ return not self.is_join() and not self.is_scan()
1237
+
1238
+ def is_analyze(self) -> bool:
1239
+ """Checks, whether the plan was executed in ANALYZE mode, i.e. whether runtime measurements are available."""
1240
+ return bool(self._measures)
1241
+
1242
+ def is_ordered(self) -> bool:
1243
+ """Checks, whether the plan guarantees a specific order of the result tuples."""
1244
+ return bool(self._plan_params.sort_keys)
1245
+
1246
+ def is_linear(self) -> bool:
1247
+ """Checks, whether the plan performs all joins in a linear order.
1248
+
1249
+ This is the case if all join nodes compute their result by joining at least one base table (no matter whether it
1250
+ is the inner or outer child) with another relation (base relation or intermediate).
1251
+
1252
+ As a special case, scan nodes are considered to be linear as well.
1253
+ """
1254
+ if self.is_scan():
1255
+ return True
1256
+ outer_join = self.outer_child.find_first_node(QueryPlan.is_join)
1257
+ inner_join = self.inner_child.find_first_node(QueryPlan.is_join)
1258
+ return outer_join is None or inner_join is None
1259
+
1260
+ def is_bushy(self) -> bool:
1261
+ """Checks, whether the plan performs joins in a bushy order.
1262
+
1263
+ This is the case if at least one join node joins two intermediates that are themselves the result of a join.
1264
+ """
1265
+ if self.is_scan():
1266
+ return False
1267
+ outer_join = self.outer_child.find_first_node(QueryPlan.is_join)
1268
+ inner_join = self.inner_child.find_first_node(QueryPlan.is_join)
1269
+ return outer_join is not None and inner_join is not None
1270
+
1271
+ def is_left_deep(self) -> bool:
1272
+ """Checks, whether the plan performs all joins in a left-deep order.
1273
+
1274
+ Left deep order means that the plan is linear and all joins are performed with the base table as the inner relation.
1275
+ As a special case, scan nodes are considered to be left-deep as well.
1276
+ """
1277
+ if self.is_scan():
1278
+ return True
1279
+ inner_join = self.inner_child.find_first_node(QueryPlan.is_join)
1280
+ return inner_join is None
1281
+
1282
+ def is_right_deep(self) -> bool:
1283
+ """Checks, whether the plan performs all joins in a right-deep order.
1284
+
1285
+ Right deep order means that the plan is linear and all joins are performed with the base table as the outer relation.
1286
+ As a special case, scan nodes are considered to be right-deep as well.
1287
+ """
1288
+ if self.is_scan():
1289
+ return True
1290
+ outer_join = self.outer_child.find_first_node(QueryPlan.is_join)
1291
+ return outer_join is None
1292
+
1293
+ def is_zigzag(self) -> bool:
1294
+ """Checks, whether the plan performs all joins in a zigzag order.
1295
+
1296
+ Zig-zag order means that the plan is linear, but neither left-deep nor right-deep. Therefore, at least one join has
1297
+ to be performed with the base table as the outer relation and another join with the base table as the inner relation.
1298
+ As a special case, scan nodes are considered to be zig-zag as well.
1299
+ """
1300
+ if self.is_scan():
1301
+ return True
1302
+ return self.is_linear() and not self.is_left_deep() and not self.is_right_deep()
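A brief sketch of how these shape predicates can be composed, assuming ``plan`` is an existing ``QueryPlan`` whose root is a join or scan node; the classification order below is an illustrative choice, since a single join of two scans is both left-deep and right-deep:

    def join_shape(plan: QueryPlan) -> str:
        # classify the join order shape using the predicates defined above
        if plan.is_bushy():
            return "bushy"
        if plan.is_left_deep():
            return "left-deep"   # e.g. NestLoop(NestLoop(SeqScan(R), SeqScan(S)), SeqScan(T))
        if plan.is_right_deep():
            return "right-deep"  # e.g. NestLoop(SeqScan(R), NestLoop(SeqScan(S), SeqScan(T)))
        return "zig-zag"         # linear, but mixing inner and outer base tables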
1303
+
1304
+ def is_scan_branch(self) -> bool:
1305
+ """Checks, whether the current node directly leads to a scan node.
1306
+
1307
+ For example, the plan *Hash(SeqScan(R))* is a scan branch, because the input of the hash node is a scan node.
1308
+ Likewise, the plan *Aggregate(Sort(R))* is a scan branch, because the input of the aggregate node is just a sort
1309
+ node which in turn contains a scan node. On the other hand, the plan *NestLoop(SeqScan(R), IdxScan(S))* is not a
1310
+ scan branch, because the nested-loop join contains two input nodes that are both scans.
1311
+
1312
+ If a plan is a scan branch, `fetch_base_table()` can be used to directly retrieve the base table that is being scanned.
1313
+ """
1314
+ return self.is_scan() or self.input_node.is_scan_branch()
1315
+
1316
+ def is_base_join(self) -> bool:
1317
+ """Checks, whether the current node is a join node that joins two base tables.
1318
+
1319
+ The base tables do not need to be direct children of the join, but both children have to be scan branches, as in the
1320
+ case of *MergeJoin(Sort(SeqScan(R)), IdxScan(S))*.
1321
+
1322
+ See Also
1323
+ --------
1324
+ is_scan_branch
1325
+ """
1326
+ if not self.is_join():
1327
+ return False
1328
+ return all(child.is_scan_branch() for child in self.children)
1329
+
1330
+ def plan_depth(self) -> int:
1331
+ """Calculates the depth of the query plan.
1332
+
1333
+ The depth of a query plan is the length of the longest path from the root to a leaf node. The leaf node is included in
1334
+ the calculation, i.e. the depth of the plan *SeqScan(R)* is 1.
1335
+ """
1336
+ return 1 + max((child.plan_depth() for child in self.children), default=0)
1337
+
1338
+ def fetch_base_table(self) -> Optional[TableReference]:
1339
+ """Retrieves the base table that is being scanned by the plan.
1340
+
1341
+ The base table is only specified for plans that directly lead to a scan node, as defined by `is_scan_branch()`.
1342
+ """
1343
+ if self.is_scan():
1344
+ return self.base_table
1345
+ elif self.is_join():
1346
+ return None
1347
+
1348
+ if len(self.children) == 1:
1349
+ return self.children[0].fetch_base_table()
1350
+ return None
1351
+
1352
+ def outermost_scan(self) -> Optional[QueryPlan]:
1353
+ """Retrieves the scan node that is furthest to the "left", i.e. on the outer-most position in the plan."""
1354
+ if self.is_scan():
1355
+ return self
1356
+ elif self.is_join():
1357
+ return self.outer_child.outermost_scan()
1358
+
1359
+ assert self.input_node is not None
1360
+ return self.input_node.outermost_scan()
1361
+
1362
+ def tables(self) -> set[TableReference]:
1363
+ """Provides all tables that are accessed at some point in the plan.
1364
+
1365
+ Notice that tables that are only accessed as part of user-specific metadata are not considered.
1366
+ """
1367
+ subplan_tabs: set[TableReference] = (
1368
+ self._subplan.tables() if self._subplan else set()
1369
+ )
1370
+ return (
1371
+ self._plan_params.tables()
1372
+ | util.set_union(c.tables() for c in self._children)
1373
+ | subplan_tabs
1374
+ )
1375
+
1376
+ def columns(self) -> set[ColumnReference]:
1377
+ """Provides all columns that are accessed at some point in the plan.
1378
+
1379
+ Notice that columns that are only accessed as part of user-specific metadata are not considered.
1380
+ """
1381
+ subplan_cols = self._subplan.root.columns() if self._subplan else set()
1382
+ return (
1383
+ self._plan_params.columns()
1384
+ | util.set_union(c.columns() for c in self._children)
1385
+ | subplan_cols
1386
+ )
1387
+
1388
+ def iternodes(self) -> Iterable[QueryPlan]:
1389
+ """Provides all nodes that are contained in the plan in depth-first order, prioritizing outer child nodes."""
1390
+ return util.flatten(child.iternodes() for child in self._children) + [self]
1391
+
1392
+ def lookup(
1393
+ self, tables: TableReference | Iterable[TableReference]
1394
+ ) -> Optional[QueryPlan]:
1395
+ """Traverse the plan to find a specific intermediate node.
1396
+
1397
+ If two nodes compute the same intermediate (i.e. provide the same tables), the node that is higher up in the plan is
1398
+ returned. If both appear on the same level, the outer child is preferred.
1399
+
1400
+ Parameters
1401
+ ----------
1402
+ tables : TableReference | Iterable[TableReference]
1403
+ The tables that should be contained in the intermediate. If a single table is provided (either as-is or as a
1404
+ singleton iterable), the highest node that provides exactly this table (typically its scan branch) is returned. If multiple tables are provided, the highest
1405
+ node that provides all of them *exactly* is returned.
1406
+
1407
+ Returns
1408
+ -------
1409
+ Optional[QueryPlan]
1410
+ The highest plan node that provides exactly the specified tables. If no such node exists, *None* is returned.
1411
+ """
1412
+ needle: set[TableReference] = set(util.enlist(tables))
1413
+ candidates = self.tables()
1414
+
1415
+ if needle == candidates:
1416
+ return self
1417
+ if not needle.issubset(candidates):
1418
+ return None
1419
+
1420
+ for child in self.children:
1421
+ result = child.lookup(needle)
1422
+ if result is not None:
1423
+ return result
1424
+
1425
+ return None
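A short sketch of typical lookups, assuming ``plan`` is an existing ``QueryPlan`` that touches at least one table:

    all_tables = plan.tables()
    some_table = next(iter(all_tables))
    scan_branch = plan.lookup(some_table)   # highest node providing exactly this one table
    root_again = plan.lookup(all_tables)    # the root itself, since it provides every table exactly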
1426
+
1427
+ def find_first_node(
1428
+ self,
1429
+ predicate: Callable[[QueryPlan], bool],
1430
+ *args,
1431
+ direction: JoinDirection = "outer",
1432
+ **kwargs,
1433
+ ) -> Optional[QueryPlan]:
1434
+ """Recursively searches for the first node that matches a specific predicate.
1435
+
1436
+ Parameters
1437
+ ----------
1438
+ predicate : Callable[[QueryPlan], bool]
1439
+ The predicate to check. The predicate is called on each node in the tree and should return a *True-ish* value if
1440
+ the node matches the desired search criteria.
1441
+ direction : JoinDirection, optional
1442
+ The traversal strategy to use. *Outer* (the default) indicates that the outer child should be traversed first if
1443
+ the check on the parent node fails. *Inner* indicates the opposite.
1444
+ args
1445
+ Additional positional arguments that are passed to the predicate *after* the current node.
1446
+ kwargs
1447
+ Additional keyword arguments that are passed to the predicate.
1448
+
1449
+ Returns
1450
+ -------
1451
+ Optional[QueryPlan]
1452
+ The first node that matches the predicate. If no such node exists, *None* is returned.
1453
+ """
1454
+ if predicate(self, *args, **kwargs):
1455
+ return self
1456
+ if not self.children:
1457
+ return None
1458
+
1459
+ if len(self.children) == 1:
1460
+ return self.input_node.find_first_node(
1461
+ predicate, *args, direction=direction, **kwargs
1462
+ )
1463
+
1464
+ first_candidate, second_candidate = (
1465
+ (self.outer_child, self.inner_child)
1466
+ if direction == "outer"
1467
+ else (self.inner_child, self.outer_child)
1468
+ )
1469
+ first_match = first_candidate.find_first_node(
1470
+ predicate, *args, direction=direction, **kwargs
1471
+ )
1472
+ if first_match:
1473
+ return first_match
1474
+
1475
+ second_match = second_candidate.find_first_node(
1476
+ predicate, *args, direction=direction, **kwargs
1477
+ )
1478
+ if second_match:
1479
+ return second_match
1480
+
1481
+ return (
1482
+ self._subplan.root.find_first_node(
1483
+ predicate, *args, direction=direction, **kwargs
1484
+ )
1485
+ if self._subplan
1486
+ else None
1487
+ )
1488
+
1489
+ def find_all_nodes(
1490
+ self, predicate: Callable[[QueryPlan], bool], *args, **kwargs
1491
+ ) -> Iterable[QueryPlan]:
1492
+ """Recursively searches for all nodes that match a specific predicate.
1493
+
1494
+ The order in which the matching nodes appear is an implementation detail and should not be relied upon.
1495
+
1496
+ Parameters
1497
+ ----------
1498
+ predicate : Callable[[QueryPlan], bool]
1499
+ The predicate to check. The predicate is called on each node in the tree and should return a *True-ish* value if
1500
+ the node matches the desired search criteria.
1501
+ args
1502
+ Additional positional arguments that are passed to the predicate *after* the current node.
1503
+ kwargs
1504
+ Additional keyword arguments that are passed to the predicate.
1505
+
1506
+ Returns
1507
+ -------
1508
+ Iterable[QueryPlan]
1509
+ All nodes that match the predicate. If no such nodes exist, an empty iterable is returned.
1510
+ """
1511
+ matches: list[QueryPlan] = [self] if predicate(self, *args, **kwargs) else []
1512
+ for child in self._children:
1513
+ matches.extend(child.find_all_nodes(predicate, *args, **kwargs))
1514
+ if self._subplan:
1515
+ matches.extend(
1516
+ self._subplan.root.find_all_nodes(predicate, *args, **kwargs)
1517
+ )
1518
+ return matches
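A sketch of predicate-based searches, assuming ``plan`` is an existing ``QueryPlan`` and that ``Cardinality`` values compare against plain integers (an assumption made for the last example):

    first_join = plan.find_first_node(QueryPlan.is_join)                     # outer-first traversal
    inner_join = plan.find_first_node(QueryPlan.is_join, direction="inner")  # inner-first traversal
    all_scans = list(plan.find_all_nodes(QueryPlan.is_scan))
    big_nodes = plan.find_all_nodes(
        lambda node: node.is_analyze() and node.actual_cardinality > 1_000_000
    )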
1519
+
1520
+ def cout(self, *, include_auxiliaries: bool = True) -> float:
1521
+ """Computes the *C-out* value of the operator.
1522
+
1523
+ The *C-out* value is the sum of the actual cardinalities of the current operator and all operators in its subtree.
1524
+
1525
+ If the plan does not contain a measurement of the actual cardinality, the *C-out* value is undefined (indicated as
1526
+ *NaN*).
1527
+
1528
+ Parameters
1529
+ ----------
1530
+ include_auxiliaries : bool, optional
1531
+ Whether to include auxiliary nodes in the computation (which is the default). If disabled, only the actual
1532
+ cardinality of join and scan nodes is considered.
1533
+ """
1534
+ if not self.is_analyze():
1535
+ return math.nan
1536
+ own_card = (
1537
+ self.actual_cardinality
1538
+ if include_auxiliaries or not self.is_auxiliary()
1539
+ else 0
1540
+ )
1541
+ return own_card + sum(
1542
+ c.cout(include_auxiliaries=include_auxiliaries) for c in self.children
1543
+ )
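A worked sketch of the *C-out* computation for an analyzed plan ``plan``; the plan shape and cardinalities are made up for illustration:

    # HashJoin (actual 100)
    #   -> SeqScan(R) (actual 1000)
    #   -> Hash (actual 50)
    #        -> SeqScan(S) (actual 50)
    total = plan.cout()                             # 100 + 1000 + 50 + 50 = 1200
    no_aux = plan.cout(include_auxiliaries=False)   # 100 + 1000 + 50 = 1150 (Hash node skipped)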
1544
+
1545
+ def qerror(self) -> float:
1546
+ """Computes the *Q-error* of the operator.
1547
+
1548
+ If the plan does not contain an estimate of the cardinality, the *Q-error* value is undefined (indicated as *NaN*).
1549
+
1550
+ Notes
1551
+ -----
1552
+ We use a slight deviation from the standard definition:
1553
+
1554
+ .. math ::
1555
+ qerror(e, a) = \\frac{max(e, a) + 1}{min(e, a) + 1}
1556
+
1557
+ where *e* is the estimated cardinality of the node and *a* is the actual cardinality. Notice that we add 1 to both the
1558
+ numerator as well as the denominator to prevent infinity errors for nodes that do not process any rows (e.g. due to
1559
+ pruning).
1560
+ """
1561
+ if not self.is_analyze():
1562
+ return math.nan
1563
+
1564
+ larger = max(self.estimated_cardinality, self.actual_cardinality) + 1
1565
+ smaller = min(self.estimated_cardinality, self.actual_cardinality) + 1
1566
+ return larger / smaller
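A quick numeric sketch of the smoothed Q-error and one way to aggregate it, assuming ``plan`` is an analyzed ``QueryPlan``:

    # a node with estimate 100 and actual cardinality 9_900 yields (9_900 + 1) / (100 + 1) ≈ 98.03
    worst_qerror = max(node.qerror() for node in plan)   # __iter__ yields every node of the plan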
1567
+
1568
+ def parallelize(self, workers: int) -> QueryPlan:
1569
+ plan_params = self._plan_params.clone()
1570
+ plan_params.parallel_workers = workers
1571
+ return QueryPlan(
1572
+ self.node_type,
1573
+ operator=self.operator,
1574
+ children=self.children,
1575
+ plan_params=plan_params,
1576
+ estimates=self._estimates,
1577
+ measures=self._measures,
1578
+ subplan=self.subplan,
1579
+ )
1580
+
1581
+ def with_estimates(
1582
+ self,
1583
+ *,
1584
+ cardinality: Optional[Cardinality] = None,
1585
+ cost: Optional[Cost] = None,
1586
+ keep_measures: bool = False,
1587
+ ) -> QueryPlan:
1588
+ """Replaces the current estimates of the operator with new ones.
1589
+
1590
+ Parameters
1591
+ ----------
1592
+ cardinality : Optional[Cardinality], optional
1593
+ The new estimated cardinality of the operator. If the estimate should be dropped, *NaN* can be used. If the current
1594
+ cardinality should be kept, *None* can be passed (which is the default).
1595
+ cost : Optional[Cost], optional
1596
+ The new estimated cost of the operator. If the estimate should be dropped, *NaN* can be used. If the current cost
1597
+ should be kept, *None* can be passed (which is the default).
1598
+ keep_measures : bool, optional
1599
+ Whether to keep the actual measurements of the operator. If this is set to *False*, the actual cardinality and
1600
+ execution time are dropped. Measures are dropped by default because they usually depend on the estimates (which
1601
+ are now changed).
1602
+ """
1603
+ cardinality = self.estimated_cardinality if cardinality is None else cardinality
1604
+ cost = self.estimated_cost if cost is None else cost
1605
+ updated_estimates = PlanEstimates(cardinality=cardinality, cost=cost)
1606
+ updated_measures = self._measures if keep_measures else None
1607
+ return QueryPlan(
1608
+ self.node_type,
1609
+ operator=self.operator,
1610
+ children=self.children,
1611
+ plan_params=self.params,
1612
+ estimates=updated_estimates,
1613
+ measures=updated_measures,
1614
+ subplan=self.subplan,
1615
+ )
1616
+
1617
+ def with_actual_card(
1618
+ self,
1619
+ *,
1620
+ cost_estimator: Optional[Callable[[QueryPlan, Cardinality], Cost]] = None,
1621
+ ignore_nan: bool = True,
1622
+ ) -> QueryPlan:
1623
+ """Replaces the current estimates of the operator with the actual measurements.
1624
+
1625
+ The updated plan will not contain any measurements anymore and the costs will be set to *NaN* unless an explicit cost
1626
+ estimator is provided.
1627
+
1628
+ Parameters
1629
+ ----------
1630
+ cost_estimator : Optional[Callable[[QueryPlan, Cardinality], Cost]], optional
1631
+ An optional cost function to compute new cost estimates based on the new cardinalities. If no cost estimator is provided,
1632
+ the cost is set to *NaN*. The estimator receives the old plan along with the new cardinality estimate as input
1633
+ and should return the new cost estimate.
1634
+ ignore_nan : bool, optional
1635
+ Whether to keep the estimated cardinality when the actual cardinality is *NaN*. By default, this is set to *True*, which only replaces the
1636
+ estimated cardinality if the actual cardinality is a meaningful value.
1637
+
1638
+ Returns
1639
+ -------
1640
+ QueryPlan
1641
+ A new query plan with the actual cardinalities used as the new estimated cardinalities. The costs are re-computed by the
1642
+ given cost estimator (or set to *NaN* if none is provided). The current plan is not changed.
1643
+ """
1644
+ if self.actual_cardinality:
1645
+ updated_cardinality = (
1646
+ self.estimated_cardinality
1647
+ if ignore_nan and self.actual_cardinality.isnan()
1648
+ else self.actual_cardinality
1649
+ )
1650
+ updated_cost = (
1651
+ cost_estimator(self, updated_cardinality)
1652
+ if cost_estimator
1653
+ else math.nan
1654
+ )
1655
+ updated_estimates = PlanEstimates(
1656
+ cardinality=updated_cardinality, cost=updated_cost
1657
+ )
1658
+ updated_measures = None
1659
+ else:
1660
+ updated_estimates = self._estimates
1661
+ updated_measures = None
1662
+
1663
+ updated_children = [
1664
+ child.with_actual_card(cost_estimator=cost_estimator, ignore_nan=ignore_nan)
1665
+ for child in self.children
1666
+ ]
1667
+
1668
+ if self.subplan:
1669
+ updated_subplan_root = self.subplan.root.with_actual_card(
1670
+ cost_estimator=cost_estimator, ignore_nan=ignore_nan
1671
+ )
1672
+ updated_subplan = Subplan(updated_subplan_root, self.subplan.target_name)
1673
+ else:
1674
+ updated_subplan = None
1675
+
1676
+ return QueryPlan(
1677
+ self.node_type,
1678
+ operator=self.operator,
1679
+ children=updated_children,
1680
+ plan_params=self.params,
1681
+ estimates=updated_estimates,
1682
+ measures=updated_measures,
1683
+ subplan=updated_subplan,
1684
+ )
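A sketch of swapping in the true cardinalities of an analyzed plan, e.g. to study how estimation errors influence the rest of the plan; ``analyzed_plan`` is a placeholder for any ANALYZE-mode plan, and the lambda cost model as well as the ``float`` conversion of ``Cardinality`` are illustrative assumptions:

    true_card_plan = analyzed_plan.with_actual_card()
    recosted_plan = analyzed_plan.with_actual_card(
        cost_estimator=lambda node, card: float(card)   # toy cost model: cost equals cardinality
    )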
1685
+
1686
+ def canonical(self) -> QueryPlan:
1687
+ """Creates a normalized version of the query plan.
1688
+
1689
+ This normalized version will only contain scan and join nodes, without any auxiliary nodes. Estimates and measurements
1690
+ of these nodes are kept as they are.
1691
+
1692
+ This method is mostly intended to remove system-specific elements of the QEP and provide a more stable representation.
1693
+ For example, Postgres uses a combination of Hash join node and Hash node to represent an actual hash join. Likewise,
1694
+ bitmap scans are represented as a bitmap heap scan with a number of bitmap index scans (and optional bitmap ANDs and
1695
+ ORs) as child nodes. With `canonical` all of these "implementation details" are removed and only the core of the query
1696
+ plan is kept.
1697
+
1698
+ Notice that aggregations and groupings are also auxiliary nodes and will not be available after canonicalization.
1699
+ Therefore, the cost of the canonical query plan might be less than the cost of the original plan.
1700
+ """
1701
+ if self.subplan:
1702
+ updated_subplan_root = self.subplan.root.canonical()
1703
+ updated_subplan = Subplan(updated_subplan_root, self.subplan.target_name)
1704
+ else:
1705
+ updated_subplan = None
1706
+
1707
+ if self.is_scan():
1708
+ # we remove all child nodes from scans to prevent any bitmap-scan shenanigans
1709
+ return QueryPlan(
1710
+ self.node_type,
1711
+ operator=self.operator,
1712
+ children=[],
1713
+ plan_params=self.params, # params include the base table
1714
+ estimates=self.estimates,
1715
+ measures=self.measures,
1716
+ subplan=updated_subplan,
1717
+ )
1718
+
1719
+ if not self.is_scan() and not self.is_join():
1720
+ # skip over auxiliary nodes
1721
+ return self.input_node.canonical()
1722
+
1723
+ children = [child.canonical() for child in self.children]
1724
+ return QueryPlan(
1725
+ self.node_type,
1726
+ operator=self.operator,
1727
+ children=children,
1728
+ plan_params=self.params,
1729
+ estimates=self.estimates,
1730
+ measures=self.measures,
1731
+ subplan=updated_subplan,
1732
+ )
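For example, a Postgres-style plan *HashJoin(SeqScan(R), Hash(SeqScan(S)))* canonicalizes to *HashJoin(SeqScan(R), SeqScan(S))*. A brief usage sketch, assuming ``plan`` is such a plan:

    normalized = plan.canonical()
    print(normalized.ast())   # only scan and join nodes remain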
1733
+
1734
+ def inspect(self, *, fields: Optional[Iterable[str]] = None) -> str:
1735
+ """Provides a human-readable representation of the query plan, inspired by Postgre's *EXPLAIN* output.
1736
+
1737
+ By default, the output will contain fields akin to the *EXPLAIN* output of Postgres. For example, this includes the
1738
+ estimated cardinality and the operator cost, or for *ANALYZE* plans also the actual measurements.
1739
+
1740
+ This can be customized by providing a list of fields that should be included in the output. The fields can either
1741
+ reference properties of the plan itself (e.g. ``estimated_cardinality``) or of a redirection to the metadata properties
1742
+ (e.g. ``params.index``). However, the current implementation only supports a single level of indirection, i.e. no
1743
+ ``params.custom_property.one_more_level``.
1744
+ """
1745
+ fields = [] if fields is None else list(fields)
1746
+ return _explainify(self, fields=fields)
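A sketch of both the default and a field-customized inspection, assuming ``plan`` is an existing ``QueryPlan``; the selected fields are properties of this class and its parameter container:

    print(plan.inspect())
    print(plan.inspect(fields=["node_type", "estimated_cardinality", "params.index"]))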
1747
+
1748
+ def explain(self) -> str:
1749
+ """Alias for `inspect`
1750
+
1751
+ See Also
1752
+ --------
1753
+ inspect
1754
+ """
1755
+ return self.inspect()
1756
+
1757
+ def plan_summary(self) -> dict[str, object]:
1758
+ """Provides a quick summary of important properties of the query plan, inspired by Panda's *describe* method."""
1759
+ all_nodes = list(self.iternodes())
1760
+ summary: dict[str, object] = {
1761
+ "operator": self.node_type,
1762
+ "intermediate": " ⋈ ".join(str(t) for t in self.tables()),
1763
+ "estimated_card": round(self.estimated_cardinality, 3),
1764
+ "actual_card": round(self.actual_cardinality, 3),
1765
+ "estimated_cost": round(self.estimated_cost, 3),
1766
+ "c_out": self.cout(),
1767
+ "max_qerror": round(max(node.qerror() for node in all_nodes), 3),
1768
+ "avg_qerror": round(
1769
+ sum(node.qerror() for node in all_nodes) / len(all_nodes), 3
1770
+ ),
1771
+ "phys_ops": collections.Counter(child.node_type for child in all_nodes),
1772
+ }
1773
+ return summary
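A small usage sketch, assuming ``plan`` is an existing ``QueryPlan``:

    for key, value in plan.plan_summary().items():
        print(f"{key}: {value}")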
1774
+
1775
+ def ast(self) -> str:
1776
+ """Provides the tree-structure of the plan in a human-readable format."""
1777
+ return _astify(self)
1778
+
1779
+ def clone(self, *, deep: bool = False) -> QueryPlan:
1780
+ return self.__deepcopy__({}) if deep else self.__copy__()
1781
+
1782
+ def __json__(self) -> jsondict:
1783
+ return {
1784
+ "node_type": self.node_type,
1785
+ "operator": self.operator,
1786
+ "children": self.children,
1787
+ "plan_params": self._plan_params,
1788
+ "estimates": self._estimates,
1789
+ "measures": self._measures,
1790
+ "subplan": self._subplan,
1791
+ }
1792
+
1793
+ def __copy__(self) -> QueryPlan:
1794
+ return QueryPlan(
1795
+ self._node_type,
1796
+ operator=self._operator,
1797
+ children=self._children,
1798
+ plan_params=self._plan_params.clone(deep=False),
1799
+ estimates=self._estimates.clone(deep=False),
1800
+ measures=self._measures.clone(deep=False) if self._measures else None,
1801
+ subplan=self._subplan.clone(deep=False) if self._subplan else None,
1802
+ )
1803
+
1804
+ def __deepcopy__(self, memo: dict[int, object] = {}) -> QueryPlan:
1805
+ return QueryPlan(
1806
+ self._node_type,
1807
+ operator=self._operator,
1808
+ children=[child.__deepcopy__(memo) for child in self._children],
1809
+ plan_params=self._plan_params.clone(deep=True),
1810
+ estimates=self._estimates.clone(deep=True),
1811
+ measures=self._measures.clone(deep=True) if self._measures else None,
1812
+ subplan=self._subplan.clone(deep=True) if self._subplan else None,
1813
+ )
1814
+
1815
+ def __len__(self) -> int:
1816
+ return self.plan_depth()
1817
+
1818
+ def __contains__(
1819
+ self, key: str | TableReference | Iterable[TableReference]
1820
+ ) -> bool:
1821
+ if isinstance(key, TableReference):
1822
+ return key in self.tables()
1823
+ elif isinstance(key, Iterable):
1824
+ return set(key).issubset(self.tables())
1825
+
1826
+ return (
1827
+ key in self._plan_params
1828
+ or key in self._estimates
1829
+ or (self._measures and key in self._measures)
1830
+ )
1831
+
1832
+ def __getitem__(self, key: str) -> Any:
1833
+ if key in self._plan_params:
1834
+ return self._plan_params[key]
1835
+ if key in self._estimates:
1836
+ return self._estimates[key]
1837
+ if self._measures and key in self._measures:
1838
+ return self._measures[key]
1839
+ raise KeyError(f"'{key}' not found")
1840
+
1841
+ def __iter__(self) -> Iterator[QueryPlan]:
1842
+ yield self
1843
+ for child in self.children:
1844
+ yield from child
1845
+ if self.subplan:
1846
+ yield from self.subplan.root
1847
+
1848
+ def __eq__(self, other: object) -> bool:
1849
+ return (
1850
+ isinstance(other, type(self))
1851
+ and self._node_type == other._node_type
1852
+ and self.base_table == other.base_table
1853
+ and self._children == other._children
1854
+ )
1855
+
1856
+ def __hash__(self) -> int:
1857
+ return hash((self.node_type, self.base_table, self._children))
1858
+
1859
+ def __repr__(self) -> str:
1860
+ return str(self)
1861
+
1862
+ def __str__(self) -> str:
1863
+ normalized_node_type = self.node_type.replace(" ", "")
1864
+ if self.base_table:
1865
+ return f"{normalized_node_type}({self.base_table.identifier()})"
1866
+ child_texts = ", ".join(str(child) for child in self.children)
1867
+ return f"{normalized_node_type}({child_texts})"
1868
+
1869
+
1870
+ _starting_indentation = 0
1871
+
1872
+
1873
+ def _default_explain(plan: QueryPlan, *, padding: str) -> str:
1874
+ """Generates the Postgres-style *EXPLAIN* output for the current node."""
1875
+ components: list[str] = []
1876
+ metadata_indent = " " if padding else " "
1877
+
1878
+ estimated_card = round(plan.estimated_cardinality, 3)
1879
+ estimated_cost = round(plan.estimated_cost, 3)
1880
+ components.append(
1881
+ f"{padding}{metadata_indent}Estimated Cardinality={estimated_card}, Estimated Cost={estimated_cost}"
1882
+ )
1883
+
1884
+ if plan.is_analyze():
1885
+ actual_card = round(plan.actual_cardinality, 3)
1886
+ exec_time = round(plan.execution_time, 3)
1887
+ components.append(
1888
+ f"{padding}{metadata_indent}Actual Cardinality={actual_card}, Actual Time={exec_time}s"
1889
+ )
1890
+
1891
+ measures = plan.measures
1892
+ if measures.cache_hits is not None or measures.cache_misses is not None:
1893
+ cache_hits = (
1894
+ measures.cache_hits if measures.cache_hits is not None else math.nan
1895
+ )
1896
+ cache_misses = (
1897
+ measures.cache_misses if measures.cache_misses is not None else math.nan
1898
+ )
1899
+ components.append(
1900
+ f"{padding}{metadata_indent}Cache Hits={cache_hits}, Cache Misses={cache_misses}"
1901
+ )
1902
+
1903
+ params = plan.params
1904
+ if params.parallel_workers:
1905
+ components.append(
1906
+ f"{padding}{metadata_indent}Parallel Workers={params.parallel_workers}"
1907
+ )
1908
+ if params.lookup_key:
1909
+ components.append(f"{padding}{metadata_indent}Lookup Key={params.lookup_key}")
1910
+
1911
+ path_props: list[str] = []
1912
+ if params.index:
1913
+ path_props.append(f"Index={params.index}")
1914
+ if params.sort_keys:
1915
+ sort_keys = ", ".join(str(key) for key in params.sort_keys)
1916
+ path_props.append(f"Sort Keys={sort_keys}")
1917
+ if path_props:
1918
+ components.append(f"{padding}{metadata_indent}{', '.join(path_props)}")
1919
+
1920
+ return "\n".join(components)
1921
+
1922
+
1923
+ def _custom_explain(plan: QueryPlan, *, fields: list[str], padding: str) -> str:
1924
+ """Generates the user-specific *EXPLAIN* output for the current node."""
1925
+ attr_values: dict[str, str] = {}
1926
+ for attr in fields:
1927
+ if "." in attr:
1928
+ container_name, attr_name = attr.split(".")
1929
+ container = getattr(plan, container_name)
1930
+ value = getattr(container, attr_name)
1931
+ else:
1932
+ value = getattr(plan, attr)
1933
+
1934
+ attr_values[attr] = (
1935
+ str(round(value, 3)) if isinstance(value, Number) else str(value)
1936
+ )
1937
+
1938
+ attr_str = " ".join(f"{attr}={val}" for attr, val in attr_values.items())
1939
+ explain_data = f"{padding} [{attr_str}]"
1940
+ return explain_data
1941
+
1942
+
1943
+ def _explainify(
1944
+ plan: QueryPlan, *, fields: list[str], level: int = _starting_indentation
1945
+ ) -> str:
1946
+ """Handler method to generate the *EXPLAIN* output for the current node and its children."""
1947
+ padding = "" if not level else " " + " " * (level - 1)
1948
+ prefix = f"{padding}-> " if padding else ""
1949
+
1950
+ header = (
1951
+ f"{plan.node_type}({plan.base_table})" if plan.is_scan() else plan.node_type
1952
+ )
1953
+ explain_data = (
1954
+ _custom_explain(plan, fields=fields, padding=padding)
1955
+ if fields
1956
+ else _default_explain(plan, padding=padding)
1957
+ )
1958
+ child_explains = "\n".join(
1959
+ f"{_explainify(child, fields=fields, level=level + 1)}"
1960
+ for child in plan.children
1961
+ )
1962
+ subplan_explains = (
1963
+ _explainify(plan.subplan.root, fields=fields, level=level + 1)
1964
+ if plan.subplan
1965
+ else ""
1966
+ )
1967
+ if subplan_explains:
1968
+ child_explains = f"{child_explains}\n{subplan_explains}"
1969
+
1970
+ if not child_explains:
1971
+ return f"{prefix}{header}\n{explain_data}"
1972
+ return f"{prefix}{header}\n{explain_data}\n{child_explains}"
1973
+
1974
+
1975
+ def _astify(plan: QueryPlan, *, indentation: int = _starting_indentation) -> str:
1976
+ """Handler method to generate a tree-structure of the query plan."""
1977
+ padding = " " * indentation
1978
+ prefix = f"{padding}-> " if padding else ""
1979
+ if plan.is_scan():
1980
+ item_str = f"{prefix}{plan.node_type}({plan.base_table})"
1981
+ else:
1982
+ item_str = f"{prefix}{plan.node_type}"
1983
+ child_str = "\n".join(
1984
+ _astify(child, indentation=indentation + 2) for child in plan.children
1985
+ )
1986
+ return f"{item_str}\n{child_str}" if child_str else item_str