PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. postbound/__init__.py +211 -0
  2. postbound/_base.py +6 -0
  3. postbound/_bench.py +1012 -0
  4. postbound/_core.py +1153 -0
  5. postbound/_hints.py +1373 -0
  6. postbound/_jointree.py +1079 -0
  7. postbound/_pipelines.py +1121 -0
  8. postbound/_qep.py +1986 -0
  9. postbound/_stages.py +876 -0
  10. postbound/_validation.py +734 -0
  11. postbound/db/__init__.py +72 -0
  12. postbound/db/_db.py +2348 -0
  13. postbound/db/_duckdb.py +785 -0
  14. postbound/db/mysql.py +1195 -0
  15. postbound/db/postgres.py +4216 -0
  16. postbound/experiments/__init__.py +12 -0
  17. postbound/experiments/analysis.py +674 -0
  18. postbound/experiments/benchmarking.py +54 -0
  19. postbound/experiments/ceb.py +877 -0
  20. postbound/experiments/interactive.py +105 -0
  21. postbound/experiments/querygen.py +334 -0
  22. postbound/experiments/workloads.py +980 -0
  23. postbound/optimizer/__init__.py +92 -0
  24. postbound/optimizer/__init__.pyi +73 -0
  25. postbound/optimizer/_cardinalities.py +369 -0
  26. postbound/optimizer/_joingraph.py +1150 -0
  27. postbound/optimizer/dynprog.py +1825 -0
  28. postbound/optimizer/enumeration.py +432 -0
  29. postbound/optimizer/native.py +539 -0
  30. postbound/optimizer/noopt.py +54 -0
  31. postbound/optimizer/presets.py +147 -0
  32. postbound/optimizer/randomized.py +650 -0
  33. postbound/optimizer/tonic.py +1479 -0
  34. postbound/optimizer/ues.py +1607 -0
  35. postbound/qal/__init__.py +343 -0
  36. postbound/qal/_qal.py +9678 -0
  37. postbound/qal/formatter.py +1089 -0
  38. postbound/qal/parser.py +2344 -0
  39. postbound/qal/relalg.py +4257 -0
  40. postbound/qal/transform.py +2184 -0
  41. postbound/shortcuts.py +70 -0
  42. postbound/util/__init__.py +46 -0
  43. postbound/util/_errors.py +33 -0
  44. postbound/util/collections.py +490 -0
  45. postbound/util/dataframe.py +71 -0
  46. postbound/util/dicts.py +330 -0
  47. postbound/util/jsonize.py +68 -0
  48. postbound/util/logging.py +106 -0
  49. postbound/util/misc.py +168 -0
  50. postbound/util/networkx.py +401 -0
  51. postbound/util/numbers.py +438 -0
  52. postbound/util/proc.py +107 -0
  53. postbound/util/stats.py +37 -0
  54. postbound/util/system.py +48 -0
  55. postbound/util/typing.py +35 -0
  56. postbound/vis/__init__.py +5 -0
  57. postbound/vis/fdl.py +69 -0
  58. postbound/vis/graphs.py +48 -0
  59. postbound/vis/optimizer.py +538 -0
  60. postbound/vis/plots.py +84 -0
  61. postbound/vis/tonic.py +70 -0
  62. postbound/vis/trees.py +105 -0
  63. postbound-0.19.0.dist-info/METADATA +355 -0
  64. postbound-0.19.0.dist-info/RECORD +67 -0
  65. postbound-0.19.0.dist-info/WHEEL +5 -0
  66. postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
  67. postbound-0.19.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,12 @@
+ """Contains utilities to conveniently execute individual queries or entire workloads and to evaluate their results.
+
+ This package provides direct access to some frequently-used functionality, mostly related to workload modelling and execution.
+ Other modules need to be imported explicitly.
+
+ Specifically, this package provides the following modules:
+
+ - `analysis` provides a loose collection of utilities and formulas related to query optimization
+ - `querygen` provides a simple random query generator
+ - `ceb` provides an implementation of the Cardinality Estimation Benchmark workload generator
+ - `interactive` contains a simple interactive join order optimizer
+ """
@@ -0,0 +1,674 @@
+ """Provides a collection of utilities related to query optimization."""
+
+ from __future__ import annotations
+
+ import collections
+ import itertools
+ import math
+ from collections.abc import Collection
+ from dataclasses import dataclass
+ from typing import Any, Literal, Optional
+
+ import Levenshtein
+
+ from .. import util
+ from .._core import ColumnReference, PhysicalOperator, TableReference
+ from .._jointree import JoinTree
+ from .._qep import QueryPlan
+ from ..db._db import Database, DatabasePool
+ from ..qal import parser, transform
+ from ..qal._qal import (
+     AbstractPredicate,
+     BinaryPredicate,
+     ColumnExpression,
+     CompoundPredicate,
+     SqlQuery,
+     StaticValueExpression,
+     Where,
+ )
+ from ..util import StateError
+
+
+ def possible_plans_bound(
+     query: SqlQuery,
+     *,
+     join_operators: set[str] = {"nested-loop join", "hash join", "sort-merge join"},
+     scan_operators: set[str] = {"sequential scan", "index scan"},
+ ) -> int:
+     """Computes a quick upper bound on the number of possible query execution plans for a given query.
+
+     This upper bound is a very coarse one, based on three assumptions:
+
+     1. any join sequence (even involving cross-products) of any form (i.e. right-deep, bushy, ...) is allowed
+     2. the choice of scan operators and join operators can be varied freely
+     3. each table can be scanned using arbitrary operators
+
+     The number of real-world query execution plans will typically be much smaller, because cross-products are only
+     used if really necessary and the selected join operator influences the scan operators and vice-versa.
+
+     Parameters
+     ----------
+     query : SqlQuery
+         The query for which the bound should be computed
+     join_operators : set[str], optional
+         The allowed join operators, by default {"nested-loop join", "hash join", "sort-merge join"}
+     scan_operators : set[str], optional
+         The allowed scan operators, by default {"sequential scan", "index scan"}
+
+     Returns
+     -------
+     int
+         An upper bound on the number of possible query execution plans
+     """
+     n_tables = len(query.tables())
+
+     join_orders = util.stats.catalan_number(n_tables)
+     joins = (n_tables - 1) * len(join_operators)
+     scans = n_tables * len(scan_operators)
+
+     return join_orders * joins * scans
+
+
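To make the arithmetic concrete, the following self-contained sketch reproduces the bound, assuming `util.stats.catalan_number(n)` follows the standard closed form C(2n, n) / (n + 1); the helper names are illustrative, not part of the package:

    import math

    def catalan_number(n: int) -> int:
        # number of binary tree shapes: C(2n, n) // (n + 1)
        return math.comb(2 * n, n) // (n + 1)

    def plans_bound_sketch(n_tables: int, *, n_join_ops: int = 3, n_scan_ops: int = 2) -> int:
        join_orders = catalan_number(n_tables)  # assumption 1: any tree shape is allowed
        joins = (n_tables - 1) * n_join_ops     # assumption 2: free join-operator choice
        scans = n_tables * n_scan_ops           # assumption 3: arbitrary scan operators
        return join_orders * joins * scans

    # a 4-table query: 14 tree shapes * 9 join-operator slots * 8 scan slots
    assert plans_bound_sketch(4) == 14 * 9 * 8  # = 1008
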
+ def actual_plan_cost(
+     query: SqlQuery, analyze_plan: QueryPlan, *, database: Optional[Database] = None
+ ) -> float:
+     """Utility to compute the true cost of a query plan based on the actual cardinalities.
+
+     Parameters
+     ----------
+     query : SqlQuery
+         The query to analyze
+     analyze_plan : QueryPlan
+         The executed query plan, which also contains the true cardinalities
+     database : Optional[Database], optional
+         The database providing the cost model. If omitted, the database is inferred from the database pool.
+
+     Returns
+     -------
+     float
+         The cost of the plan according to the database's cost model, computed over the actual cardinalities
+     """
+     if not analyze_plan.is_analyze():
+         raise ValueError("The provided plan is not an ANALYZE plan")
+     database = (
+         database
+         if database is not None
+         else DatabasePool().get_instance().current_database()
+     )
+     hinted_query = database.hinting().generate_hints(
+         query, analyze_plan.with_actual_card()
+     )
+     return database.optimizer().cost_estimate(hinted_query)
+
+
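A usage sketch (the ANALYZE-plan accessor shown here is hypothetical; any means of obtaining an ANALYZE plan for the query works):

    from postbound.db import postgres

    pg_db = postgres.connect()
    analyze_plan = pg_db.optimizer().analyze_plan(query)  # hypothetical accessor
    true_cost = actual_plan_cost(query, analyze_plan, database=pg_db)
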
+ def text_diff(left: str, right: str, *, sep: str = " | ") -> str:
+     """Merges two text snippets to allow for a comparison on a per-line basis.
+
+     The two snippets are split into their individual lines and then merged back together. If one snippet contains
+     more lines than the other, the shorter one is padded with empty lines.
+
+     Parameters
+     ----------
+     left : str
+         The text snippet to display on the left-hand side.
+     right : str
+         The text snippet to display on the right-hand side.
+     sep : str, optional
+         The separator to use between the left and right text snippets, by default `` | ``.
+
+     Returns
+     -------
+     str
+         The combined text snippet
+     """
+     left_lines = left.splitlines()
+     right_lines = right.splitlines()
+
+     max_left_len = max((len(line) for line in left_lines), default=0)
+
+     merged_lines = [
+         f"{left_line.ljust(max_left_len)}{sep}{right_line}"
+         for left_line, right_line in itertools.zip_longest(
+             left_lines, right_lines, fillvalue=""
+         )
+     ]
+     return "\n".join(merged_lines)
+
+
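For example, merging two plan printouts line by line (output shown approximately):

    lhs = "Hash Join\n  Seq Scan on a\n  Seq Scan on b"
    rhs = "Nested Loop\n  Index Scan on a\n  Seq Scan on b"
    print(text_diff(lhs, rhs))
    # Hash Join       | Nested Loop
    #   Seq Scan on a |   Index Scan on a
    #   Seq Scan on b |   Seq Scan on b
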
+ def star_query_cardinality(
+     query: SqlQuery,
+     fact_table_pk_column: ColumnReference,
+     *,
+     database: Optional[Database] = None,
+     verbose: bool = False,
+ ) -> int:
+     """Utility function to manually compute the cardinality of a star query.
+
+     This function is intended for situations where the database is unable to compute the cardinality because the intermediates
+     involved in the query become too large or the query plans are simply too bad. It operates by manually computing the number
+     of output tuples for each of the entries in the fact table by sequentially joining the fact table with each dimension
+     table.
+
+     Parameters
+     ----------
+     query : SqlQuery
+         The query to compute the cardinality for. This is assumed to be a **SELECT \\*** query and the actual **SELECT** clause
+         is ignored completely.
+     fact_table_pk_column : ColumnReference
+         The fact table's primary key column. All dimension tables must perform an equi-join on this column.
+     database : Optional[Database], optional
+         The actual database. If this is omitted, the current database from the database pool is used.
+     verbose : bool, optional
+         Whether progress information should be printed during the computation. If this is enabled, the function will report
+         every 1000th value processed.
+
+     Returns
+     -------
+     int
+         The cardinality (i.e. number of output tuples) of the query
+
+     Warnings
+     --------
+     Currently, this function works well for simple SPJ-based queries; more complicated features might lead to wrong results.
+     Similarly, only pure star queries are supported, i.e. there has to be one central fact table and each dimension table
+     performs exactly one equi-join with the fact table's primary key. There may not be additional joins on the dimension
+     tables. If such additional dimension joins exist, they have to be pre-processed (e.g. by introducing materialized views)
+     and the query has to be rewritten to operate on the views instead.
+     It is the user's responsibility to ensure that the query is well-formed in these regards.
+     """
+     logger = util.make_logger(verbose, prefix=util.timestamp)
+     database = (
+         DatabasePool().get_instance().current_database()
+         if database is None
+         else database
+     )
+     fact_table = (
+         fact_table_pk_column.table
+         if fact_table_pk_column.is_bound()
+         else database.schema().lookup_column(fact_table_pk_column, query.tables())
+     )
+     if fact_table is None:
+         raise ValueError(
+             f"Cannot infer fact table from column '{fact_table_pk_column}'"
+         )
+     fact_table_pk_column = fact_table_pk_column.bind_to(fact_table)
+
+     id_vals_query = parser.parse_query(f"""
+         SELECT {fact_table_pk_column}, COUNT(*) AS card
+         FROM {fact_table}
+         GROUP BY {fact_table_pk_column}""")
+     if query.predicates().filters_for(fact_table):
+         filter_clause = Where(query.predicates().filters_for(fact_table))
+         id_vals_query = transform.add_clause(id_vals_query, filter_clause)
+     id_vals: list[tuple[Any, int]] = database.execute_query(id_vals_query)
+
+     base_query_fragments: dict[AbstractPredicate, SqlQuery] = {}
+     for join_pred in query.predicates().joins_for(fact_table):
+         join_partner = join_pred.join_partners_of(fact_table)
+         if len(join_partner) != 1:
+             raise ValueError("Currently only singular joins are supported")
+
+         partner_table: TableReference = util.simplify(join_partner).table
+         query_fragment = transform.extract_query_fragment(
+             query, [fact_table, partner_table]
+         )
+         base_query_fragments[join_pred] = transform.as_count_star_query(query_fragment)
+
+     total_cardinality = 0
+     total_ids = len(id_vals)
+     for value_idx, (id_value, current_card) in enumerate(id_vals):
+         if value_idx % 1000 == 0:
+             logger("--", value_idx, "out of", total_ids, "values processed")
+
+         id_filter = BinaryPredicate.equal(
+             ColumnExpression(fact_table_pk_column),
+             StaticValueExpression(id_value),
+         )
+
+         for join_pred, base_query in base_query_fragments.items():
+             if current_card == 0:
+                 break
+
+             expanded_predicate = CompoundPredicate.create_and(
+                 [base_query.where_clause.predicate, id_filter]
+             )
+             expanded_where_clause = Where(expanded_predicate)
+
+             dimension_query = transform.replace_clause(
+                 base_query, expanded_where_clause
+             )
+             dimension_card = database.execute_query(dimension_query)
+
+             current_card *= dimension_card
+
+         total_cardinality += current_card
+
+     return total_cardinality
+
+
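The per-id computation boils down to a product of match counts. A toy version of the core loop, with hard-coded counts standing in for the issued COUNT(*) queries (all values illustrative):

    # fact-table id 7 occurs 3 times and matches 2 rows of dimension A and 5 rows of
    # dimension B -> it contributes 3 * 2 * 5 = 30 tuples. Id 8 matches nothing in A.
    id_vals = [(7, 3), (8, 1)]
    dim_matches = {7: [2, 5], 8: [0, 4]}
    total_cardinality = 0
    for id_value, current_card in id_vals:
        for dimension_card in dim_matches[id_value]:
            if current_card == 0:
                break
            current_card *= dimension_card
        total_cardinality += current_card
    assert total_cardinality == 30
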
+ def jointree_similarity_topdown(
+     a: JoinTree, b: JoinTree, *, symmetric: bool = False, gamma: float = 1.1
+ ) -> float:
+     """Computes the similarity of two join trees using a top-down approach.
+
+     Parameters
+     ----------
+     a : JoinTree
+         The first join tree
+     b : JoinTree
+         The second join tree
+     symmetric : bool, optional
+         Whether the calculation should be symmetric. If true, the occurrence of joins in different branches is not
+         penalized. See Notes for details.
+     gamma : float, optional
+         The reinforcement factor to prioritize similarity of earlier (i.e. deeper) joins. The higher the value, the
+         stronger the amplification, by default 1.1
+
+     Returns
+     -------
+     float
+         An artificial similarity score in [0, 1]. Higher values indicate larger similarity.
+
+     Notes
+     -----
+     TODO: add discussion of the algorithm
+     """
+     tables_a, tables_b = a.tables(), b.tables()
+     total_n_tables = len(tables_a | tables_b)
+     normalization_factor = 1 / total_n_tables
+
+     # similarity between two leaf nodes
+     if len(tables_a) == 1 and len(tables_b) == 1:
+         return 1 if tables_a == tables_b else 0
+
+     # similarity between a leaf node and an intermediate node
+     if len(tables_a) == 1 or len(tables_b) == 1:
+         leaf_tree = a if len(tables_a) == 1 else b
+         intermediate_tree = b if leaf_tree == a else a
+
+         inner_score = util.jaccard(
+             leaf_tree.tables(), intermediate_tree.inner_child.tables()
+         )
+         outer_score = util.jaccard(
+             leaf_tree.tables(), intermediate_tree.outer_child.tables()
+         )
+
+         return normalization_factor * max(inner_score, outer_score)
+
+     # similarity between two intermediate nodes
+     a_inner, a_outer = a.inner_child, a.outer_child
+     b_inner, b_outer = b.inner_child, b.outer_child
+
+     symmetric_score = util.jaccard(a_inner.tables(), b_inner.tables()) + util.jaccard(
+         a_outer.tables(), b_outer.tables()
+     )
+     crossover_score = (
+         util.jaccard(a_inner.tables(), b_outer.tables())
+         + util.jaccard(a_outer.tables(), b_inner.tables())
+         if symmetric
+         else 0
+     )
+     node_score = normalization_factor * max(symmetric_score, crossover_score)
+
+     if symmetric and crossover_score > symmetric_score:
+         child_score = jointree_similarity_topdown(
+             a_inner, b_outer, symmetric=symmetric, gamma=gamma
+         ) + jointree_similarity_topdown(
+             a_outer, b_inner, symmetric=symmetric, gamma=gamma
+         )
+     else:
+         child_score = jointree_similarity_topdown(
+             a_inner, b_inner, symmetric=symmetric, gamma=gamma
+         ) + jointree_similarity_topdown(
+             a_outer, b_outer, symmetric=symmetric, gamma=gamma
+         )
+
+     return node_score + gamma * child_score
+
+
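The recursion implemented above can be summarized as follows, with J denoting the Jaccard index over table sets and T = tables(a) | tables(b) (a reading aid, not additional package code):

    # leaf vs. leaf:          sim = 1 if same table else 0
    # leaf vs. intermediate:  sim = (1/|T|) * max(J(leaf, inner), J(leaf, outer))
    # intermediate vs. intermediate:
    #   aligned   = J(a_inner, b_inner) + J(a_outer, b_outer)
    #   crossover = J(a_inner, b_outer) + J(a_outer, b_inner)   (only if symmetric)
    #   sim = (1/|T|) * max(aligned, crossover)
    #         + gamma * (sim of the child pairing that achieved the max)
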
+ def jointree_similarity_bottomup(a: JoinTree, b: JoinTree) -> float:
+     """Computes the similarity of two join trees based on a bottom-up approach.
+
+     Parameters
+     ----------
+     a : JoinTree
+         The first join tree to compare
+     b : JoinTree
+         The second join tree to compare
+
+     Returns
+     -------
+     float
+         An artificial similarity score in [0, 1]. Higher values indicate larger similarity.
+
+     Notes
+     -----
+     TODO: add discussion of the algorithm
+     """
+     a_subtrees = {join.tables() for join in a.iterjoins()}
+     b_subtrees = {join.tables() for join in b.iterjoins()}
+     return util.jaccard(a_subtrees, b_subtrees)
+
+
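In effect, this is the Jaccard index over the sets of intermediate results produced by the two trees. A toy example with table sets standing in for join nodes (illustrative only):

    # ((A JOIN B) JOIN C) vs. ((A JOIN C) JOIN B): the trees share only the final
    # intermediate {A, B, C}, so the similarity is 1/3.
    a_subtrees = {frozenset({"A", "B"}), frozenset({"A", "B", "C"})}
    b_subtrees = {frozenset({"A", "C"}), frozenset({"A", "B", "C"})}
    similarity = len(a_subtrees & b_subtrees) / len(a_subtrees | b_subtrees)
    assert abs(similarity - 1 / 3) < 1e-9
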
+ def linearized_levenshtein_distance(a: JoinTree, b: JoinTree) -> int:
+     """Computes the Levenshtein distance of the table sequences of two join trees.
+
+     Parameters
+     ----------
+     a : JoinTree
+         The first join tree to compare
+     b : JoinTree
+         The second join tree to compare
+
+     Returns
+     -------
+     int
+         The distance score. Higher values indicate larger distance.
+
+     References
+     ----------
+
+     .. Levenshtein distance: https://en.wikipedia.org/wiki/Levenshtein_distance
+     """
+     return Levenshtein.distance(list(a.itertables()), list(b.itertables()))
+
+
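The distance counts the minimum number of table insertions, deletions, and substitutions needed to turn one linearization into the other. For instance, assuming the rapidfuzz-backed `Levenshtein` package, which accepts sequences of hashable items (table sequences here are illustrative):

    import Levenshtein

    # [A, B, C] vs. [A, C, B]: substitute B -> C and C -> B, so the distance is 2
    assert Levenshtein.distance(["A", "B", "C"], ["A", "C", "B"]) == 2
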
+ _DepthState = collections.namedtuple("_DepthState", ["current_level", "depths"])
+ """Keeps track of the currently calculated depths of the different base tables."""
+
+
+ def _traverse_join_tree_depth(
+     current_node: JoinTree, current_depth: _DepthState
+ ) -> _DepthState:
+     """Calculates a new depth state for the current join tree node based on the current depth.
+
+     This is the handler method for `join_depth`.
+
+     Depending on the specific node, different calculations are applied:
+
+     - for base tables, a new entry of depth one is inserted into the depth state
+     - for intermediate nodes, the children are visited to integrate their depth states. Afterwards, their depth is
+       increased to incorporate the join
+
+     Parameters
+     ----------
+     current_node : JoinTree
+         The node whose depth information should be integrated
+     current_depth : _DepthState
+         The current depth state
+
+     Returns
+     -------
+     _DepthState
+         The updated depth state
+
+     Raises
+     ------
+     TypeError
+         If the node is neither a base table node, nor an intermediate join node. This indicates that the class
+         hierarchy of join tree nodes was expanded, and this method was not updated properly.
+     """
+     if current_node.is_scan():
+         return _DepthState(1, current_depth.depths | {current_node.base_table: 1})
+
+     if not current_node.is_join():
+         raise TypeError("Unknown current node type: " + str(current_node))
+
+     inner_child, outer_child = current_node.inner_child, current_node.outer_child
+     if current_node.is_base_join():
+         return _DepthState(
+             1,
+             current_depth.depths
+             | {inner_child.base_table: 1, outer_child.base_table: 1},
+         )
+     elif inner_child.is_scan():
+         outer_depth = _traverse_join_tree_depth(outer_child, current_depth)
+         updated_depth = outer_depth.current_level + 1
+         return _DepthState(
+             updated_depth, outer_depth.depths | {inner_child.base_table: updated_depth}
+         )
+     elif outer_child.is_scan():
+         inner_depth = _traverse_join_tree_depth(inner_child, current_depth)
+         updated_depth = inner_depth.current_level + 1
+         return _DepthState(
+             updated_depth, inner_depth.depths | {outer_child.base_table: updated_depth}
+         )
+     else:
+         inner_depth = _traverse_join_tree_depth(inner_child, current_depth)
+         outer_depth = _traverse_join_tree_depth(outer_child, current_depth)
+         updated_depth = max(inner_depth.current_level, outer_depth.current_level) + 1
+         return _DepthState(updated_depth, inner_depth.depths | outer_depth.depths)
+
+
+ def join_depth(join_tree: JoinTree) -> dict[TableReference, int]:
+     """Calculates for each base table in a join tree the join index when it was integrated into an intermediate result.
+
+     For joins of two base tables, the depth value is 1. If a table is joined with the intermediate result of the base
+     table join, its depth is 2. Generally speaking, the depth of each table is 1 plus the maximum depth of any table
+     in the intermediate result that the new table is joined with.
+
+     Parameters
+     ----------
+     join_tree : JoinTree
+         The join tree for which the depths should be calculated.
+
+     Returns
+     -------
+     dict[TableReference, int]
+         A mapping from tables to their depth values.
+
+     Examples
+     --------
+     TODO add examples
+     """
+     if join_tree.is_empty():
+         return {}
+     return _traverse_join_tree_depth(join_tree, _DepthState(0, {})).depths
+
+
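The depth semantics in a concrete case (a reading aid; the tree is hypothetical):

    # For the left-deep join tree ((A JOIN B) JOIN C) JOIN D, join_depth yields
    # {A: 1, B: 1, C: 2, D: 3}: A and B form the base join, C is merged at level 2,
    # and D joins the three-table intermediate at level 3.
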
+ @dataclass
+ class PlanChangeEntry:
+     """Models a single diff between two join trees.
+
+     The compared join trees are referred to as the left tree and the right tree, respectively.
+
+     Attributes
+     ----------
+     change_type : Literal["tree-structure", "join-direction", "physical-op", "card-est", "cost-est", "actual-card"]
+         Describes the precise difference between the trees. *tree-structure* indicates that the two trees are fundamentally
+         different. This occurs when the join orders are not the same. *join-direction* means that albeit the join orders are
+         the same, the roles in a specific join are reversed: the inner relation of one tree acts as the outer relation in the
+         other one and vice-versa. *physical-op* means that two structurally identical nodes (i.e. same join or base table)
+         differ in the assigned physical operator. *card-est* indicates that two structurally identical nodes (i.e. same join
+         or base table) differ in the estimated cardinality, while *cost-est* does the same, just for the estimated cost.
+         Finally, *actual-card* indicates that two structurally identical nodes differ in their actual (measured) cardinality.
+     left_state : frozenset[TableReference] | PhysicalOperator | float | QueryPlan
+         Depending on the `change_type` this attribute describes the left tree. For example, for different tree structures,
+         these are the tables in the left subtree, for different physical operators, this is the operator assigned to the node
+         in the left tree and so on. For different join directions, this is the entire join node.
+     right_state : frozenset[TableReference] | PhysicalOperator | float | QueryPlan
+         Equivalent attribute to `left_state`, just for the right tree.
+     context : Optional[frozenset[TableReference]], optional
+         For different physical operators, cardinality estimates, or cost estimates, this describes the intermediate that
+         differs. This attribute is unset by default.
+     """
+
+     change_type: Literal[
+         "tree-structure",
+         "join-direction",
+         "physical-op",
+         "card-est",
+         "cost-est",
+         "actual-card",
+     ]
+     left_state: frozenset[TableReference] | PhysicalOperator | float | QueryPlan
+     right_state: frozenset[TableReference] | PhysicalOperator | float | QueryPlan
+     context: Optional[frozenset[TableReference]] = None
+
+     def inspect(self) -> str:
+         """Provides a human-readable string of the diff.
+
+         Returns
+         -------
+         str
+             The diff
+         """
+         match self.change_type:
+             case "tree-structure":
+                 left_str = [tab.identifier() for tab in self.left_state]
+                 right_str = [tab.identifier() for tab in self.right_state]
+                 return f"Different subtrees: left={left_str} right={right_str}"
+             case "join-direction":
+                 left_str = [tab.identifier() for tab in self.left_state.tables()]
+                 right_str = [tab.identifier() for tab in self.right_state.tables()]
+                 return f"Swapped join direction: left={left_str} right={right_str}"
+             case "physical-op":
+                 return (
+                     f"Different physical operators on node {self.context}: "
+                     f"left={self.left_state} right={self.right_state}"
+                 )
+             case "card-est":
+                 return (
+                     f"Different cardinality estimates on node {self.context}: "
+                     f"left={self.left_state} right={self.right_state}"
+                 )
+             case "cost-est":
+                 return (
+                     f"Different cost estimates on node {self.context}: "
+                     f"left={self.left_state} right={self.right_state}"
+                 )
+             case "actual-card":
+                 return (
+                     f"Different actual cardinality on node {self.context}: "
+                     f"left={self.left_state} right={self.right_state}"
+                 )
+             case _:
+                 raise StateError(f"Unknown change type '{self.change_type}'")
+
+
+ @dataclass
+ class PlanChangeset:
+     """Captures an arbitrary number of join tree diffs.
+
+     Attributes
+     ----------
+     changes : Collection[PlanChangeEntry]
+         The diffs
+     """
+
+     changes: Collection[PlanChangeEntry]
+
+     def inspect(self) -> str:
+         """Provides a human-readable string of the entire diff.
+
+         The diff will typically contain newlines to separate individual entries.
+
+         Returns
+         -------
+         str
+             The diff
+         """
+         return "\n".join(entry.inspect() for entry in self.changes)
+
+
+ def compare_query_plans(left: QueryPlan, right: QueryPlan) -> PlanChangeset:
+     """Computes differences between two query execution plans.
+
+     Parameters
+     ----------
+     left : QueryPlan
+         The first plan to compare
+     right : QueryPlan
+         The second plan to compare
+
+     Returns
+     -------
+     PlanChangeset
+         A diff between the two plans
+     """
+     # FIXME: query plans might contain auxiliary nodes that are currently not handled/recognized
+     if left.find_first_node(lambda node: node.is_auxiliary()) or right.find_first_node(
+         lambda node: node.is_auxiliary()
+     ):
+         raise ValueError(
+             "Comparison of query plans with auxiliary (i.e. non-join and non-scan) operators "
+             "is currently not supported"
+         )
+
+     if left.tables() != right.tables():
+         changeset = [
+             PlanChangeEntry(
+                 "tree-structure", left_state=left.tables(), right_state=right.tables()
+             )
+         ]
+         return PlanChangeset(changeset)
+
+     changes: list[PlanChangeEntry] = []
+
+     left_card_est, right_card_est = (
+         left.estimated_cardinality,
+         right.estimated_cardinality,
+     )
+     left_card_actual, right_card_actual = (
+         left.actual_cardinality,
+         right.actual_cardinality,
+     )
+     left_cost, right_cost = left.estimated_cost, right.estimated_cost
+     if left_card_est != right_card_est and not (
+         math.isnan(left_card_est) and math.isnan(right_card_est)
+     ):
+         changes.append(
+             PlanChangeEntry(
+                 "card-est",
+                 left_state=left_card_est,
+                 right_state=right_card_est,
+                 context=left.tables(),
+             )
+         )
+     if left_card_actual != right_card_actual and not (
+         math.isnan(left_card_actual) and math.isnan(right_card_actual)
+     ):
+         changes.append(
+             PlanChangeEntry(
+                 "actual-card",
+                 left_state=left_card_actual,
+                 right_state=right_card_actual,
+                 context=left.tables(),
+             )
+         )
+     if left_cost != right_cost and not (
+         math.isnan(left_cost) and math.isnan(right_cost)
+     ):
+         changes.append(
+             PlanChangeEntry(
+                 "cost-est",
+                 left_state=left_cost,
+                 right_state=right_cost,
+                 context=left.tables(),
+             )
+         )
+
+     left_op, right_op = left.node_type, right.node_type
+     if left_op != right_op:
+         changes.append(
+             PlanChangeEntry(
+                 "physical-op",
+                 left_state=left_op,
+                 right_state=right_op,
+                 context=left.tables(),
+             )
+         )
+
+     if left.is_join():
+         # We can also assume that right is a join node: both nodes cover the same tables and the left node is a join.
+         join_direction_swap = left.inner_child.tables() == right.outer_child.tables()
+         if join_direction_swap:
+             changes.append(
+                 PlanChangeEntry("join-direction", left_state=left, right_state=right)
+             )
+             changes.extend(
+                 compare_query_plans(left.inner_child, right.outer_child).changes
+             )
+             changes.extend(
+                 compare_query_plans(left.outer_child, right.inner_child).changes
+             )
+         else:
+             changes.extend(
+                 compare_query_plans(left.inner_child, right.inner_child).changes
+             )
+             changes.extend(
+                 compare_query_plans(left.outer_child, right.outer_child).changes
+             )
+
+     return PlanChangeset(changes)
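A usage sketch tying the pieces together (the plan accessor is hypothetical; any two `QueryPlan` instances over the same query work):

    plan_a = database.optimizer().query_plan(query)         # hypothetical accessor
    plan_b = database.optimizer().query_plan(hinted_query)  # e.g. a hinted variant
    print(compare_query_plans(plan_a, plan_b).inspect())
    # Different physical operators on node frozenset({...}): left=... right=...
    # Different cardinality estimates on node frozenset({...}): left=... right=...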