PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. postbound/__init__.py +211 -0
  2. postbound/_base.py +6 -0
  3. postbound/_bench.py +1012 -0
  4. postbound/_core.py +1153 -0
  5. postbound/_hints.py +1373 -0
  6. postbound/_jointree.py +1079 -0
  7. postbound/_pipelines.py +1121 -0
  8. postbound/_qep.py +1986 -0
  9. postbound/_stages.py +876 -0
  10. postbound/_validation.py +734 -0
  11. postbound/db/__init__.py +72 -0
  12. postbound/db/_db.py +2348 -0
  13. postbound/db/_duckdb.py +785 -0
  14. postbound/db/mysql.py +1195 -0
  15. postbound/db/postgres.py +4216 -0
  16. postbound/experiments/__init__.py +12 -0
  17. postbound/experiments/analysis.py +674 -0
  18. postbound/experiments/benchmarking.py +54 -0
  19. postbound/experiments/ceb.py +877 -0
  20. postbound/experiments/interactive.py +105 -0
  21. postbound/experiments/querygen.py +334 -0
  22. postbound/experiments/workloads.py +980 -0
  23. postbound/optimizer/__init__.py +92 -0
  24. postbound/optimizer/__init__.pyi +73 -0
  25. postbound/optimizer/_cardinalities.py +369 -0
  26. postbound/optimizer/_joingraph.py +1150 -0
  27. postbound/optimizer/dynprog.py +1825 -0
  28. postbound/optimizer/enumeration.py +432 -0
  29. postbound/optimizer/native.py +539 -0
  30. postbound/optimizer/noopt.py +54 -0
  31. postbound/optimizer/presets.py +147 -0
  32. postbound/optimizer/randomized.py +650 -0
  33. postbound/optimizer/tonic.py +1479 -0
  34. postbound/optimizer/ues.py +1607 -0
  35. postbound/qal/__init__.py +343 -0
  36. postbound/qal/_qal.py +9678 -0
  37. postbound/qal/formatter.py +1089 -0
  38. postbound/qal/parser.py +2344 -0
  39. postbound/qal/relalg.py +4257 -0
  40. postbound/qal/transform.py +2184 -0
  41. postbound/shortcuts.py +70 -0
  42. postbound/util/__init__.py +46 -0
  43. postbound/util/_errors.py +33 -0
  44. postbound/util/collections.py +490 -0
  45. postbound/util/dataframe.py +71 -0
  46. postbound/util/dicts.py +330 -0
  47. postbound/util/jsonize.py +68 -0
  48. postbound/util/logging.py +106 -0
  49. postbound/util/misc.py +168 -0
  50. postbound/util/networkx.py +401 -0
  51. postbound/util/numbers.py +438 -0
  52. postbound/util/proc.py +107 -0
  53. postbound/util/stats.py +37 -0
  54. postbound/util/system.py +48 -0
  55. postbound/util/typing.py +35 -0
  56. postbound/vis/__init__.py +5 -0
  57. postbound/vis/fdl.py +69 -0
  58. postbound/vis/graphs.py +48 -0
  59. postbound/vis/optimizer.py +538 -0
  60. postbound/vis/plots.py +84 -0
  61. postbound/vis/tonic.py +70 -0
  62. postbound/vis/trees.py +105 -0
  63. postbound-0.19.0.dist-info/METADATA +355 -0
  64. postbound-0.19.0.dist-info/RECORD +67 -0
  65. postbound-0.19.0.dist-info/WHEEL +5 -0
  66. postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
  67. postbound-0.19.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1479 @@
1
+ """Implementation of the TONIC algorithm for learned operator selections [1]_.
2
+
3
+ References
4
+ ----------
5
+
6
+ .. [1] A. Hertzschuch et al.: "Turbo-Charging SPJ Query Plans with Learned Physical Join Operator Selections.", VLDB'2022
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import collections
12
+ import itertools
13
+ import json
14
+ import math
15
+ import random
16
+ from collections.abc import Iterable, Sequence
17
+ from typing import Any, Optional
18
+
19
+ from .. import db, qal, util
20
+ from .._core import ColumnReference, JoinOperator, TableReference
21
+ from .._hints import JoinOperatorAssignment, PhysicalOperatorAssignment
22
+ from .._jointree import JoinTree, jointree_from_plan
23
+ from .._qep import QueryPlan
24
+ from .._stages import PhysicalOperatorSelection
25
+ from ..qal import parser
26
+
27
+ # TODO: there should be more documentation of the technical design of the QEP-S structure
28
+ # More specifically, this documentation should describe the strategies to integrate subquery nodes, and the QEP-S traversal
29
+
30
+
31
def _left_query_plan_child(node: QueryPlan) -> QueryPlan:
    """Provides the node that is treated as the left child of a query plan node.

    Query plans expose their inputs as outer and inner children. This helper maps the outer child to the "left" position.

    Parameters
    ----------
    node : QueryPlan
        The execution plan node whose child should be retrieved

    Returns
    -------
    QueryPlan
        Whatever `node.outer_child` provides. No additional checks are performed here, i.e. the behavior for nodes without
        an outer child is determined by the plan node itself.
    """
    outer = node.outer_child
    return outer
54
+
55
+
56
def _right_query_plan_child(node: QueryPlan) -> QueryPlan:
    """Provides the node that is treated as the right child of a query plan node.

    Query plans expose their inputs as outer and inner children. This helper maps the inner child to the "right" position.

    Parameters
    ----------
    node : QueryPlan
        The execution plan node whose child should be retrieved

    Returns
    -------
    QueryPlan
        Whatever `node.inner_child` provides. No additional checks are performed here, i.e. the behavior for nodes without
        an inner child is determined by the plan node itself.
    """
    inner = node.inner_child
    return inner
78
+
79
+
80
def _iterate_query_plan(current_node: QueryPlan) -> Sequence[QueryPlan]:
    """Collects the join nodes along the deepest path of a query plan, bottom-up.

    Starting at `current_node`, the traversal descends until it reaches a scan. Nodes that are neither scans nor joins are
    skipped by continuing with their input node. At each join, the traversal follows the outer child if it is strictly
    deeper than the inner child and the inner child otherwise.

    Parameters
    ----------
    current_node : QueryPlan
        The node from which the iteration should start

    Returns
    -------
    Sequence[QueryPlan]
        The join nodes along the followed path, starting with the deepest join. Empty if no join is encountered.
    """
    joins_top_down: list[QueryPlan] = []
    node = current_node
    while not node.is_scan():
        if not node.is_join():
            # intermediate (non-join, non-scan) operators are transparent for the join path
            assert node.input_node is not None
            node = node.input_node
            continue

        joins_top_down.append(node)
        outer, inner = node.outer_child, node.inner_child
        node = outer if inner.plan_depth() < outer.plan_depth() else inner

    # we walked from the top of the plan to the bottom, but callers expect the deepest join first
    joins_top_down.reverse()
    return joins_top_down
108
+
109
+
110
def _iterate_join_tree(current_node: JoinTree) -> Sequence[JoinTree]:
    """Collects the join nodes along the deepest path of a join tree, bottom-up.

    Starting at `current_node`, the traversal descends until it reaches a scan. At each join, it follows the outer child
    if it is strictly deeper than the inner child and the inner child otherwise. Every node that is not a scan is expected
    to be a join (this is asserted).

    Parameters
    ----------
    current_node : JoinTree
        The node from which the iteration should start

    Returns
    -------
    Sequence[JoinTree]
        The joins along the followed path, starting with the deepest join. Empty if `current_node` is a scan.
    """
    joins_top_down: list[JoinTree] = []
    node = current_node
    while not node.is_scan():
        assert node.is_join()
        joins_top_down.append(node)
        outer, inner = node.outer_child, node.inner_child
        node = outer if inner.plan_depth() < outer.plan_depth() else inner

    # collected from the root downwards, but the deepest join has to come first
    joins_top_down.reverse()
    return joins_top_down
133
+
134
+
135
def _normalize_filter_predicate(
    tables: TableReference | Iterable[TableReference],
    filter_predicate: Optional[qal.AbstractPredicate],
) -> Optional[qal.AbstractPredicate]:
    """Strips the alias information of specific tables from a predicate.

    Only those of the given tables that are actually referenced by the predicate are considered. All columns of the
    predicate that belong to one of these tables are re-bound to the alias-free version of their table.

    Parameters
    ----------
    tables : TableReference | Iterable[TableReference]
        The tables whose alias information should be removed
    filter_predicate : Optional[qal.AbstractPredicate]
        The predicate to normalize. A falsy predicate (e.g. ``None``) is not processed.

    Returns
    -------
    Optional[qal.AbstractPredicate]
        The predicate with renamed columns, or ``None`` if no predicate was given in the first place.
    """
    if not filter_predicate:
        return None

    candidate_tables: set[TableReference] = set(util.enlist(tables))
    alias_free = {
        tab: tab.drop_alias() for tab in candidate_tables & filter_predicate.tables()
    }

    column_renamings = {}
    for column in filter_predicate.columns():
        if column.table in alias_free:
            column_renamings[column] = ColumnReference(
                column.name, alias_free[column.table]
            )

    return qal.transform.rename_columns_in_predicate(filter_predicate, column_renamings)
165
+
166
+
167
def _tables_in_qeps_path(
    qeps_path: Sequence[QepsIdentifier],
) -> frozenset[TableReference]:
    """Extracts all tables along a QEP-S path.

    Parameters
    ----------
    qeps_path : Sequence[QepsIdentifier]
        The path to analyze

    Returns
    -------
    frozenset[TableReference]
        All tables of all identifiers in the path. For an empty path, the set is empty.
    """
    # `QepsIdentifier.tables` is a property, so it has to be read rather than called.
    # The result is always a frozenset, because `QEPsNode.detect_unknown_costs` uses it as a dictionary key.
    return frozenset().union(*(identifier.tables for identifier in qeps_path))
183
+
184
+
185
class QepsIdentifier:
    """Identifies a node in the QEP-S.

    An identifier is made up of a set of tables and an optional filter predicate. An identifier with exactly one table
    describes a base table node, an identifier with more than one table describes a subquery node (i.e. the join of these
    tables). The filter predicate can be used to tell apart two identifiers over the same tables.

    All tables are stored without their alias. Likewise, the aliases of the identifier's tables are removed from the
    filter predicate. The hash value is computed once during initialization.

    Parameters
    ----------
    tables : TableReference | Iterable[TableReference]
        The tables that constitute the QEP-S node. Subquery nodes consist of multiple tables (the tables in the subquery)
        and scan nodes consist of a single table
    filter_predicate : Optional[qal.AbstractPredicate], optional
        The filter predicate that restricts the allowed tuples of the base table. This does not have any meaning for
        subquery nodes.

    Raises
    ------
    ValueError
        If `tables` is falsy (e.g. ``None`` or an empty collection).
    """

    def __init__(
        self,
        tables: TableReference | Iterable[TableReference],
        filter_predicate: Optional[qal.AbstractPredicate] = None,
    ) -> None:
        if not tables:
            raise ValueError("Tables required")

        alias_free_tables = frozenset(
            tab.drop_alias() for tab in util.enlist(tables)
        )
        # the predicate still references the aliased tables, hence we pass the original tables here
        normalized_predicate = _normalize_filter_predicate(tables, filter_predicate)

        self._tables = alias_free_tables
        self._filter_predicate = normalized_predicate
        self._hash_val = hash((alias_free_tables, normalized_predicate))

    @property
    def table(self) -> Optional[TableReference]:
        """Get the table that is represented by this base table identifier.

        Returns
        -------
        Optional[TableReference]
            The table, or ``None`` if this identifier does not consist of exactly one table.
        """
        if len(self._tables) != 1:
            return None
        return util.collections.get_any(self._tables)

    @property
    def tables(self) -> frozenset[TableReference]:
        """Get the tables that represent this identifier.

        Returns
        -------
        frozenset[TableReference]
            The alias-free tables. For base table identifiers, this is a set of a single table.
        """
        return self._tables

    @property
    def filter_predicate(self) -> Optional[qal.AbstractPredicate]:
        """Get the filter predicate that is used to describe this identifier.

        Returns
        -------
        Optional[qal.AbstractPredicate]
            The normalized predicate. ``None`` if no predicate was specified.
        """
        return self._filter_predicate

    def is_base_table_id(self) -> bool:
        """Checks, whether this identifier describes a normal base table scan.

        Returns
        -------
        bool
            True if the identifier consists of exactly one table, false otherwise
        """
        n_tables = len(self._tables)
        return n_tables == 1

    def is_subquery_id(self) -> bool:
        """Checks, whether this identifier describes a subquery (a branch in the join order).

        Returns
        -------
        bool
            True if the identifier consists of more than one table, false otherwise
        """
        n_tables = len(self._tables)
        return n_tables > 1

    def __json__(self) -> dict:
        return {"tables": self._tables, "filter_predicate": self._filter_predicate}

    def __hash__(self) -> int:
        # pre-computed in __init__
        return self._hash_val

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, type(self)):
            return False
        return (
            self.tables == other.tables
            and self.filter_predicate == other.filter_predicate
        )

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        if len(self.tables) == 1:
            table_str = self.table.identifier()
        else:
            # subquery identifiers are rendered as #tab1#tab2#...
            table_str = "#" + "#".join(tab.identifier() for tab in self.tables)

        filter_str = f"[{self.filter_predicate}]" if self.filter_predicate else ""
        return table_str + filter_str
301
+
302
+
303
+ class QEPsNode:
304
+ """Models a join path with its learned operator costs.
305
+
306
+ QEP-S nodes form a tree structure, with each branch corresponding to a different join path. Each node is identified by
307
+ a `QepsIdentifier` that corresponds to the table or subquery that is joined at this point. The join at each QEP-S node can
308
+ be determined by the tables of its predecessor nodes and the table(s) in its identifier.
309
+
310
+ Each node maintains the costs of different join operators that it has learned so far.
311
+
312
+ Take a look at the fundamental paper on TONIC [1] for more details on the different parameters.
313
+
314
+ Parameters
315
+ ----------
316
+ filter_aware : bool
317
+ Whether child nodes should be created for each joined table (not filter aware), or for each pair of joined table,
318
+ filter predicate on that table (filter aware).
319
+ gamma : float
320
+ Controls the balance between new cost information and learned costs for the physical operators.
321
+ identifier : Optional[QepsIdentifier], optional
322
+ The identifier of this node. Can be ``None`` for the root node of the entire QEP-S or for subquery nodes. All other
323
+ nodes should have a valid identifier.
324
+ parent : Optional[QepsNode], optional
325
+ The predecessor node of this node. Can be ``None`` for the root node of the entire QEP-S or for subquery nodes. All
326
+ other nodes should have a valid parent.
327
+
328
+ Attributes
329
+ ----------
330
+ operator_costs : dict[JoinOperators, float]
331
+ The learned costs of different physical join operators to perform the join of the current path with the identifier
332
+ relation.
333
+ child_nodes : dict[QepsIdentifier, QepsNode]
334
+ The children of the current QEP-S node. Each child corresponds to a different join path and a join of a different
335
+ (potentially intermediate) relation. Children are created automatically as necessary.
336
+
337
+ References
338
+ ----------
339
+
340
+ .. [1] A. Hertzschuch et al.: "Turbo-Charging SPJ Query Plans with Learned Physical Join Operator Selections.", VLDB'2022
341
+ """
342
+
343
def __init__(
    self,
    filter_aware: bool,
    gamma: float,
    *,
    identifier: Optional[QepsIdentifier] = None,
    parent: Optional[QEPsNode] = None,
) -> None:
    # position of this node within the QEP-S
    self._identifier = identifier
    self._parent = parent
    self._subquery_root: Optional[QEPsNode] = None  # only used for subquery nodes

    # configuration, handed down to all nodes that are created from this node
    self.filter_aware = filter_aware
    self.gamma = gamma

    # learned state: unknown operators start with the float default, child nodes are created via _init_qeps
    self.operator_costs: dict[JoinOperator, float] = collections.defaultdict(float)
    self.child_nodes = util.dicts.DynamicDefaultDict(self._init_qeps)
358
+
359
@property
def subquery_root(self) -> QEPsNode:
    """The subquery that starts at the current node.

    The subquery root is created lazily on first access, using the same filter awareness and gamma value as the current
    node. It is created without a parent and without an identifier. Accessing this property therefore turns the
    current node into a subquery node.

    Returns
    -------
    QEPsNode
        The root node of the QEP-S that models the subquery.
    """
    root = self._subquery_root
    if root is None:
        root = QEPsNode(self.filter_aware, self.gamma)
        self._subquery_root = root
    return root
376
+
377
def is_root_node(self) -> bool:
    """Checks, if the current QEP-S node is a root node.

    Returns
    -------
    bool
        Whether the node is a root, i.e. a QEP-S node without a parent
    """
    has_parent = self._parent is not None
    return not has_parent
386
+
387
def path(self) -> Sequence[QepsIdentifier]:
    """Provides the join path that leads to the current node.

    The path contains the identifiers of all predecessor nodes followed by the identifier of the current node.

    Returns
    -------
    Sequence[QepsIdentifier]
        All identifiers in sequence, starting from the root. Nodes without an identifier (such as the root node) have an
        empty path.
    """
    own_identifier = self._identifier
    if not own_identifier:
        return []

    if self._parent:
        prefix = self._parent.path()
    else:
        prefix = []

    if prefix:
        return prefix + [own_identifier]
    return [own_identifier]
401
+
402
def tables(self) -> frozenset[TableReference]:
    """Provides all tables along the join path that leads to the current node.

    Returns
    -------
    frozenset[TableReference]
        The tables of all identifiers along the path. Only the tables of the identifiers themselves are included, tables
        from their filter predicates are not considered.
    """
    tables_per_identifier = (qeps_id.tables for qeps_id in self.path())
    return frozenset(util.set_union(tables_per_identifier))
412
+
413
def recommend_operators(
    self,
    query: qal.SqlQuery,
    join_order: Sequence[JoinTree],
    current_assignment: PhysicalOperatorAssignment,
    *,
    _skip_first_table: bool = False,
) -> None:
    """Inserts the operator with the minimum cost into an operator assignment.

    This method consumes the join order step-by-step, navigating the QEP-S tree along its path. The recommendation
    automatically continues with the next child node.

    In case of an unknown join order, the required QEP-S nodes are created on the fly (child nodes are created on first
    access), which prepares the QEP-S to store costs of that sequence later on.

    Parameters
    ----------
    query : qal.SqlQuery
        The query for which operators should be recommended. This parameter is necessary to infer the applicable filter
        predicates for base tables.
    join_order : Sequence[JoinTree]
        A path to navigate the QEP-S tree. The recommendation logic consumes the next join and supplies all future joins to
        the applicable child node.
    current_assignment : PhysicalOperatorAssignment
        Operators that have already been recommended. This structure is successively inflated with repeated recommendation
        calls.
    _skip_first_table : bool, optional
        Internal parameter that should only be set by the QEP-S implementation. This parameter is required to correctly
        start the path traversal at the bottom join.
    """
    if not join_order:
        return

    next_join, *remaining_joins = join_order

    # The recommendation of the current node applies to the join of all tables on the path up to this node
    recommendation = self.current_recommendation()
    if recommendation:
        current_assignment.set_join_operator(
            JoinOperatorAssignment(recommendation, self.tables())
        )

    if next_join.is_bushy():
        # NOTE(review): integrate_costs treats the shallower child of a bushy join as the subquery, whereas the
        # deeper-or-equal child is selected here - confirm which one is intended.
        subquery_child = (
            next_join.outer_child
            if next_join.outer_child.plan_depth()
            >= next_join.inner_child.plan_depth()
            else next_join.inner_child
        )
        qeps_subquery_id = QepsIdentifier(subquery_child.tables())
        qeps_subquery_node = self.child_nodes[qeps_subquery_id]
        # operators within the subquery are recommended by a dedicated QEP-S that is rooted at the subquery node
        qeps_subquery_node.subquery_root.recommend_operators(
            query, _iterate_join_tree(subquery_child), current_assignment
        )
        qeps_subquery_node.recommend_operators(
            query, remaining_joins, current_assignment
        )
        return

    if next_join.is_base_join():
        first_table, second_table = (
            next_join.outer_child.base_table,
            next_join.inner_child.base_table,
        )
        # normalize the order of the two base tables such that the same join always follows the same QEP-S path
        first_table, second_table = (
            (second_table, first_table)
            if second_table < first_table
            else (first_table, second_table)
        )

        if not _skip_first_table:
            # descend to the node of the first table and let that node handle the same join again
            qeps_child_id = self._make_identifier(query, first_table)
            qeps_child_node = self.child_nodes[qeps_child_id]
            qeps_child_node.recommend_operators(
                query, join_order, current_assignment, _skip_first_table=True
            )
            return
        else:
            next_table = second_table
    else:
        # join between intermediate (our current QEP-S path) and a base table (next node in our QEP-S path)
        # If the inner child is not the scan, the base table has to be the outer child of the join itself. This is the
        # same selection that detect_unknown_costs uses, which keeps both traversals on the same QEP-S path.
        next_table = (
            next_join.inner_child.base_table
            if next_join.inner_child.is_scan()
            else next_join.outer_child.base_table
        )

    qeps_child_id = self._make_identifier(query, next_table)
    qeps_child_node = self.child_nodes[qeps_child_id]
    qeps_child_node.recommend_operators(query, remaining_joins, current_assignment)
501
+
502
def integrate_costs(
    self,
    query: qal.SqlQuery,
    query_plan: Sequence[QueryPlan],
    *,
    _skip_first_table: bool = False,
) -> None:
    """Updates the internal cost model with the costs of the execution plan nodes.

    Notice that the costs of the plan nodes can be calculated using arbitrary strategies and do not need to originate from
    a physical database system. This allows the usage of arbitrary cost models.

    Parameters
    ----------
    query : qal.SqlQuery
        The query which is used to determine new costs. This parameter is necessary to infer the applicable filter
        predicates for base tables.
    query_plan : Sequence[QueryPlan]
        A sequence of join nodes that provide the updated cost information. The update logic consumes the costs of the
        first join and delegates all further updates to the next child node. Plan nodes that are not joins are skipped.
    _skip_first_table : bool, optional
        Internal parameter that should only be set by the QEP-S implementation. This parameter is required to correctly
        start the path traversal at the bottom join.

    Raises
    ------
    ValueError
        Raised by `update_costs` if a join node does not provide an operator, or if its cost is NaN or infinite.

    Notes
    -----
    The implementation of the cost integration uses a "look ahead" approach. This means that each QEP-S node determines the
    next QEP-S node based on the first join in the plan sequence. This node corresponds to a child node of the current
    QEP-S node. If no such child exists, it will be created. Once the next QEP-S node is determined, it is updated with the
    costs of the plan node. Afterwards, the cost integration continues with the next plan node on the next QEP-S node.
    """
    if not query_plan:
        return

    next_node, *remaining_nodes = query_plan
    if not next_node.is_join():
        # Only joins carry operator costs. Skip everything else without changing our position in the QEP-S.
        self.integrate_costs(
            query, remaining_nodes, _skip_first_table=_skip_first_table
        )
        return

    first_child, second_child = (
        _left_query_plan_child(next_node),
        _right_query_plan_child(next_node),
    )
    if next_node.is_bushy():
        # NOTE(review): the shallower child is treated as the subquery here, whereas recommend_operators and
        # detect_unknown_costs select the deeper-or-equal child - confirm which one is intended.
        first_child, second_child = (
            (second_child, first_child)
            if second_child.plan_depth() < first_child.plan_depth()
            else (first_child, second_child)
        )
        qeps_subquery_id = QepsIdentifier(first_child.tables())
        qeps_subquery_node = self.child_nodes[qeps_subquery_id]
        qeps_subquery_node.update_costs(
            next_node.operator, next_node.estimated_cost
        )
        # the joins within the subquery are stored in a dedicated QEP-S that is rooted at the subquery node
        qeps_subquery_node.subquery_root.integrate_costs(
            query, _iterate_query_plan(first_child)
        )
        qeps_subquery_node.integrate_costs(query, remaining_nodes)
        return
    elif next_node.is_base_join():
        # normalize the order of the two base tables such that the same join always follows the same QEP-S path
        first_child, second_child = (
            (second_child, first_child)
            if second_child.fetch_base_table() < first_child.fetch_base_table()
            else (first_child, second_child)
        )
        if not _skip_first_table:
            # descend to the node of the first table and let that node handle the same join again
            qeps_child_id = self._make_identifier(
                query, first_child.fetch_base_table()
            )
            qeps_child_node = self.child_nodes[qeps_child_id]
            qeps_child_node.integrate_costs(
                query, query_plan, _skip_first_table=True
            )
            return
        else:
            child_node = second_child
    else:
        # join between intermediate (our current QEP-S path) and a base table (next node in our QEP-S path)
        child_node = first_child if first_child.is_scan_branch() else second_child

    child_table = child_node.fetch_base_table()
    qeps_child_id = self._make_identifier(query, child_table)
    qeps_child_node = self.child_nodes[qeps_child_id]
    qeps_child_node.update_costs(next_node.operator, next_node.estimated_cost)
    qeps_child_node.integrate_costs(query, remaining_nodes)
592
+
593
def detect_unknown_costs(
    self,
    query: qal.SqlQuery,
    join_order: Sequence[JoinTree],
    allowed_operators: frozenset[JoinOperator],
    unknown_ops: dict[frozenset[TableReference], frozenset[JoinOperator]],
    _skip_first_table: bool = False,
) -> None:
    """Collects all joins along a join order that lack cost information for some of the allowed operators.

    The traversal follows the join order through the QEP-S. Each visited node that is neither a root node nor the direct
    child of a root node stores those allowed operators for which it has no costs in `unknown_ops`. The entry is keyed
    by the tables on the path to that node. `unknown_ops` therefore acts as an *output* parameter.

    Parameters
    ----------
    query : qal.SqlQuery
        The query describing the filter predicates to navigate the QEP-S
    join_order : Sequence[JoinTree]
        The join order to navigate the QEP-S
    allowed_operators : frozenset[JoinOperator]
        Operators for which cost information should exist
    unknown_ops : dict[frozenset[TableReference], frozenset[JoinOperator]]
        The unknown operators that have been detected so far. Modified in-place.
    _skip_first_table : bool, optional
        Internal parameter that should only be set by the QEP-S implementation. This parameter is required to correctly
        start the path traversal at the bottom join.
    """
    if not join_order:
        return

    # root nodes and their direct children do not correspond to a join, hence there is nothing to check for them
    if not (self.is_root_node() or self._parent.is_root_node()):
        missing_operators = frozenset(
            op for op in allowed_operators if op not in self.operator_costs
        )
        unknown_ops[_tables_in_qeps_path(self.path())] = missing_operators

    head, *tail = join_order

    if head.is_bushy():
        if head.outer_child.plan_depth() >= head.inner_child.plan_depth():
            subquery_child = head.outer_child
        else:
            subquery_child = head.inner_child

        subquery_node = self.child_nodes[QepsIdentifier(subquery_child.tables())]
        # the subquery itself is checked on its dedicated QEP-S, the remaining joins continue on the subquery node
        subquery_node.subquery_root.detect_unknown_costs(
            query,
            _iterate_join_tree(subquery_child),
            allowed_operators,
            unknown_ops,
        )
        subquery_node.detect_unknown_costs(query, tail, allowed_operators, unknown_ops)
        return

    if head.is_base_join():
        first_table = head.outer_child.base_table
        second_table = head.inner_child.base_table
        # normalize the order of the two base tables such that the same join always follows the same QEP-S path
        if second_table < first_table:
            first_table, second_table = second_table, first_table

        if not _skip_first_table:
            # descend to the node of the first table and let that node handle the same join again
            first_node = self.child_nodes[self._make_identifier(query, first_table)]
            first_node.detect_unknown_costs(
                query,
                join_order,
                allowed_operators,
                unknown_ops,
                _skip_first_table=True,
            )
            return

        next_table = second_table
    elif head.inner_child.is_scan():
        # join between intermediate (our current QEP-S path) and a base table (next node in our QEP-S path)
        next_table = head.inner_child.base_table
    else:
        next_table = head.outer_child.base_table

    next_node = self.child_nodes[self._make_identifier(query, next_table)]
    next_node.detect_unknown_costs(query, tail, allowed_operators, unknown_ops)
692
+
693
def current_recommendation(self) -> Optional[JoinOperator]:
    """Provides the operator with the minimum cost.

    Returns
    -------
    Optional[JoinOperator]
        The operator with the lowest learned cost. If costs are known for at most one operator, no recommendation is
        made and ``None`` is returned.
    """
    if len(self.operator_costs) > 1:
        return util.argmin(self.operator_costs)
    return None
704
+
705
def update_costs(self, operator: JoinOperator, cost: float) -> None:
    """Updates the cost of a specific operator for this node.

    The new cost of the operator is the supplied cost plus the previously learned cost, weighted by gamma.

    Parameters
    ----------
    operator : JoinOperator
        The operator whose costs should be updated.
    cost : float
        The new cost information.

    Raises
    ------
    ValueError
        If no operator is given, or if the cost is not a finite number (i.e. NaN or infinity)
    """
    if not operator or not math.isfinite(cost):
        raise ValueError("Operator and cost required")

    learned_cost = self.operator_costs[operator]
    self.operator_costs[operator] = cost + self.gamma * learned_cost
724
+
725
def inspect(self, *, _current_indentation: int = 0) -> str:
    """Provides a hierarchical text representation of the QEP-S structure.

    The representation typically spans multiple lines. For each node, it contains the learned costs, followed by the
    subquery (if one exists) and the child nodes.

    Parameters
    ----------
    _current_indentation : int, optional
        Internal parameter to the `inspect` function. Should not be modified by the user. Denotes the indentation of the
        current node. A value of 0 (the default) marks the root of the inspection.

    Returns
    -------
    str
        A string representation of the QEP-S
    """
    if not _current_indentation:
        return "[ROOT]\n" + self._child_inspect(2)

    prefix = " " * _current_indentation
    entries = [prefix + self._cost_str()]

    # check the private attribute first: accessing the property would create a subquery root
    if self._subquery_root:
        subquery_content = self.subquery_root.inspect(
            _current_indentation=_current_indentation + 2
        )
        if subquery_content:
            entries.append(f"{prefix}[SQ] ->\n{subquery_content}")

    child_content = self._child_inspect(_current_indentation)
    if child_content:
        entries.append(f"{prefix}[CHILD] ->\n{child_content}")
    else:
        entries.append(f"{prefix}[no children]")

    return "\n".join(entry for entry in entries if entry)
766
+
767
def _init_qeps(self, identifier: QepsIdentifier) -> QEPsNode:
    """Creates a new child node for a specific identifier.

    The new node uses the same filter awareness and gamma value as the current node. The current node becomes the parent
    of the new node.

    Parameters
    ----------
    identifier : QepsIdentifier
        The identifier of the new node

    Returns
    -------
    QEPsNode
        The new node
    """
    child = QEPsNode(
        self.filter_aware, self.gamma, identifier=identifier, parent=self
    )
    return child
786
+
787
def _make_identifier(
    self, query: qal.SqlQuery, table: TableReference | Iterable[TableReference]
) -> QepsIdentifier:
    """Builds the QEP-S identifier for one or several tables.

    Whether the identifier carries a filter predicate depends on the configuration of this node: filter predicates are
    only looked up in the query if the node is filter-aware.

    Parameters
    ----------
    query : qal.SqlQuery
        The query for which the QEP-S identifier should be created. It is only consulted to infer the filter predicates
        on `table` if this node is filter-aware.
    table : TableReference | Iterable[TableReference]
        The table(s) that should be stored in the identifier.

    Returns
    -------
    QepsIdentifier
        The identifier
    """
    simplified = util.simplify(table)
    if self.filter_aware:
        filters = query.predicates().filters_for(simplified)
    else:
        filters = None
    return QepsIdentifier(simplified, filters)
813
+
814
def _child_inspect(self, indentation: int) -> str:
    """Worker method that renders the inspection text of all child nodes.

    Each child is rendered as a ``QEP-S node <identifier>`` header followed by its own (deeper indented) inspection.
    Different children are separated by a ``-----`` line.

    Parameters
    ----------
    indentation : int
        The current indentation level. Children are inspected with an indentation that is two characters larger.

    Returns
    -------
    str
        The inspection text. This is an empty string if the node does not have any children.
    """
    padding = " " * indentation

    def _render(child_id, node) -> str:
        nested = node.inspect(_current_indentation=indentation + 2)
        return f"{padding}QEP-S node {child_id}\n{nested}"

    rendered_children = [
        _render(child_id, node) for child_id, node in self.child_nodes.items()
    ]
    separator = f"\n{padding}-----\n"
    return separator.join(rendered_children)
833
+
834
def _cost_str(self) -> str:
    """Formats the cost information of this node in a human-readable way.

    Returns
    -------
    str
        A bracketed, comma-separated list of ``<operator>=<cost>`` entries, or ``[no cost]`` if the node does not store
        any cost information.
    """
    entries = [
        f"{join_op.value}={op_cost}"
        for join_op, op_cost in self.operator_costs.items()
    ]
    if not self.operator_costs:
        return "[no cost]"
    joined = ", ".join(entries)
    return f"[{joined}]"
846
+
847
def __json__(self) -> dict:
    """Provides the JSON-serializable representation of this node.

    Returns
    -------
    dict
        A dictionary with three entries: ``costs`` maps the operator values to their cost, ``children`` lists the
        identifier/node pairs of all child nodes and ``subquery`` contains the subquery root (which might be unset).
    """
    costs = {}
    for join_op, op_cost in self.operator_costs.items():
        costs[join_op.value] = op_cost

    children = []
    for child_id, child in self.child_nodes.items():
        children.append({"identifier": child_id, "node": child})

    return {"costs": costs, "children": children, "subquery": self._subquery_root}
860
+
861
def __bool__(self) -> bool:
    """A node is truthy as soon as it has at least one child node or one cost entry."""
    if len(self.child_nodes) > 0:
        return True
    return len(self.operator_costs) > 0
863
+
864
def __repr__(self) -> str:
    """Uses the same text as the informal string conversion of the node."""
    rendered = str(self)
    return rendered
866
+
867
def __str__(self) -> str:
    """Describes the node by its path through the QEP-S (or ``[ROOT]`` for an empty path) and its costs."""
    path = self.path()
    if path:
        label = " -> ".join(map(str, path))
    else:
        label = "[ROOT]"
    cost_text = self._cost_str()
    return f"{label} {cost_text}"
876
+
877
+
878
class QEPSynopsis:
    """The plan synopsis maintains a hierarchy of QEP-S nodes, starting at a single root node.

    Most of the methods of the synopsis are thin wrappers that delegate to the root node.

    Parameters
    ----------
    root : QEPsNode
        The root node of the QEP-S tree. This node does not have any predecessor, nor an identifier.

    See Also
    --------
    QEPsNode
    """

    @staticmethod
    def create(filter_aware: bool, gamma: float) -> QEPSynopsis:
        """Generates a new, empty synopsis with specific settings.

        Parameters
        ----------
        filter_aware : bool
            Whether filter predicates should be included in the QEP-S identifiers.
        gamma : float
            The update factor to balance recency and learning of cost information.

        Returns
        -------
        QEPSynopsis
            The synopsis
        """
        return QEPSynopsis(QEPsNode(filter_aware, gamma))

    def __init__(self, root: QEPsNode) -> None:
        self.root = root

    def recommend_operators(
        self, query: qal.SqlQuery, join_order: JoinTree
    ) -> PhysicalOperatorAssignment:
        """Provides the optimal operators according to the current QEP-S for a specific join order.

        Parameters
        ----------
        query : qal.SqlQuery
            The query for which the operators should be optimized
        join_order : JoinTree
            A join order to traverse the QEP-S

        Returns
        -------
        PhysicalOperatorAssignment
            The best operators as learned by the QEP-S
        """
        # The root node fills the assignment in-place while it walks the join order
        assignment = PhysicalOperatorAssignment()
        traversal = _iterate_join_tree(join_order)
        self.root.recommend_operators(query, traversal, assignment)
        return assignment

    def integrate_costs(self, query: qal.SqlQuery, query_plan: QueryPlan) -> None:
        """Updates the cost information of the QEP-S with the costs from the query plan.

        Parameters
        ----------
        query : qal.SqlQuery
            The query corresponding to the execution plan
        query_plan : QueryPlan
            An execution plan providing the operators and their costs. This information is used for the QEP-S traversal
            as well as the actual update.
        """
        plan_traversal = _iterate_query_plan(query_plan)
        self.root.integrate_costs(query, plan_traversal)

    def detect_unknown_costs(
        self,
        query: qal.SqlQuery,
        join_order: JoinTree,
        allowed_operators: set[JoinOperator],
    ) -> dict[frozenset[TableReference], frozenset[JoinOperator]]:
        """Collects all joins in the QEP-S that do not have cost information for all possible operators.

        Parameters
        ----------
        query : qal.SqlQuery
            The query describing the filter predicates to navigate the QEP-S
        join_order : JoinTree
            The join order to navigate the QEP-S
        allowed_operators : set[JoinOperator]
            Operators for which cost information should exist. If the node does not have a cost information for any of
            the operators, this is an unknown cost

        Returns
        -------
        dict[frozenset[TableReference], frozenset[JoinOperator]]
            A mapping from join to the unknown operators at that join. If a join is not contained in the mapping, it is
            either not contained in the `join_order`, or it has cost information for all operators.
        """
        # The mapping is handed to the root node, which populates it during the traversal
        missing: dict[frozenset[TableReference], frozenset[JoinOperator]] = {}
        traversal = _iterate_join_tree(join_order)
        self.root.detect_unknown_costs(query, traversal, allowed_operators, missing)
        return missing

    def reset(self) -> None:
        """Removes all learned information from the QEP-S.

        This does not only include cost information, but also the tree structure itself. The filter awareness and the
        gamma value of the old root node are retained.
        """
        previous_root = self.root
        self.root = QEPsNode(previous_root.filter_aware, previous_root.gamma)

    def inspect(self) -> str:
        """Provides a nice hierarchical representation of the QEP-S structure.

        The representation typically spans multiple lines and uses indentation to separate parent nodes from their
        children.

        Returns
        -------
        str
            A string representation of the QEP-S
        """
        return self.root.inspect()

    def __json__(self) -> dict:
        root = self.root
        return {"root": root, "gamma": root.gamma, "filter_aware": root.filter_aware}
1007
+
1008
+
1009
def _load_qeps_id_from_json(json_data: dict) -> QepsIdentifier:
    """Creates a QEP-S identifier from its JSON representation.

    This undoes the JSON-serialization via the ``__json__`` method on identifier instances. The tables are read from the
    ``tables`` entry and the (optional) filter predicate is read from the ``filter_predicate`` entry of the encoding.

    Parameters
    ----------
    json_data : dict
        The encoded identifier

    Returns
    -------
    QepsIdentifier
        The corresponding identifier object

    Raises
    ------
    ValueError
        If the encoding does not contain any tables (presumably raised by the `QepsIdentifier` constructor, which is
        not visible from this function)
    """
    encoded_tables = json_data.get("tables", [])
    tables = list(map(parser.load_table_json, encoded_tables))

    encoded_filter = json_data.get("filter_predicate")
    filter_pred = parser.load_predicate_json(encoded_filter, {})

    return QepsIdentifier(tables, filter_pred)
1036
+
1037
+
1038
def _load_qeps_from_json(
    json_data: dict,
    qeps_id: Optional[QepsIdentifier],
    parent: Optional[QEPsNode],
    filter_aware: bool,
    gamma: float,
) -> QEPsNode:
    """Creates a QEP-S node from its JSON representation.

    The node is decoded recursively: the subquery branch as well as all child nodes are loaded with the same
    `filter_aware` and `gamma` settings as the node itself.

    Parameters
    ----------
    json_data : dict
        The encoded node data. The ``costs``, ``children`` and ``subquery`` entries are all optional.
    qeps_id : Optional[QepsIdentifier]
        The identifier of the node. Can be ``None`` for root nodes.
    parent : Optional[QEPsNode]
        The parent of the node. Can be ``None`` for root nodes.
    filter_aware : bool
        Whether child identifiers should also consider the filter predicates that are applied to base tables.
    gamma : float
        Mediation factor for recent and previous cost information

    Returns
    -------
    QEPsNode
        The node instance

    Raises
    ------
    KeyError
        If any of the child node encodings does not contain an identifier
    KeyError
        If any of the child node encodings does not contain an actual node encoding
    """
    node = QEPsNode(filter_aware, gamma, identifier=qeps_id, parent=parent)

    cost_info = {
        JoinOperator(operator_str): cost
        for operator_str, cost in json_data.get("costs", {}).items()
    }

    # The serialization always emits the "subquery" key, even if the node has no subquery branch (the value is null
    # in that case). Therefore, the value has to be checked rather than the mere presence of the key.
    subquery_json = json_data.get("subquery")
    subquery = (
        _load_qeps_from_json(subquery_json, None, None, filter_aware, gamma)
        if subquery_json is not None
        else None
    )

    children: dict[QepsIdentifier, QEPsNode] = {}
    for child_json in json_data.get("children", []):
        child_id = _load_qeps_id_from_json(child_json["identifier"])
        # Each child entry wraps its own node encoding, the encoding must not be looked up on the parent data
        child_node = _load_qeps_from_json(
            child_json["node"], child_id, node, filter_aware, gamma
        )
        children[child_id] = child_node

    # NOTE(review): these assignments replace whatever containers the QEPsNode constructor set up with plain dicts.
    # Confirm that the node does not rely on special container types (e.g. default dictionaries) for these attributes.
    node.operator_costs = cost_info
    node._subquery_root = subquery
    node.child_nodes = children
    return node
1095
+
1096
+
1097
def make_qeps(
    path: Iterable[TableReference],
    root: Optional[QEPsNode] = None,
    *,
    gamma: float = 0.8,
) -> QEPsNode:
    """Generates a QEP-S for the given join path.

    The path is followed by looking up one child node per table, starting at the root node. This presumably relies on
    the child node container creating missing nodes upon lookup - the container is not defined in this function.

    Parameters
    ----------
    path : Iterable[TableReference]
        The join sequence corresponding to the branch in the QEP-S.
    root : Optional[QEPsNode], optional
        An optional root node. If this is specified, a branch below that node is inserted. This can be used to construct
        bushy QEP-S via repeated calls to `make_qeps`.
    gamma : float, optional
        The update factor to balance recency and learning of cost information. Defaults to 0.8. This value is only used
        if a new root node has to be created.

    Returns
    -------
    QEPsNode
        The root of the QEP-S. If the root node was created by this function, the synopsis is not filter-aware.
    """
    if root is None:
        root = QEPsNode(False, gamma)

    cursor = root
    for table in path:
        cursor = cursor.child_nodes[QepsIdentifier(table)]
    return root
1125
+
1126
+
1127
def _obtain_accurate_cost_estimate(
    query: qal.SqlQuery, database: db.Database
) -> QueryPlan:
    """Determines the cost information for a query based on the actual cardinalities of the execution plan.

    The query is first analyzed to obtain its plan. The plan - converted to use its actual cardinalities - is then
    encoded as hints for the query, and the optimizer is asked for the plan of the hinted query. This simulates a cost
    model with perfect input data.

    Parameters
    ----------
    query : qal.SqlQuery
        The query to generate the estimate for. This should be a query with a hint block that describes the physical
        query plan. However, this is not required.
    database : db.Database
        The database which provides the cost model.

    Returns
    -------
    QueryPlan
        The execution plan with cost information
    """
    analyzed_plan = database.optimizer().analyze_plan(query)
    true_card_plan = analyzed_plan.with_actual_card()
    hinted_query = database.hinting().generate_hints(query, true_card_plan)
    return database.optimizer().query_plan(hinted_query)
1152
+
1153
+
1154
def _generate_all_cost_estimates(
    query: qal.SqlQuery,
    join_order: JoinTree,
    available_operators: dict[frozenset[TableReference], frozenset[JoinOperator]],
    database: db.Database,
) -> Iterable[QueryPlan]:
    """Provides all cost estimates based on plans with specific operator combinations.

    For each element of the cartesian product of the per-join operator candidates, a hinted query is generated and its
    cost is determined via `_obtain_accurate_cost_estimate`, i.e. based on the true cardinalities of the intermediates.

    Parameters
    ----------
    query : qal.SqlQuery
        The query for which the cost estimates should be generated
    join_order : JoinTree
        The join order to use
    available_operators : dict[frozenset[TableReference], frozenset[JoinOperator]]
        A mapping from joins to allowed operators. All possible combinations will be explored.
    database : db.Database
        The database to use for the query execution and cost estimation.

    Returns
    -------
    Iterable[QueryPlan]
        All query plans with the actual costs.
    """
    joins = list(available_operators.keys())
    candidates_per_join = list(available_operators.values())

    collected_plans = []
    for selection in itertools.product(*candidates_per_join):
        # The i-th selected operator belongs to the i-th join
        assignment = PhysicalOperatorAssignment()
        for join, join_op in zip(joins, selection):
            assignment.set_join_operator(JoinOperatorAssignment(join_op, join))

        hinted_query = database.hinting().generate_hints(
            query, join_order=join_order, physical_operators=assignment
        )
        accurate_plan = _obtain_accurate_cost_estimate(hinted_query, database)
        collected_plans.append(accurate_plan)
    return collected_plans
1197
+
1198
+
1199
def _sample_cost_estimates(
    query: qal.SqlQuery,
    join_order: JoinTree,
    available_operators: dict[frozenset[TableReference], frozenset[JoinOperator]],
    n_samples: int,
    database: db.Database,
) -> Iterable[QueryPlan]:
    """Generates cost estimates based on sampled plans with specific operator combinations.

    Each sample picks one random operator per join. Assignments whose hash was already encountered are skipped. For all
    other assignments a hinted query is generated and its cost is determined via `_obtain_accurate_cost_estimate`, i.e.
    based on the true cardinalities of the intermediates.

    Parameters
    ----------
    query : qal.SqlQuery
        The query for which the cost estimates should be generated
    join_order : JoinTree
        The join order to use
    available_operators : dict[frozenset[TableReference], frozenset[JoinOperator]]
        A mapping from joins to allowed operators. The actual operator assignments will be sampled from this mapping.
    n_samples : int
        The number of samples to generate. At most ``3 * n_samples`` sampling attempts are made. Therefore, fewer plans
        are returned if there are fewer unique plans than requested samples, or if too many attempts produce an
        assignment that was already sampled.
    database : db.Database
        The database to use for the query execution and cost estimation.

    Returns
    -------
    Iterable[QueryPlan]
        Query plans with the actual costs
    """
    sampled_plans: list[QueryPlan] = []
    seen_hashes = set()
    attempts = 0
    attempt_limit = 3 * n_samples

    while len(sampled_plans) < n_samples and attempts < attempt_limit:
        attempts += 1

        candidate = PhysicalOperatorAssignment()
        for join, operators in available_operators.items():
            picked = random.choice(list(operators))
            candidate.set_join_operator(JoinOperatorAssignment(picked, join))

        # Duplicates still count towards the attempt limit, which guarantees termination
        fingerprint = hash(candidate)
        if fingerprint in seen_hashes:
            continue
        seen_hashes.add(fingerprint)

        hinted_query = database.hinting().generate_hints(
            query, join_order=join_order, physical_operators=candidate
        )
        sampled_plans.append(_obtain_accurate_cost_estimate(hinted_query, database))

    return sampled_plans
1257
+
1258
+
1259
class TonicOperatorSelection(PhysicalOperatorSelection):
    """Implementation of the TONIC/QEP-S learned operator recommendation.

    The implementation supports bushy join orders, plain QEP-S and filter-aware QEP-S

    Parameters
    ----------
    filter_aware : bool, optional
        Whether to use the filter-aware QEP-S or the plain QEP-S. Defaults to ``False``, which creates a plain QEP-S.
    gamma : float, optional
        Cost update factor to mediate the bias towards more recent cost information. Defaults to 0.8
    database : Optional[db.Database], optional
        A database to use for the incorporation of native operator costs. If this parameter is omitted, it will be
        inferred from the database pool.

    References
    ----------

    .. [1] A. Hertzschuch et al.: "Turbo-Charging SPJ Query Plans with Learned Physical Join Operator Selections.", VLDB'2022
    """

    @staticmethod
    def load_model(
        filename: str,
        database: Optional[db.Database] = None,
        *,
        encoding: str = "utf-8",
    ) -> TonicOperatorSelection:
        """Re-generates a pre-trained TONIC QEP-S model from disk.

        The model has to be encoded in a JSON file as generated by the jsonize utility

        Parameters
        ----------
        filename : str
            The file that contains the JSON model
        database : Optional[db.Database], optional
            The database that should be used for training the model. If omitted, the database is inferred from the
            `DatabasePool`.
        encoding : str, optional
            Encoding of the model file, by default "utf-8"

        Returns
        -------
        TonicOperatorSelection
            The TONIC model

        Raises
        ------
        KeyError
            If the model file does not contain a ``root`` entry
        """
        with open(filename, "r", encoding=encoding) as json_file:
            json_data: dict = json.load(json_file)

        # Missing settings fall back to the defaults of the constructor
        filter_aware = json_data.get("filter_aware", False)
        gamma = json_data.get("gamma", 0.8)
        qeps_root = _load_qeps_from_json(
            json_data["root"], None, None, filter_aware, gamma
        )

        tonic_model = TonicOperatorSelection(filter_aware, gamma, database=database)
        tonic_model.qeps = QEPSynopsis(qeps_root)
        return tonic_model

    def __init__(
        self,
        filter_aware: bool = False,
        gamma: float = 0.8,
        *,
        database: Optional[db.Database] = None,
    ) -> None:
        super().__init__()
        self.filter_aware = filter_aware
        self.gamma = gamma
        self.qeps = QEPSynopsis.create(filter_aware, gamma)
        self._db = (
            database if database else db.DatabasePool.get_instance().current_database()
        )

    def integrate_cost(
        self, query: qal.SqlQuery, query_plan: Optional[QueryPlan] = None
    ) -> None:
        """Uses cost information from a query plan to update the QEP-S costs.

        Notice that the costs stored in the query plan do not need to correspond to native costs. Instead, the costs
        can be calculated using arbitrary cost models.

        Parameters
        ----------
        query : qal.SqlQuery
            The query for which the query plan was created.
        query_plan : Optional[QueryPlan], optional
            The query plan which contains the cost information. If this parameter is omitted, the native optimizer of
            the `database` will be queried to obtain the costs of the input query. Notice that this enables the
            integration of costs for arbitrary query plans by setting the hint block of the query.
        """
        if query_plan is None:
            query_plan = self._db.optimizer().query_plan(query)
        self.qeps.integrate_costs(query, query_plan)

    def simulate_feedback(self, query: qal.SqlQuery) -> None:
        """Updates the QEP-S cost information with feedback from a specific query.

        This feedback process operates in two stages: in the first stage, the query is executed in *analyze* mode on
        the native optimizer of the database. This results in two crucial sets of information: the actual physical
        query plan, as well as the true cardinalities at each operator. In the second phase, the same input query is
        enriched with the former plan information, as well as the true cardinalities. For this modified query the
        native optimizer is once again used to obtain a query plan. However, this time the cost information is based on
        the former query plan, but with the true cardinalities. Therefore, it resembles the true cost of the query for
        the database system. Finally, this cost information is used to update the QEP-S.

        Parameters
        ----------
        query : qal.SqlQuery
            The query to obtain the cost for
        """
        query_plan = self._db.optimizer().analyze_plan(query)
        hinted_query = self._db.hinting().generate_hints(
            query, query_plan.with_actual_card()
        )
        # Without an explicit plan, integrate_cost asks the optimizer for the plan of the hinted query
        self.integrate_cost(hinted_query)

    def explore_costs(
        self,
        query: qal.SqlQuery,
        join_order: Optional[JoinTree] = None,
        *,
        allowed_operators: Optional[Iterable[JoinOperator]] = None,
        max_combinations: Optional[int] = None,
    ) -> None:
        """Generates cost information along a specific path in the QEP-S.

        The cost information is generated based on the native optimizer of the database system while using the true
        cardinalities of the intermediate joins.

        For each join of the join order, all allowed operators are explored, independent of the cost information that
        is already available. If the cost information for an operator does already exist, it is updated according to
        the normal updating logic.

        Parameters
        ----------
        query : qal.SqlQuery
            The query to obtain the cost for
        join_order : Optional[JoinTree], optional
            The QEP-S path along which the cost should be generated. Defaults to ``None``, in which case the join order
            of the native query optimizer of the database system is used.
        allowed_operators : Optional[Iterable[JoinOperator]], optional
            The operators for which cost information should be generated. Operators that cannot be hinted on the
            database are ignored. Defaults to all join operators.
        max_combinations : Optional[int], optional
            The maximum number of operator combinations that should be explored. If more combinations are available, a
            random subset of `max_combinations` many samples is explored. Defaults to ``None``, in which case all
            combinations are explored.
        """
        join_order = (
            join_order
            if join_order is not None
            else self._obtain_native_join_order(query)
        )

        allowed_operators = (
            set(allowed_operators) if allowed_operators else set(JoinOperator)
        )
        supported_operators = {
            join_op
            for join_op in JoinOperator
            if self._db.hinting().supports_hint(join_op)
        }
        allowed_operators = frozenset(allowed_operators & supported_operators)

        # Every join of the join order is explored with every allowed operator
        unknown_costs = {
            intermediate.tables(): allowed_operators
            for intermediate in join_order.iterjoins()
        }
        total_unknown_combinations = math.prod(
            [len(unknown_ops) for unknown_ops in unknown_costs.values()]
        )

        # Sampling is only necessary if the user restricted the number of combinations. Without a limit, comparing
        # against None would raise a TypeError, so the exhaustive enumeration is used directly.
        requires_sampling = (
            max_combinations is not None
            and total_unknown_combinations > max_combinations
        )
        if requires_sampling:
            query_plans = _sample_cost_estimates(
                query, join_order, unknown_costs, max_combinations, self._db
            )
        else:
            query_plans = _generate_all_cost_estimates(
                query, join_order, unknown_costs, self._db
            )

        for plan in query_plans:
            self.integrate_cost(query, plan)

    def reset(self) -> None:
        """Generates a brand new QEP-S."""
        self.qeps.reset()

    def select_physical_operators(
        self, query: qal.SqlQuery, join_order: Optional[JoinTree]
    ) -> PhysicalOperatorAssignment:
        """Recommends the physical operators for a query based on the current QEP-S.

        Parameters
        ----------
        query : qal.SqlQuery
            The query to select the operators for
        join_order : Optional[JoinTree]
            The join order that is used to traverse the QEP-S. If it is missing or empty, the join order of the native
            optimizer of the database is used instead.

        Returns
        -------
        PhysicalOperatorAssignment
            The operators as recommended by the QEP-S
        """
        if not join_order or join_order.is_empty():
            join_order = self._obtain_native_join_order(query)
        return self.qeps.recommend_operators(query, join_order)

    def describe(self) -> dict:
        """Provides the name of the selection strategy along with its filter awareness and gamma settings."""
        return {"name": "tonic", "filter_aware": self.filter_aware, "gamma": self.gamma}

    def _obtain_native_join_order(self, query: qal.SqlQuery) -> JoinTree:
        """Generates the join order for a specific query based on the native database optimizer.

        Parameters
        ----------
        query : qal.SqlQuery
            The query to obtain the join order for

        Returns
        -------
        JoinTree
            The join order the database system would use
        """
        native_plan = self._db.optimizer().query_plan(query)
        return jointree_from_plan(native_plan)

    def __json__(self) -> Any:
        # The model is fully described by its synopsis, which carries the gamma and filter awareness settings
        return self.qeps