PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. postbound/__init__.py +211 -0
  2. postbound/_base.py +6 -0
  3. postbound/_bench.py +1012 -0
  4. postbound/_core.py +1153 -0
  5. postbound/_hints.py +1373 -0
  6. postbound/_jointree.py +1079 -0
  7. postbound/_pipelines.py +1121 -0
  8. postbound/_qep.py +1986 -0
  9. postbound/_stages.py +876 -0
  10. postbound/_validation.py +734 -0
  11. postbound/db/__init__.py +72 -0
  12. postbound/db/_db.py +2348 -0
  13. postbound/db/_duckdb.py +785 -0
  14. postbound/db/mysql.py +1195 -0
  15. postbound/db/postgres.py +4216 -0
  16. postbound/experiments/__init__.py +12 -0
  17. postbound/experiments/analysis.py +674 -0
  18. postbound/experiments/benchmarking.py +54 -0
  19. postbound/experiments/ceb.py +877 -0
  20. postbound/experiments/interactive.py +105 -0
  21. postbound/experiments/querygen.py +334 -0
  22. postbound/experiments/workloads.py +980 -0
  23. postbound/optimizer/__init__.py +92 -0
  24. postbound/optimizer/__init__.pyi +73 -0
  25. postbound/optimizer/_cardinalities.py +369 -0
  26. postbound/optimizer/_joingraph.py +1150 -0
  27. postbound/optimizer/dynprog.py +1825 -0
  28. postbound/optimizer/enumeration.py +432 -0
  29. postbound/optimizer/native.py +539 -0
  30. postbound/optimizer/noopt.py +54 -0
  31. postbound/optimizer/presets.py +147 -0
  32. postbound/optimizer/randomized.py +650 -0
  33. postbound/optimizer/tonic.py +1479 -0
  34. postbound/optimizer/ues.py +1607 -0
  35. postbound/qal/__init__.py +343 -0
  36. postbound/qal/_qal.py +9678 -0
  37. postbound/qal/formatter.py +1089 -0
  38. postbound/qal/parser.py +2344 -0
  39. postbound/qal/relalg.py +4257 -0
  40. postbound/qal/transform.py +2184 -0
  41. postbound/shortcuts.py +70 -0
  42. postbound/util/__init__.py +46 -0
  43. postbound/util/_errors.py +33 -0
  44. postbound/util/collections.py +490 -0
  45. postbound/util/dataframe.py +71 -0
  46. postbound/util/dicts.py +330 -0
  47. postbound/util/jsonize.py +68 -0
  48. postbound/util/logging.py +106 -0
  49. postbound/util/misc.py +168 -0
  50. postbound/util/networkx.py +401 -0
  51. postbound/util/numbers.py +438 -0
  52. postbound/util/proc.py +107 -0
  53. postbound/util/stats.py +37 -0
  54. postbound/util/system.py +48 -0
  55. postbound/util/typing.py +35 -0
  56. postbound/vis/__init__.py +5 -0
  57. postbound/vis/fdl.py +69 -0
  58. postbound/vis/graphs.py +48 -0
  59. postbound/vis/optimizer.py +538 -0
  60. postbound/vis/plots.py +84 -0
  61. postbound/vis/tonic.py +70 -0
  62. postbound/vis/trees.py +105 -0
  63. postbound-0.19.0.dist-info/METADATA +355 -0
  64. postbound-0.19.0.dist-info/RECORD +67 -0
  65. postbound-0.19.0.dist-info/WHEEL +5 -0
  66. postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
  67. postbound-0.19.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1607 @@
1
+ """Generalized implementation of the UES join order optimization algorithm [1]_.
2
+
3
+ Our implementation differs from the original algorithm in a number of ways, most importantly by making policies more explicit
4
+ and enabling their variation. More specifically, we enable variation of the following parts of the algorithm:
5
+
6
+ - estimation strategy for the cardinality of (filtered) base tables
7
+ - estimation strategy for the cardinality of joins, thereby enabling the usage of different statistics. For example, this
8
+ enables top-k list statistics instead of only using the maximum value frequency
9
+ - deciding when to generate subqueries for primary key/foreign key joins
10
+
11
+ Additionally, our implementation has a stricter treatment of chains of primary key/foreign key joins. Consider a join of the
12
+ form A ⋈ B ⋈ C. Here, A ⋈ B is primary key/foreign key join with A acting as the foreign key partner and B acting as the
13
+ primary key partner. At the same time, B ⋈ C is also a primary key/foreign key join, but this time B acts as the foreign key
14
+ partner and C is the primary key partner. The original implementation did not specify how such situations should be handled and
15
+ multiple possible approaches exist (e.g. treating the entire join sequence as one large primary key/foreign key join or
16
+ invalidating the second join once the primary key/foreign key join between A and B has been performed). Our implementation can
17
+ use the first strategy (the join is treated as one large primary key/foreign key join and the subquery contains all the
18
+ related tables) but defaults to the second one.
19
+
20
+ References
21
+ ----------
22
+
23
+ .. [1] A. Hertzschuch et al.: "Simplicity Done Right for Join Ordering", CIDR'2021
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import abc
29
+ import copy
30
+ import math
31
+ import operator
32
+ import typing
33
+ from collections.abc import Iterable
34
+ from typing import Generic, Optional
35
+
36
+ import numpy as np
37
+
38
+ from .. import util
39
+ from .._core import Cardinality, ColumnReference, JoinOperator, TableReference
40
+ from .._hints import PhysicalOperatorAssignment
41
+ from .._jointree import JoinTree, LogicalJoinTree
42
+ from .._stages import (
43
+ CardinalityEstimator,
44
+ JoinOrderOptimization,
45
+ JoinOrderOptimizationError,
46
+ OptimizationPreCheck,
47
+ PhysicalOperatorSelection,
48
+ )
49
+ from .._validation import (
50
+ DependentSubqueryPreCheck,
51
+ EmptyPreCheck,
52
+ EquiJoinPreCheck,
53
+ ImplicitQueryPreCheck,
54
+ SetOperationsPreCheck,
55
+ VirtualTablesPreCheck,
56
+ merge_checks,
57
+ )
58
+ from ..db._db import Database, DatabasePool, DatabaseStatistics
59
+ from ..qal._qal import (
60
+ AbstractPredicate,
61
+ BaseProjection,
62
+ ImplicitFromClause,
63
+ ImplicitSqlQuery,
64
+ Select,
65
+ SqlQuery,
66
+ Where,
67
+ )
68
+ from ._joingraph import JoinGraph, JoinPath
69
+
70
# Generic type parameters and statistics type aliases used throughout this module.

ColumnType = typing.TypeVar("ColumnType")
"""The type of the columns for which statistics are generated."""

StatsType = typing.TypeVar("StatsType")
"""The type of the actual statistics that are stored, e.g. single values or frequency lists."""

MaxFrequency = typing.NewType("MaxFrequency", int)
"""Type alias for maximum frequency statistics of columns (which are just integer values).

The maximum frequency of a column is the maximum number of occurrences of a column value within that column.

For example, consider a column R.a with values ``[a, b, a, a, b, c]``. In this case, the maximum column frequency for R.a
is 3.
"""

MostCommonElements = typing.NewType("MostCommonElements", list[tuple[ColumnType, int]])
"""Type alias for top-k list statistics. The top-k list is generic over the actual column type."""
86
+
87
+
88
class StatsContainer(abc.ABC, Generic[StatsType]):
    """The statistics container eases the management of the statistics lifecycle.

    It provides means to store different kinds of statistics as attributes and can take care of their update automatically.
    Each statistics container instance is intended for one specific query and has to be initialized for that query using the
    `setup_for_query` method.

    A statistics container is abstract to enable a tailored implementation of the loading and updating procedures for
    different statistics types.

    Attributes
    ----------
    base_table_estimates : dict[TableReference, int]
        These statistics are intended for tables that are not part of the intermediate result, yet. The estimates approximate
        the number of rows that are returned when scanning the table.
    upper_bounds : dict[TableReference | LogicalJoinTree, int]
        These statistics contain the cardinality estimates for intermediate results of the input query. Inserting new bounds
        can result in an update of the column statistics.
    attribute_frequencies : dict[ColumnReference, StatsType]
        This statistic contains the current statistics value for individual columns. This is the main data structure that has
        to be maintained during the query optimization process to update the column statistics once they become part of an
        intermediate result (and get changed as part of the join process).
    query : Optional[SqlQuery]
        Stores the query that this container is created for
    """

    def __init__(self) -> None:
        # all containers start empty - they are (re-)populated by setup_for_query()
        self.base_table_estimates: dict[TableReference, int] = {}
        self.upper_bounds: dict[TableReference | LogicalJoinTree, int] = {}
        self.attribute_frequencies: dict[ColumnReference, StatsType] = {}
        self.query: Optional[SqlQuery] = None

    def setup_for_query(
        self,
        query: SqlQuery,
        base_table_estimator: BaseTableEstimator,
    ) -> None:
        """Initializes the internal data of the statistics container for a specific query.

        Parameters
        ----------
        query : SqlQuery
            The query that this container should manage statistics for
        base_table_estimator : BaseTableEstimator
            Estimator to inflate the `base_table_estimates` for all tables that are contained in the query. The estimator has
            to be set up properly.
        """
        self._reset_containers()
        self.query = query
        self._inflate_base_table_estimates(base_table_estimator)
        self._inflate_attribute_frequencies()

    def join_bounds(self) -> dict[LogicalJoinTree, int]:
        """Provides the cardinality estimates of all join trees that are currently stored in the container.

        Returns
        -------
        dict[LogicalJoinTree, int]
            The bounds for all intermediate results
        """
        # upper_bounds mixes base-table entries and join-tree entries; only export the join trees here
        return {
            join_tree: bound
            for join_tree, bound in self.upper_bounds.items()
            if isinstance(join_tree, JoinTree)
        }

    def trigger_frequency_update(
        self,
        join_tree: LogicalJoinTree,
        joined_table: TableReference,
        join_condition: AbstractPredicate,
    ) -> None:
        """Updates the `attribute_frequencies` according to a new n:m join.

        The update procedure distinguishes between two different types of column statistics and uses different
        (and statistics-dependent) update methods for each: partner columns and third-party columns.

        Partner columns are those columns from the intermediate query result, that are directly involved in the join
        predicate, i.e. they are a join partner for some column of the newly joined table. On the other hand, third
        party columns are part of the intermediate result, but not directly involved in the join. In order to update
        them, some sort of correlation info is usually required.

        The precise update semantics depend on the specific statistic type. Hence, the updates are performed via abstract
        methods.

        Parameters
        ----------
        join_tree : LogicalJoinTree
            A join order that indicates the last join that was performed. This is the join that is used to infer the necessary
            updates.
        joined_table : TableReference
            The actual table that was joined. Remember that UES performs either primary key/foreign key joins, or joins with
            exactly one n:m table join partner. In the first case, no frequency updates are necessary since cardinalities may
            never increase when the foreign key is already part of an intermediate result. In the second case, there is exactly
            one partner table that is denoted by this parameter.
        join_condition : AbstractPredicate
            The predicate that was used for the join. This is required to determine the columns that were directly involved in
            the join. These columns have to be updated in a different way compared to other columns in the intermediate result.
        """
        partner_tables = join_tree.tables() - {joined_table}

        # collect every column of the intermediate result that joins with *other* tables of the
        # intermediate result - these are not part of the current join predicate and hence "third party"
        third_party_columns: set[ColumnReference] = set()
        for third_party_table in partner_tables:
            potential_partners = partner_tables - {third_party_table}
            join_pred = self.query.predicates().joins_between(
                third_party_table, potential_partners
            )
            if not join_pred:
                continue
            third_party_columns |= join_pred.columns()

        # first update the columns directly involved in the join; orient each pair so that the
        # column of the freshly joined table comes first
        for col1, col2 in join_condition.join_partners():
            joined_column, partner_column = (
                (col1, col2) if col1.table == joined_table else (col2, col1)
            )
            self._update_partner_column_frequency(joined_column, partner_column)

        # then propagate the join's effect to the third-party columns, using the join column of the
        # joined table with the smallest frequency
        joined_columns_frequencies = {
            joined_col: self.attribute_frequencies[joined_col]
            for joined_col in join_condition.columns_of(joined_table)
        }
        # NOTE: util.argmin returns the dict *key* (i.e. the column) with the smallest frequency,
        # not the frequency value itself
        lowest_joined_column_frequency = util.argmin(joined_columns_frequencies)
        for third_party_column in third_party_columns:
            self._update_third_party_column_frequency(
                lowest_joined_column_frequency, third_party_column
            )

    @abc.abstractmethod
    def describe(self) -> dict:
        """Generates a JSON-serializable description of the specific container, including the actual statistics type.

        Returns
        -------
        dict
            The description
        """
        raise NotImplementedError

    def _reset_containers(self) -> None:
        """Drops all currently stored statistics. This is a necessary preparation step when a new input query is encountered."""
        self.base_table_estimates = {}
        self.upper_bounds = {}
        self.attribute_frequencies = {}
        self.query = None

    def _inflate_base_table_estimates(
        self, base_table_estimator: BaseTableEstimator
    ) -> None:
        """Retrieves the base table estimate for each table in the current query.

        Parameters
        ----------
        base_table_estimator : BaseTableEstimator
            The strategy that should be used to obtain the estimate.
        """
        for table in self.query.tables():
            table_estimate = base_table_estimator.estimate_for(table)
            self.base_table_estimates[table] = table_estimate

    @abc.abstractmethod
    def _inflate_attribute_frequencies(self) -> None:
        """Loads the attribute frequency statistics for all required columns.

        The precise statistics that have to be loaded, as well as the columns that require loading of statistics, is completely
        up to the specific statistics container.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def _update_partner_column_frequency(
        self, joined_column: ColumnReference, partner_column: ColumnReference
    ) -> None:
        """Performs the frequency update for a partner column.

        This implies that there is a join between the joined column and the partner column, and the partner column is already
        part of the intermediate result. Likewise, the joined column has just become part of the intermediate result as of this
        join.

        Parameters
        ----------
        joined_column : ColumnReference
            A column that is already part of the intermediate result
        partner_column : ColumnReference
            A column of the relation that has just been joined with the current intermediate result
        """
        raise NotImplementedError

    @abc.abstractmethod
    def _update_third_party_column_frequency(
        self, joined_column: ColumnReference, third_party_column: ColumnReference
    ) -> None:
        """Performs the frequency update for a third party column (see `trigger_frequency_update`).

        This implies that there is a join between the joined column and some other column from the intermediate result. The
        third party column is already part of the intermediate result, but not directly involved in the join. The joined
        column has just become part of the intermediate result as of this join.

        Parameters
        ----------
        joined_column : ColumnReference
            A column that is already part of the intermediate result
        third_party_column : ColumnReference
            A column of the relation that has just been joined with the current intermediate result
        """
        raise NotImplementedError
291
+
292
+
293
class MaxFrequencyStats(StatsContainer[MaxFrequency]):
    """Statistics container based on the maximum value frequency of the join columns.

    Updates are handled pessimistically: each affected column frequency of the intermediate result is
    scaled by the maximum frequency of its join partner. This guarantees that the frequencies never
    under-estimate the true values, at the price of potentially severe over-estimation. Closing that
    gap would require correlation information.

    See Also
    --------
    MaxFrequency
    """

    def __init__(self, database_stats: DatabaseStatistics):
        super().__init__()
        self.database_stats = database_stats

    def describe(self) -> dict:
        return {"name": "max_column_frequency"}

    def _inflate_attribute_frequencies(self):
        # gather every column that appears in some join predicate of the current query
        join_columns: set[ColumnReference] = set()
        for predicate in self.query.predicates().joins():
            join_columns |= predicate.columns()

        for column in join_columns:
            top1_entry = self.database_stats.most_common_values(column, k=1)
            if top1_entry:
                # the MCV list contains (value, frequency) pairs - only the frequency matters here
                _, frequency = util.simplify(top1_entry)
            else:
                # no MCV statistics available, fall back to a uniform-distribution assumption
                frequency = self._uniform_frequency(column)
            self.attribute_frequencies[column] = frequency

    def _update_partner_column_frequency(
        self, joined_column: ColumnReference, partner_column: ColumnReference
    ) -> None:
        # snapshot both frequencies up-front so the first assignment does not influence the second
        frequency_of_joined = self.attribute_frequencies[joined_column]
        frequency_of_partner = self.attribute_frequencies[partner_column]
        self.attribute_frequencies[joined_column] = (
            frequency_of_joined * frequency_of_partner
        )
        self.attribute_frequencies[partner_column] = (
            frequency_of_partner * frequency_of_joined
        )

    def _update_third_party_column_frequency(
        self, joined_column: ColumnReference, third_party_column: ColumnReference
    ) -> None:
        scale_factor = self.attribute_frequencies[joined_column]
        self.attribute_frequencies[third_party_column] *= scale_factor

    def _uniform_frequency(self, column: ColumnReference) -> float:
        """Calculates the value frequency for a column, assuming that all values are uniformly distributed.

        The estimate is the total number of rows divided by the number of distinct values; missing
        statistics default to 1 in either position.

        Parameters
        ----------
        column : ColumnReference
            The column to calculate for

        Returns
        -------
        float
            The estimated frequency of all column values
        """
        row_count = self.database_stats.total_rows(column.table)
        distinct_count = self.database_stats.distinct_values(column)
        # explicit None checks: a legitimate count of 0 must stay 0
        row_count = 1 if row_count is None else row_count
        distinct_count = 1 if distinct_count is None else distinct_count
        return row_count / distinct_count
359
+
360
+
361
class BaseTableEstimator(abc.ABC):
    """The base table estimator calculates cardinality estimates for filtered base tables.

    Implementations could for example use direct computation based on advanced statistics, sampling strategies or
    machine learning-based approaches.

    Each strategy provides dict-like access to the estimates: ``estimator[my_table]`` works as expected.

    Parameters
    ----------
    name : str
        The name of the actual estimation strategy.
    """

    def __init__(self, name: str) -> None:
        self.name = name

    @abc.abstractmethod
    def setup_for_query(self, query: SqlQuery) -> None:
        """Enables the estimator to prepare internal data structures.

        Parameters
        ----------
        query : SqlQuery
            The query for which cardinalities should be estimated.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def estimate_for(self, table: TableReference) -> Cardinality:
        """Calculates the cardinality for an arbitrary base table of the query.

        If the table is not filtered, this method should fall back to `estimate_total_rows`. Furthermore, the table can be
        assumed to not be part of any intermediate result, yet.

        Parameters
        ----------
        table : TableReference
            The table to estimate.

        Returns
        -------
        Cardinality
            The estimated number of rows
        """
        raise NotImplementedError

    @abc.abstractmethod
    def estimate_total_rows(self, table: TableReference) -> Cardinality:
        """Calculates an estimate of the number of rows in the table, ignoring all filter predicates.

        Parameters
        ----------
        table : TableReference
            The table to estimate.

        Returns
        -------
        Cardinality
            The estimated number of rows
        """
        raise NotImplementedError

    @abc.abstractmethod
    def describe(self) -> dict:
        """Provides a JSON-serializable representation of the selected cardinality estimation strategy.

        Returns
        -------
        dict
            The representation
        """
        raise NotImplementedError

    def pre_check(self) -> OptimizationPreCheck:
        """Provides requirements that an input query has to satisfy in order for the estimator to work properly.

        Returns
        -------
        OptimizationPreCheck
            The requirements check
        """
        return EmptyPreCheck()

    def __getitem__(self, item: TableReference) -> Cardinality:
        # dict-like access simply delegates to estimate_for(). FIX: the return annotation now
        # matches the value actually produced by the delegate (Cardinality, not a plain int).
        return self.estimate_for(item)

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"BaseTableCardinalityEstimator[{self.name}]"
453
+
454
+
455
class NativeCardinalityEstimator(BaseTableEstimator):
    """Provides cardinality estimates for base tables using the optimizer of some database system.

    Parameters
    ----------
    database : Database
        The database system that should be used to obtain the estimates
    """

    def __init__(self, database: Database) -> None:
        super().__init__("native_optimizer")
        self.database = database
        # the current query is supplied later via setup_for_query()
        self.query: SqlQuery | None = None

    def setup_for_query(self, query: SqlQuery) -> None:
        self.query = query

    def estimate_for(self, table: TableReference) -> Cardinality:
        filters = self.query.predicates().filters_for(table)
        if not filters:
            # unfiltered tables can be answered directly from the statistics catalog
            return self.estimate_total_rows(table)

        # build a single-table SELECT * query carrying the table's filters and ask the target
        # database's optimizer for its estimate.
        # FIX: the early return above guarantees that filters is truthy, so the redundant
        # "Where(filters) if filters else None" conditional was dropped.
        emulated_query = ImplicitSqlQuery(
            select_clause=Select(BaseProjection.star()),
            from_clause=ImplicitFromClause.create_for(table),
            where_clause=Where(filters),
        )
        return self.database.optimizer().cardinality_estimate(emulated_query)

    def estimate_total_rows(self, table: TableReference) -> Cardinality:
        return Cardinality(self.database.statistics().total_rows(table, emulated=True))

    def describe(self) -> dict:
        return {"name": "native", "database": self.database.describe()}
493
+
494
+
495
class PreciseCardinalityEstimator(BaseTableEstimator):
    """Obtains true cardinality counts by executing COUNT queries against a database system.

    Compared to the native optimizer estimates this improves reproducibility, but executing the
    COUNT queries can become compute-intense when caching is disabled.

    The executed COUNT queries account for all filters on the base table.

    Parameters
    ----------
    database : Database
        The database system that should be used to obtain the estimates
    """

    def __init__(self, database: Database) -> None:
        super().__init__("precise_estimates")
        self.database = database
        self.query: SqlQuery | None = None

    def setup_for_query(self, query: SqlQuery) -> None:
        self.query = query

    def estimate_for(self, table: TableReference) -> Cardinality:
        filters = self.query.predicates().filters_for(table)
        count_query = ImplicitSqlQuery(
            select_clause=Select(BaseProjection.count_star()),
            from_clause=ImplicitFromClause.create_for(table),
            where_clause=Where(filters) if filters else None,
        )

        # this COUNT(*) query plays the role of a statistics query, so mirror the caching
        # behavior of the statistics interface
        use_cache = self.database.statistics().cache_enabled
        true_count = self.database.execute_query(count_query, cache_enabled=use_cache)
        return Cardinality(true_count)

    def estimate_total_rows(self, table: TableReference) -> Cardinality:
        return Cardinality(self.database.statistics().total_rows(table, emulated=False))

    def describe(self) -> dict:
        return {"name": "precise", "database": self.database.describe()}
542
+
543
+
544
class _CardinalityEstimatorWrapper(BaseTableEstimator):
    """Adapter that exposes a general `CardinalityEstimator` through the `BaseTableEstimator` interface.

    Parameters
    ----------
    estimator : CardinalityEstimator
        The wrapped estimation strategy
    target_db : Optional[Database]
        The database to calculate estimates for. Defaults to the current database of the `DatabasePool`.
    """

    def __init__(
        self, estimator: CardinalityEstimator, *, target_db: Optional[Database] = None
    ) -> None:
        super().__init__(type(estimator).__name__)
        self.estimator = estimator
        self.target_db = target_db or DatabasePool.get_instance().current_database()
        self.query: SqlQuery | None = None

    def setup_for_query(self, query: SqlQuery) -> None:
        self.query = query
        self.estimator.initialize(self.target_db, query)

    def estimate_for(self, table: TableReference) -> Cardinality:
        return self.estimator.calculate_estimate(self.query, table)

    def estimate_total_rows(self, table: TableReference) -> Cardinality:
        # The implementation of this method is a bit of a hassle since estimation without filters is not really supported
        # by the CardinalityEstimator interface. To work around this issue, we estimate the cardinality of a new query
        # (an unfiltered SELECT * over the table) that coincides with the total number of rows.

        # First, free all resources of the current query
        self.estimator.cleanup()

        # Now, create our new query and obtain the cardinality estimate
        star_query = ImplicitSqlQuery(
            select_clause=Select.star(),
            from_clause=ImplicitFromClause.create_for(table),
        )
        self.estimator.initialize(self.target_db, star_query)
        cardinality = self.estimator.calculate_estimate(star_query, table)

        # Lastly, undo all of our temporary changes and re-initialize the estimator for the original query
        self.estimator.cleanup()
        self.estimator.initialize(self.target_db, self.query)
        return cardinality

    def describe(self) -> util.jsondict:
        return self.estimator.describe()

    def pre_check(self) -> OptimizationPreCheck:
        return self.estimator.pre_check()
586
+
587
+
588
class JoinEstimator(abc.ABC):
    """The join cardinality estimator calculates cardinality estimates for arbitrary n-ary joins.

    Implementations could for example use direct computation based on advanced statistics, sampling strategies or
    machine learning-based approaches.

    Parameters
    ----------
    name : str
        The name of the actual estimation strategy.
    """

    def __init__(self, name: str) -> None:
        self.name = name

    @abc.abstractmethod
    def setup_for_query(self, query: SqlQuery) -> None:
        """Enables the estimator to prepare internal data structures.

        Parameters
        ----------
        query : SqlQuery
            The query for which cardinalities should be estimated.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def estimate_for(
        self, join_edge: AbstractPredicate, join_graph: JoinGraph
    ) -> Cardinality:
        """Calculates the cardinality estimate for a specific join predicate, given the current state in the join graph.

        Parameters
        ----------
        join_edge : AbstractPredicate
            The predicate that should be estimated.
        join_graph : JoinGraph
            A graph describing the currently joined relations as well as the join types (e.g. primary key/foreign key or n:m
            joins).

        Returns
        -------
        Cardinality
            The estimated join cardinality
        """
        raise NotImplementedError

    @abc.abstractmethod
    def describe(self) -> dict:
        """Provides a JSON-serializable representation of the selected cardinality estimation strategy.

        Returns
        -------
        dict
            The representation
        """
        raise NotImplementedError

    def pre_check(self) -> OptimizationPreCheck:
        """Provides requirements that an input query has to satisfy in order for the estimator to work properly.

        Returns
        -------
        OptimizationPreCheck
            The requirements check
        """
        return EmptyPreCheck()

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"JoinCardinalityEstimator[{self.name}]"
661
+
662
+
663
+ class UESBoundEstimator(JoinEstimator):
664
+ """Implementation of the UES formula to calculate upper bounds of join cardinalities.
665
+
666
+ The formula distinguishes two cases: n:m joins are estimated according to the maximum frequencies of the join columns.
667
+ Primary key/foreign key joins are estimated according to the cardinality of the foreign key column. The calculation also
668
+ accounts for conjunctive join predicates, but is still limited to equi joins.
669
+ """
670
+
671
    def __init__(self) -> None:
        super().__init__("UES join estimator")
        # both attributes are supplied later via setup_for_query() / setup_for_stats()
        self.query: ImplicitSqlQuery | None = None
        self.stats_container: StatsContainer[MaxFrequency] | None = None
675
+
676
    def setup_for_query(self, query: SqlQuery) -> None:
        """Stores the current query so that its predicates are available during estimation."""
        self.query = query
678
+
679
    def setup_for_stats(self, stats_container: StatsContainer[MaxFrequency]) -> None:
        """Configures the statistics container that contains the actual frequencies and cardinalities to use.

        Parameters
        ----------
        stats_container : StatsContainer[MaxFrequency]
            The statistics to use
        """
        self.stats_container = stats_container
688
+
689
+ def estimate_for(
690
+ self, join_edge: AbstractPredicate, join_graph: JoinGraph
691
+ ) -> Cardinality:
692
+ current_min_bound = np.inf
693
+
694
+ for base_predicate in join_edge.base_predicates():
695
+ first_col, second_col = util.simplify(base_predicate.join_partners())
696
+ if join_graph.is_pk_fk_join(first_col.table, second_col.table):
697
+ join_bound = self._estimate_pk_fk_join(first_col, second_col)
698
+ elif join_graph.is_pk_fk_join(second_col.table, first_col.table):
699
+ join_bound = self._estimate_pk_fk_join(second_col, first_col)
700
+ else:
701
+ join_bound = self._estimate_n_m_join(first_col, second_col)
702
+
703
+ if join_bound < current_min_bound:
704
+ current_min_bound = join_bound
705
+
706
+ return Cardinality(current_min_bound)
707
+
708
    def describe(self) -> dict:
        """Provides a JSON-serializable representation of the UES bound estimator."""
        return {"name": "ues"}
710
+
711
    def pre_check(self) -> Optional[OptimizationPreCheck]:
        """Provides the requirements check for queries that should be estimated with the UES formula."""
        # TODO: the UES check is slightly too restrictive here.
        # It suffices to check that there are only conjunctive equi joins.
        return UESOptimizationPreCheck  # this is a pre-generated check instance, don't call () it here!
715
+
716
+ def _estimate_pk_fk_join(
717
+ self, fk_column: ColumnReference, pk_column: ColumnReference
718
+ ) -> Cardinality:
719
+ """Estimation formula for primary key/foreign key joins.
720
+
721
+ Parameters
722
+ ----------
723
+ fk_column : ColumnReference
724
+ The foreign key column
725
+ pk_column : ColumnReference
726
+ The primary key column
727
+
728
+ Returns
729
+ -------
730
+ Cardinality
731
+ An upper bound of the primary key/foreign key join cardinality.
732
+ """
733
+ pk_cardinality = Cardinality(
734
+ self.stats_container.base_table_estimates[pk_column.table]
735
+ )
736
+ fk_frequency = self.stats_container.attribute_frequencies[fk_column]
737
+ return math.ceil(fk_frequency * pk_cardinality)
738
+
739
+ def _estimate_n_m_join(
740
+ self, first_column: ColumnReference, second_column: ColumnReference
741
+ ) -> Cardinality:
742
+ """Estimation formula for n:m joins.
743
+
744
+ Parameters
745
+ ----------
746
+ first_column : ColumnReference
747
+ The join column from the first partner
748
+ second_column : ColumnReference
749
+ The join column from the second partner
750
+
751
+ Returns
752
+ -------
753
+ Cardinality
754
+ An upper bound of the n:m join cardinality.
755
+ """
756
+ first_bound, second_bound = (
757
+ self._fetch_bound(first_column),
758
+ self._fetch_bound(second_column),
759
+ )
760
+ first_freq = self.stats_container.attribute_frequencies[first_column]
761
+ second_freq = self.stats_container.attribute_frequencies[second_column]
762
+
763
+ if any(
764
+ var == 0 for var in [first_bound, second_bound, first_freq, second_freq]
765
+ ):
766
+ return 0
767
+
768
+ first_distinct_vals = first_bound / first_freq
769
+ second_distinct_vals = second_bound / second_freq
770
+
771
+ n_m_bound = (
772
+ min(first_distinct_vals, second_distinct_vals) * first_freq * second_freq
773
+ )
774
+ return Cardinality(math.ceil(n_m_bound))
775
+
776
+ def _fetch_bound(self, column: ColumnReference) -> Cardinality:
777
+ """Provides the appropriate table bound (based on upper bound or base table estimate) for the given column.
778
+
779
+ This is a utility method to work with the statistics container in a more convenient way, since the container can store
780
+ the table cardinality at two different places: as a base table estimate, or as a intermediate estimate for base tables
781
+ that can be filtered via a primary key/foreign key join.
782
+
783
+ Parameters
784
+ ----------
785
+ column : ColumnReference
786
+ The column for which the upper bound of the corresponding base table should be loaded.
787
+
788
+ Returns
789
+ -------
790
+ Cardinality
791
+ An upper bound on the cardinality of the table
792
+ """
793
+ table = column.table
794
+ card = (
795
+ self.stats_container.upper_bounds[table]
796
+ if table in self.stats_container.upper_bounds
797
+ else self.stats_container.base_table_estimates[table]
798
+ )
799
+ return Cardinality(card)
800
+
801
+
802
class BranchingPolicy(abc.ABC):
    """Controls when the optimizer should branch out into a subquery instead of extending a linear join path.

    Within this policy, branches in the join tree and subqueries are treated as synonyms.

    Implementations that need additional information beyond the query itself should receive it through
    their own dedicated setup methods.

    Parameters
    ----------
    name : str
        The name of the actual branching strategy.
    """

    def __init__(self, name: str) -> None:
        self.name = name

    @abc.abstractmethod
    def setup_for_query(self, query: SqlQuery) -> None:
        """Prepares any internal state of the policy for the next query.

        Parameters
        ----------
        query : SqlQuery
            The query that should be optimized next
        """
        raise NotImplementedError

    @abc.abstractmethod
    def generate_subquery_for(
        self, join: AbstractPredicate, join_graph: JoinGraph
    ) -> bool:
        """Decides whether the given join should be executed in a subquery.

        Parameters
        ----------
        join : AbstractPredicate
            The join that should be executed **within the subquery**. This is not the predicate that combines
            the results of two intermediate relations.
        join_graph : JoinGraph
            The current optimization state, providing information about joined relations and the join types
            (e.g. primary key/foreign key or n:m joins).

        Returns
        -------
        bool
            Whether a branch should be created for the join
        """
        raise NotImplementedError

    @abc.abstractmethod
    def describe(self) -> dict:
        """Provides a JSON-serializable representation of the selected branching strategy.

        Returns
        -------
        dict
            The representation
        """
        raise NotImplementedError

    def pre_check(self) -> OptimizationPreCheck:
        """Provides requirements that an input query has to satisfy for this policy to work properly.

        Returns
        -------
        OptimizationPreCheck
            The requirements check
        """
        return EmptyPreCheck()

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"SubqueryGenerationStrategy[{self.name}]"
class LinearJoinTreePolicy(BranchingPolicy):
    """Branching strategy that never creates subqueries, keeping every join path strictly linear."""

    def __init__(self):
        super().__init__("Linear subquery policy")

    def setup_for_query(self, query: SqlQuery) -> None:
        # No per-query state is required for a policy that always declines.
        pass

    def generate_subquery_for(
        self, join: AbstractPredicate, join_graph: JoinGraph
    ) -> bool:
        # Linear join trees by definition: never branch.
        return False

    def describe(self) -> dict:
        return {"name": "linear"}
class UESSubqueryPolicy(BranchingPolicy):
    """Implementation of the UES policy to decide when to insert branches into the join order.

    In short, a subquery is generated whenever it guarantees a reduction of the upper bound of the
    n:m join partner table.
    """

    def __init__(self):
        super().__init__("UES subquery policy")
        self.query: SqlQuery | None = None
        self.stats_container: StatsContainer | None = None

    def setup_for_query(self, query: SqlQuery) -> None:
        self.query = query

    def setup_for_stats_container(self, stats_container: StatsContainer) -> None:
        """Configures the statistics container that contains the actual frequencies and bounds to use.

        Parameters
        ----------
        stats_container : StatisticsContainer[MaxFrequency]
            The statistics to use
        """
        self.stats_container = stats_container

    def generate_subquery_for(
        self, join: AbstractPredicate, join_graph: JoinGraph
    ) -> bool:
        # With fewer than two consumed tables there is no intermediate to branch away from.
        if join_graph.count_consumed_tables() < 2:
            return False

        stats = self.stats_container
        for lhs_col, rhs_col in join.join_partners():
            lhs_tab, rhs_tab = lhs_col.table, rhs_col.table
            if join_graph.is_pk_fk_join(lhs_tab, rhs_tab):
                joined_table = lhs_tab
            elif join_graph.is_pk_fk_join(rhs_tab, lhs_tab):
                joined_table = rhs_tab
            else:
                # n:m partners do not trigger subqueries under this policy.
                continue

            # Branch as soon as the pk/fk join provably tightened the table's bound.
            if (
                stats.upper_bounds[joined_table]
                < stats.base_table_estimates[joined_table]
            ):
                return True

        return False

    def describe(self) -> dict:
        return {"name": "defensive"}
+ class UESJoinOrderOptimizer(JoinOrderOptimization):
953
+ """Implementation of the UES join order algorithm.
954
+
955
+ Our implementation expands upon the original algorithm in a number of ways. These are used to enable a variation of
956
+ different policies during optimization, and to apply the algorithm to a larger set of queries. See the module documentation
957
+ for details.
958
+
959
+ Parameters
960
+ ----------
961
+ base_table_estimation : Optional[BaseTableEstimator], optional
962
+ A strategy to estimate the cardinalities/bounds of (filtered) base tables. Defaults to a native estimation by the
963
+ optimizer of the `database`.
964
+ join_estimation : Optional[JoinEstimator], optional
965
+ A strategy to estimate the upper bounds of intermediate joins. Defaults to the `UESBoundEstimator`.
966
+ subquery_policy : Optional[BranchingPolicy], optional
967
+ A strategy to determine when to insert subqueries into the resulting join tree. Defaults to the
968
+ `UESSubqueryPolicy`.
969
+ stats_container : Optional[StatsContainer], optional
970
+ The statistics used to calcualte the different upper bounds. These have to be compatible with the `join_estimation`.
971
+ Defaults to a `MaxFrequencyStatsContainer`.
972
+ pull_eager_pk_tables : bool, optional
973
+ How to deal with chains of primary key/foreign key joins (joins where the primary key table for one join acts as the
974
+ foreign key in another join). By default, only the first primary key/foreign key join is used as a filter. If eager
975
+ pulling is enabled, the subsequent primary key filters are also included.
976
+ database : Optional[db.Database], optional
977
+ The database whose statistics should be used. The database has to be configured appropriately already (e.g. regarding
978
+ the usage of emulated statistics). If this parameter is omitted, it is inferred from the `db.DatabasePool`.
979
+ verbose : bool, optional
980
+ Whether to log internal progress and bound statistics. This is off by default.
981
+
982
+ References
983
+ ----------
984
+
985
+ .. A. Hertzschuch et al.: "Simplicity Done Right for Join Ordering", CIDR'2021
986
+ """
987
+
988
+ def __init__(
989
+ self,
990
+ *,
991
+ base_table_estimation: Optional[
992
+ CardinalityEstimator | BaseTableEstimator
993
+ ] = None,
994
+ join_estimation: Optional[JoinEstimator] = None,
995
+ subquery_policy: Optional[BranchingPolicy] = None,
996
+ stats_container: Optional[StatsContainer] = None,
997
+ pull_eager_pk_tables: bool = False,
998
+ database: Optional[Database] = None,
999
+ verbose: bool = False,
1000
+ ) -> None:
1001
+ super().__init__()
1002
+ self.database = (
1003
+ database if database else DatabasePool().get_instance().current_database()
1004
+ )
1005
+
1006
+ match base_table_estimation:
1007
+ case CardinalityEstimator():
1008
+ self.base_table_estimation = _CardinalityEstimatorWrapper(
1009
+ base_table_estimation, target_db=self.database
1010
+ )
1011
+ case None:
1012
+ self.base_table_estimation = NativeCardinalityEstimator(self.database)
1013
+ case _:
1014
+ self.base_table_estimation = base_table_estimation
1015
+
1016
+ self.join_estimation = (
1017
+ join_estimation if join_estimation else UESBoundEstimator()
1018
+ )
1019
+ self.subquery_policy = (
1020
+ subquery_policy if subquery_policy else UESSubqueryPolicy()
1021
+ )
1022
+ self.stats_container = (
1023
+ stats_container
1024
+ if stats_container
1025
+ else MaxFrequencyStats(self.database.statistics())
1026
+ )
1027
+ self._pull_eager_pk_tables = pull_eager_pk_tables
1028
+ self._logging_enabled = verbose
1029
+
1030
+ def optimize_join_order(self, query: SqlQuery) -> Optional[LogicalJoinTree]:
1031
+ if not isinstance(query, ImplicitSqlQuery):
1032
+ raise ValueError("UES optimization only works for implicit queries for now")
1033
+ if len(query.tables()) < 2:
1034
+ return None
1035
+
1036
+ self.base_table_estimation.setup_for_query(query)
1037
+ self.stats_container.setup_for_query(query, self.base_table_estimation)
1038
+ self.join_estimation.setup_for_query(query)
1039
+ self.join_estimation.setup_for_stats(self.stats_container)
1040
+ self.subquery_policy.setup_for_query(query)
1041
+ if "setup_for_stats_container" in dir(self.subquery_policy):
1042
+ self.subquery_policy.setup_for_stats_container(self.stats_container)
1043
+
1044
+ join_graph = JoinGraph(query, self.database.schema())
1045
+
1046
+ if len(query.tables()) == 2:
1047
+ final_join_tree = self._binary_join_optimization(query, join_graph)
1048
+ elif join_graph.contains_cross_products():
1049
+ # cross-product query is reduced to multiple independent optimization passes
1050
+ optimized_components: list[LogicalJoinTree] = []
1051
+ for component in join_graph.join_components():
1052
+ # FIXME: join components might consist of single tables!
1053
+ optimized_component = self._clone().optimize_join_order(component.query)
1054
+ if not optimized_component:
1055
+ raise JoinOrderOptimizationError(component.query)
1056
+ optimized_components.append(optimized_component)
1057
+
1058
+ # insert cross-products such that the smaller partitions are joined first
1059
+ sorted(optimized_components, key=operator.attrgetter("annotation"))
1060
+ final_join_tree, *remaining_joins = optimized_components
1061
+ for remaining_join in remaining_joins:
1062
+ output_cardinality = (
1063
+ final_join_tree.annotation * remaining_join.annotation
1064
+ )
1065
+ final_join_tree = final_join_tree.join_with(
1066
+ remaining_join, annotation=output_cardinality
1067
+ )
1068
+ elif join_graph.contains_free_n_m_joins():
1069
+ final_join_tree = self._default_ues_optimizer(query, join_graph)
1070
+ else:
1071
+ final_join_tree = self._star_query_optimizer(query, join_graph)
1072
+
1073
+ return final_join_tree
1074
+
1075
+ def describe(self) -> dict:
1076
+ return {
1077
+ "name": "ues",
1078
+ "settings": {
1079
+ "base_table_estimation": self.base_table_estimation.describe(),
1080
+ "join_estimation": self.join_estimation.describe(),
1081
+ "subqueries": self.subquery_policy.describe(),
1082
+ "statistics": self.stats_container.describe(),
1083
+ },
1084
+ }
1085
+
1086
+ def pre_check(self) -> OptimizationPreCheck:
1087
+ specified_checks = [
1088
+ check
1089
+ for check in [
1090
+ self.base_table_estimation.pre_check(),
1091
+ self.join_estimation.pre_check(),
1092
+ self.subquery_policy.pre_check(),
1093
+ ]
1094
+ if check
1095
+ ]
1096
+ specified_checks.append(UESOptimizationPreCheck)
1097
+ return merge_checks(specified_checks)
1098
+
1099
+ def _default_ues_optimizer(
1100
+ self, query: SqlQuery, join_graph: JoinGraph
1101
+ ) -> LogicalJoinTree:
1102
+ """Implementation of our take on the UES algorithm for queries with n:m joins.
1103
+
1104
+ Parameters
1105
+ ----------
1106
+ query : SqlQuery
1107
+ The query to optimize.
1108
+ join_graph : JoinGraph
1109
+ The join graph of the input query. This structure is mutated during the algorithm.
1110
+
1111
+ Returns
1112
+ -------
1113
+ LogicalJoinTree
1114
+ The resulting join tree
1115
+
1116
+ Raises
1117
+ ------
1118
+ AssertionError
1119
+ If the iterative construction failed. This indicates a bug in the implementation of the algorithm, not a mistake by
1120
+ the user.
1121
+ """
1122
+ self._log_information("Using default UES optimizer")
1123
+ join_tree = LogicalJoinTree.empty()
1124
+
1125
+ while join_graph.contains_free_n_m_joins():
1126
+ # Update the current upper bounds
1127
+ lowest_bound = np.inf
1128
+ lowest_bound_table = None
1129
+ for candidate_join in join_graph.available_n_m_join_paths(
1130
+ both_directions_on_initial=True
1131
+ ):
1132
+ candidate_table = candidate_join.target_table
1133
+ filter_estimate = self.stats_container.base_table_estimates[
1134
+ candidate_table
1135
+ ]
1136
+ pk_fk_bounds = [
1137
+ self.join_estimation.estimate_for(
1138
+ join_path.join_condition, join_graph
1139
+ )
1140
+ for join_path in join_graph.available_pk_fk_joins_for(
1141
+ candidate_table
1142
+ )
1143
+ ]
1144
+ candidate_min_bound = min([filter_estimate] + pk_fk_bounds)
1145
+ self.stats_container.upper_bounds[candidate_table] = candidate_min_bound
1146
+
1147
+ if candidate_min_bound < lowest_bound:
1148
+ lowest_bound = candidate_min_bound
1149
+ lowest_bound_table = candidate_table
1150
+ self._log_information(
1151
+ ".. Current bounds: "
1152
+ + util.dicts.stringify(self.stats_container.upper_bounds)
1153
+ )
1154
+
1155
+ if join_tree.is_empty():
1156
+ join_tree = LogicalJoinTree.scan(
1157
+ lowest_bound_table, annotation=lowest_bound
1158
+ )
1159
+ join_graph.mark_joined(lowest_bound_table)
1160
+ self.stats_container.upper_bounds[join_tree] = lowest_bound
1161
+ pk_joins = join_graph.available_deep_pk_join_paths_for(
1162
+ lowest_bound_table, self._table_base_cardinality_ordering
1163
+ )
1164
+ for pk_join in pk_joins:
1165
+ target_table = pk_join.target_table
1166
+ base_cardinality = self.stats_container.base_table_estimates[
1167
+ target_table
1168
+ ]
1169
+ join_bound = self.join_estimation.estimate_for(
1170
+ pk_join.join_condition, join_graph
1171
+ )
1172
+ join_graph.mark_joined(target_table, pk_join.join_condition)
1173
+ join_tree = join_tree.join_with(
1174
+ pk_join.target_table,
1175
+ annotation=join_bound,
1176
+ partner_annotation=base_cardinality,
1177
+ )
1178
+ self._log_optimization_progress(
1179
+ "Initial table selection", lowest_bound_table, pk_joins
1180
+ )
1181
+ continue
1182
+
1183
+ selected_candidate: JoinPath | None = None
1184
+ lowest_bound = np.inf
1185
+ bounds_log: dict[JoinPath, float] = {}
1186
+ for candidate_join in join_graph.available_n_m_join_paths():
1187
+ candidate_bound = self.join_estimation.estimate_for(
1188
+ candidate_join.join_condition, join_graph
1189
+ )
1190
+ bounds_log[candidate_join] = candidate_bound
1191
+ if candidate_bound < lowest_bound:
1192
+ selected_candidate = candidate_join
1193
+ lowest_bound = candidate_bound
1194
+ self._log_information(f".. n:m join bounds: {bounds_log}")
1195
+
1196
+ direct_pk_joins = join_graph.available_pk_fk_joins_for(
1197
+ selected_candidate.target_table
1198
+ )
1199
+ create_subquery = any(
1200
+ self.subquery_policy.generate_subquery_for(
1201
+ pk_join.join_condition, join_graph
1202
+ )
1203
+ for pk_join in direct_pk_joins
1204
+ )
1205
+ candidate_table = selected_candidate.target_table
1206
+ all_pk_joins = (
1207
+ join_graph.available_deep_pk_join_paths_for(candidate_table)
1208
+ if self._pull_eager_pk_tables
1209
+ else join_graph.available_pk_fk_joins_for(candidate_table)
1210
+ )
1211
+ candidate_base_cardinality = self.stats_container.base_table_estimates[
1212
+ candidate_table
1213
+ ]
1214
+ self._log_optimization_progress(
1215
+ "n:m join",
1216
+ candidate_table,
1217
+ all_pk_joins,
1218
+ join_condition=selected_candidate.join_condition,
1219
+ subquery_join=create_subquery,
1220
+ )
1221
+ if create_subquery:
1222
+ subquery_tree = JoinTree.scan(
1223
+ candidate_table, annotation=candidate_base_cardinality
1224
+ )
1225
+ join_graph.mark_joined(candidate_table)
1226
+ subquery_tree = self._insert_pk_joins(
1227
+ query, all_pk_joins, subquery_tree, join_graph
1228
+ )
1229
+
1230
+ join_tree = join_tree.join_with(subquery_tree, annotation=lowest_bound)
1231
+ self.stats_container.upper_bounds[join_tree] = lowest_bound
1232
+
1233
+ else:
1234
+ join_tree = join_tree.join_with(
1235
+ candidate_table,
1236
+ annotation=lowest_bound,
1237
+ partner_annotation=candidate_base_cardinality,
1238
+ )
1239
+ join_graph.mark_joined(
1240
+ candidate_table, selected_candidate.join_condition
1241
+ )
1242
+ self.stats_container.upper_bounds[join_tree] = lowest_bound
1243
+ join_tree = self._insert_pk_joins(
1244
+ query, all_pk_joins, join_tree, join_graph
1245
+ )
1246
+
1247
+ self.stats_container.trigger_frequency_update(
1248
+ join_tree, candidate_table, selected_candidate.join_condition
1249
+ )
1250
+
1251
+ if join_graph.contains_free_tables():
1252
+ raise AssertionError("Join graph still has free tables remaining!")
1253
+ return join_tree
1254
+
1255
+ def _binary_join_optimization(
1256
+ self, query: ImplicitSqlQuery, join_graph: JoinGraph
1257
+ ) -> LogicalJoinTree:
1258
+ """Specialized optimization algorithm for queries with just a single join.
1259
+
1260
+ The algorithm can still be meaningful to determine the inner and outer relation for the only join that has to be
1261
+ performed. Furthermore, this algorithm can be used for smaller partitions of queries with cross products.
1262
+
1263
+ The algorithm is inspired by UES and uses the table with the smaller upper bound as the outer relation.
1264
+
1265
+ Parameters
1266
+ ----------
1267
+ query : ImplicitSqlQuery
1268
+ The query to optimize
1269
+ join_graph : joingraph.JoinGraph
1270
+ The join graph of the query. This structure is mutated during the algorithm.
1271
+
1272
+ Returns
1273
+ -------
1274
+ LogicalJoinTree
1275
+ The resulting join tree
1276
+ """
1277
+ table1, table2 = query.tables()
1278
+ table1_smaller = (
1279
+ self.stats_container.base_table_estimates[table1]
1280
+ < self.stats_container.base_table_estimates[table2]
1281
+ )
1282
+ small_table, large_table = (
1283
+ (table1, table2) if table1_smaller else (table2, table1)
1284
+ )
1285
+
1286
+ large_card = self.stats_container.base_table_estimates[large_table]
1287
+ small_card = self.stats_container.base_table_estimates[small_table]
1288
+
1289
+ join_predicate = query.predicates().joins_between(large_table, small_table)
1290
+ join_bound = self.join_estimation.estimate_for(join_predicate, join_graph)
1291
+
1292
+ join_tree = LogicalJoinTree.scan(large_table, annotation=large_card)
1293
+ join_tree = join_tree.join_with(
1294
+ small_table, annotation=join_bound, partner_annotation=small_card
1295
+ )
1296
+ return join_tree
1297
+
1298
+ def _star_query_optimizer(
1299
+ self, query: ImplicitSqlQuery, join_graph: JoinGraph
1300
+ ) -> LogicalJoinTree:
1301
+ """Join ordering algorithm for star queries (i.e. queries which only consist of primary key/foreign key joins).
1302
+
1303
+ The algorithm is inspired by UES and always tries to insert the table next that guarantees the smallest upper bound.
1304
+
1305
+ Parameters
1306
+ ----------
1307
+ query : ImplicitSqlQuery
1308
+ The query to optimize
1309
+ join_graph : JoinGraph
1310
+ The join graph of the input query. This structure is mutated during the algorithm.
1311
+
1312
+ Returns
1313
+ -------
1314
+ LogicalJoinTree
1315
+ The resulting join tree
1316
+ """
1317
+ self._log_information("Using star query optimizer")
1318
+ # initial table / join selection
1319
+ lowest_bound = np.inf
1320
+ lowest_bound_join = None
1321
+ for candidate_join in join_graph.available_join_paths():
1322
+ current_bound = self.join_estimation.estimate_for(
1323
+ candidate_join.join_condition, join_graph
1324
+ )
1325
+ if current_bound < lowest_bound:
1326
+ lowest_bound = current_bound
1327
+ lowest_bound_join = candidate_join
1328
+
1329
+ start_table = lowest_bound_join.start_table
1330
+ start_card = self.stats_container.base_table_estimates[start_table]
1331
+ join_tree = LogicalJoinTree.scan(start_table, annotation=start_card)
1332
+ join_graph.mark_joined(start_table)
1333
+ join_tree = self._apply_pk_fk_join(
1334
+ query,
1335
+ lowest_bound_join,
1336
+ join_bound=lowest_bound,
1337
+ join_graph=join_graph,
1338
+ current_join_tree=join_tree,
1339
+ )
1340
+
1341
+ # join partner selection
1342
+ while join_graph.contains_free_tables():
1343
+ lowest_bound = np.inf
1344
+ lowest_bound_join = None
1345
+ for candidate_join in join_graph.available_join_paths():
1346
+ current_bound = self.join_estimation.estimate_for(
1347
+ candidate_join.join_condition, join_graph
1348
+ )
1349
+ if current_bound < lowest_bound:
1350
+ lowest_bound = current_bound
1351
+ lowest_bound_join = candidate_join
1352
+
1353
+ join_tree = self._apply_pk_fk_join(
1354
+ query,
1355
+ lowest_bound_join,
1356
+ join_bound=lowest_bound,
1357
+ join_graph=join_graph,
1358
+ current_join_tree=join_tree,
1359
+ )
1360
+
1361
+ return join_tree
1362
+
1363
+ def _table_base_cardinality_ordering(
1364
+ self, table: TableReference, join_edge: dict
1365
+ ) -> int:
1366
+ """Utility method to impose an ordering of multiple primary key tables for a foreign key join.
1367
+
1368
+ The actual ordering sorts the primary key tables according to their upper bounds and is used internally by the join
1369
+ graph.
1370
+
1371
+ Parameters
1372
+ ----------
1373
+ table : TableReference
1374
+ The table for which the cardinality should be retrieved.
1375
+ join_edge : dict
1376
+ The edge of the join graph that describes the current join. This is ignored by the calculation and only required
1377
+ to satisfy the interface required by the join graph.
1378
+
1379
+ Returns
1380
+ -------
1381
+ int
1382
+ A order index based on the cardinality estimate of the table
1383
+
1384
+ See Also
1385
+ --------
1386
+ joingraph.JoinGraph.available_deep_pk_join_paths_for
1387
+ """
1388
+ return self.stats_container.base_table_estimates[table]
1389
+
1390
+ def _apply_pk_fk_join(
1391
+ self,
1392
+ query: SqlQuery,
1393
+ pk_fk_join: JoinPath,
1394
+ *,
1395
+ join_bound: int,
1396
+ join_graph: JoinGraph,
1397
+ current_join_tree: LogicalJoinTree,
1398
+ ) -> LogicalJoinTree:
1399
+ """Includes a specific pk/fk join into a join tree, taking care of all necessary updates.
1400
+
1401
+ Parameters
1402
+ ----------
1403
+ query : SqlQuery
1404
+ The query that is being optimized
1405
+ pk_fk_join : JoinPath
1406
+ The actual join that should be performed
1407
+ join_bound : int
1408
+ The calculated upper bound of the join
1409
+ join_graph : JoinGraph
1410
+ The join graph of the query. This structure is mutated as part of the update
1411
+ current_join_tree : LogicalJoinTree
1412
+ The join order that has been determined so far
1413
+
1414
+ Returns
1415
+ -------
1416
+ LogicalJoinTree
1417
+ An updated join tree that includes the given join as the last (i.e. top-most) join.
1418
+ """
1419
+ target_table = pk_fk_join.target_table
1420
+ target_cardinality = self.stats_container.base_table_estimates[target_table]
1421
+ updated_join_tree = current_join_tree.join_with(
1422
+ target_table, annotation=join_bound, partner_annotation=target_cardinality
1423
+ )
1424
+ join_graph.mark_joined(target_table, pk_fk_join.join_condition)
1425
+ self.stats_container.upper_bounds[updated_join_tree] = join_bound
1426
+ return updated_join_tree
1427
+
1428
+ def _insert_pk_joins(
1429
+ self,
1430
+ query: SqlQuery,
1431
+ pk_joins: Iterable[JoinPath],
1432
+ join_tree: LogicalJoinTree,
1433
+ join_graph: JoinGraph,
1434
+ ) -> LogicalJoinTree:
1435
+ """Generalization of `_apply_pk_fk_join` to multiple join paths.
1436
+
1437
+ Parameters
1438
+ ----------
1439
+ query : SqlQuery
1440
+ The query that is being optimized
1441
+ pk_joins : Iterable[joingraph.JoinPath]
1442
+ The joins that should be included in the join tree, in the order in which they are inserted
1443
+ join_tree : jointree.LogicalJoinTree
1444
+ The join order that has been determined so far
1445
+ join_graph : joingraph.JoinGraph
1446
+ The join graph of the query. This structure is mutated as part of the update
1447
+
1448
+ Returns
1449
+ -------
1450
+ jointree.LogicalJoinTree
1451
+ An updated join tree that includes all of the join paths. Join paths that appear earlier in the iterable are
1452
+ inserted deeper within the tree.
1453
+ """
1454
+ # TODO: refactor in terms of _apply_pk_fk_join
1455
+ for pk_join in pk_joins:
1456
+ pk_table = pk_join.target_table
1457
+ if not join_graph.is_free_table(pk_table):
1458
+ continue
1459
+ pk_join_bound = self.join_estimation.estimate_for(
1460
+ pk_join.join_condition, join_graph
1461
+ )
1462
+ pk_base_cardinality = self.stats_container.base_table_estimates[pk_table]
1463
+ join_tree = join_tree.join_with(
1464
+ pk_table,
1465
+ annotation=pk_join_bound,
1466
+ partner_annotation=pk_base_cardinality,
1467
+ )
1468
+ join_graph.mark_joined(pk_table, pk_join.join_condition)
1469
+ self.stats_container.upper_bounds[join_tree] = pk_join_bound
1470
+ return join_tree
1471
+
1472
+ def _clone(self) -> UESJoinOrderOptimizer:
1473
+ """Creates a new join order optimizer with the same settings as this one.
1474
+
1475
+ Returns
1476
+ -------
1477
+ UESJoinOrderOptimizer
1478
+ The cloned optimizer
1479
+ """
1480
+ return UESJoinOrderOptimizer(
1481
+ base_table_estimation=copy.copy(self.base_table_estimation),
1482
+ join_estimation=copy.copy(self.join_estimation),
1483
+ subquery_policy=copy.copy(self.subquery_policy),
1484
+ stats_container=copy.copy(self.stats_container),
1485
+ database=self.database,
1486
+ )
1487
+
1488
+ def _log_information(self, info: str) -> None:
1489
+ """Displays arbitrary information.
1490
+
1491
+ The current implementation of this methods writes to *stdout* directly. If logging is disabled, no information is
1492
+ printed.
1493
+
1494
+ Parameters
1495
+ ----------
1496
+ info : str
1497
+ The information to display
1498
+ """
1499
+ if self._logging_enabled:
1500
+ print(info)
1501
+
1502
+ def _log_optimization_progress(
1503
+ self,
1504
+ phase: str,
1505
+ candidate_table: TableReference,
1506
+ pk_joins: Iterable[JoinPath],
1507
+ *,
1508
+ join_condition: AbstractPredicate | None = None,
1509
+ subquery_join: bool | None = None,
1510
+ ) -> None:
1511
+ """Displays the current optimizer state.
1512
+
1513
+ The current implementation of this method writes to *stdout* directly. If logging is disabled, no information is
1514
+ printed.
1515
+
1516
+ Parameters
1517
+ ----------
1518
+ phase : str
1519
+ The phase of the UES algorithm, e.g. initial table selection of n:m join execution
1520
+ candidate_table : TableReference
1521
+ The table that is considered as the next join partner
1522
+ pk_joins : Iterable[JoinPath]
1523
+ Primary key joins that should be applied to the candidate table
1524
+ join_condition : AbstractPredicate | None, optional
1525
+ The join condition that was used to find the candidate table. Can be ``None`` to omit this information, e.g. when
1526
+ it is not applicable for the current phase.
1527
+ subquery_join : bool | None, optional
1528
+ Whether the primary key tables should be joined before the actual n:m join. Can be ``None`` to omit this
1529
+ information, e.g. when it is not applicable for the current phase.
1530
+ """
1531
+ # TODO: use proper logging instead of print() calls
1532
+ if not self._logging_enabled:
1533
+ return
1534
+ log_components = [
1535
+ phase,
1536
+ "::",
1537
+ str(candidate_table),
1538
+ "with PK joins",
1539
+ str(pk_joins),
1540
+ ]
1541
+ if join_condition:
1542
+ log_components.extend(["on condition", str(join_condition)])
1543
+ if subquery_join is not None:
1544
+ log_components.append(
1545
+ "with subquery" if subquery_join else "without subquery"
1546
+ )
1547
+ log_message = " ".join(log_components)
1548
+ print(log_message)
1549
+
1550
+
1551
class UESOperatorSelection(PhysicalOperatorSelection):
    """Implementation of the operator selection used in the UES algorithm.

    UES itself only determines join orders and leaves operator selection to the database. The only operator-level
    decision the paper makes is to globally disable nested-loop joins, since these can cause severe performance
    degradation. In effect, the database system will fall back to hash joins for the vast majority of joins, which
    tend to be the most robust choice.

    Parameters
    ----------
    database : Database
        The target database on which the optimized query should be executed. This parameter enables a graceful fallback in
        case the database does not support a nested-loop join in the first place. If this situation occurs, nothing is
        disabled.

    Notes
    -----
    Although the UES join order optimizer never produces physical query plans and is only concerned with logical join
    trees, this selection algorithm handles physical plans gracefully by retaining all former operator assignments that do
    not contradict the no-nested-loop join rule.
    """

    def __init__(self, database: Database) -> None:
        super().__init__()
        self.database = database

    def select_physical_operators(
        self, query: SqlQuery, join_order: Optional[JoinTree]
    ) -> PhysicalOperatorAssignment:
        assignment = PhysicalOperatorAssignment()
        # Only disable nested-loop joins if the target database can actually hint them away.
        hinting_backend = self.database.hinting()
        if hinting_backend.supports_hint(JoinOperator.NestedLoopJoin):
            assignment.set_operator_enabled_globally(
                JoinOperator.NestedLoopJoin,
                False,
                overwrite_fine_grained_selection=True,
            )
        return assignment

    def describe(self) -> dict:
        return {"name": "ues"}
1592
# Pre-generated check instance shared by the UES stages (see e.g. UESBoundEstimator.pre_check).
UESOptimizationPreCheck: OptimizationPreCheck = merge_checks(
    ImplicitQueryPreCheck(),
    EquiJoinPreCheck(),
    DependentSubqueryPreCheck(),
    SetOperationsPreCheck(),
    VirtualTablesPreCheck(),
)
"""Check for all query features that UES does (not) support.

This check asserts that the following criteria are met:

- the input query is an *implicit* SQL query (see qal for details)
- all join predicates are binary equi joins
- there are no dependent subqueries
- there are no virtual tables, including no CTEs
"""