PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. postbound/__init__.py +211 -0
  2. postbound/_base.py +6 -0
  3. postbound/_bench.py +1012 -0
  4. postbound/_core.py +1153 -0
  5. postbound/_hints.py +1373 -0
  6. postbound/_jointree.py +1079 -0
  7. postbound/_pipelines.py +1121 -0
  8. postbound/_qep.py +1986 -0
  9. postbound/_stages.py +876 -0
  10. postbound/_validation.py +734 -0
  11. postbound/db/__init__.py +72 -0
  12. postbound/db/_db.py +2348 -0
  13. postbound/db/_duckdb.py +785 -0
  14. postbound/db/mysql.py +1195 -0
  15. postbound/db/postgres.py +4216 -0
  16. postbound/experiments/__init__.py +12 -0
  17. postbound/experiments/analysis.py +674 -0
  18. postbound/experiments/benchmarking.py +54 -0
  19. postbound/experiments/ceb.py +877 -0
  20. postbound/experiments/interactive.py +105 -0
  21. postbound/experiments/querygen.py +334 -0
  22. postbound/experiments/workloads.py +980 -0
  23. postbound/optimizer/__init__.py +92 -0
  24. postbound/optimizer/__init__.pyi +73 -0
  25. postbound/optimizer/_cardinalities.py +369 -0
  26. postbound/optimizer/_joingraph.py +1150 -0
  27. postbound/optimizer/dynprog.py +1825 -0
  28. postbound/optimizer/enumeration.py +432 -0
  29. postbound/optimizer/native.py +539 -0
  30. postbound/optimizer/noopt.py +54 -0
  31. postbound/optimizer/presets.py +147 -0
  32. postbound/optimizer/randomized.py +650 -0
  33. postbound/optimizer/tonic.py +1479 -0
  34. postbound/optimizer/ues.py +1607 -0
  35. postbound/qal/__init__.py +343 -0
  36. postbound/qal/_qal.py +9678 -0
  37. postbound/qal/formatter.py +1089 -0
  38. postbound/qal/parser.py +2344 -0
  39. postbound/qal/relalg.py +4257 -0
  40. postbound/qal/transform.py +2184 -0
  41. postbound/shortcuts.py +70 -0
  42. postbound/util/__init__.py +46 -0
  43. postbound/util/_errors.py +33 -0
  44. postbound/util/collections.py +490 -0
  45. postbound/util/dataframe.py +71 -0
  46. postbound/util/dicts.py +330 -0
  47. postbound/util/jsonize.py +68 -0
  48. postbound/util/logging.py +106 -0
  49. postbound/util/misc.py +168 -0
  50. postbound/util/networkx.py +401 -0
  51. postbound/util/numbers.py +438 -0
  52. postbound/util/proc.py +107 -0
  53. postbound/util/stats.py +37 -0
  54. postbound/util/system.py +48 -0
  55. postbound/util/typing.py +35 -0
  56. postbound/vis/__init__.py +5 -0
  57. postbound/vis/fdl.py +69 -0
  58. postbound/vis/graphs.py +48 -0
  59. postbound/vis/optimizer.py +538 -0
  60. postbound/vis/plots.py +84 -0
  61. postbound/vis/tonic.py +70 -0
  62. postbound/vis/trees.py +105 -0
  63. postbound-0.19.0.dist-info/METADATA +355 -0
  64. postbound-0.19.0.dist-info/RECORD +67 -0
  65. postbound-0.19.0.dist-info/WHEEL +5 -0
  66. postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
  67. postbound-0.19.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1121 @@
1
+ """Provides PostBOUND's main optimization pipeline.
2
+
3
+ In fact, PostBOUND does not provide a single pipeline implementation. Rather, different pipeline types exist to accommodate
4
+ different use-cases. See the documentation of the general `OptimizationPipeline` base class for details. That class serves as
5
+ the smallest common denominator among all pipeline implementations.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import abc
11
+ from typing import Optional, Protocol, Self
12
+
13
+ from ._hints import PhysicalOperatorAssignment, PlanParameterization
14
+ from ._qep import QueryPlan
15
+ from ._stages import (
16
+ CardinalityEstimator,
17
+ CompleteOptimizationAlgorithm,
18
+ CostModel,
19
+ IncrementalOptimizationStep,
20
+ JoinOrderOptimization,
21
+ ParameterGeneration,
22
+ PhysicalOperatorSelection,
23
+ PlanEnumerator,
24
+ )
25
+ from ._validation import (
26
+ EmptyPreCheck,
27
+ OptimizationPreCheck,
28
+ UnsupportedQueryError,
29
+ UnsupportedSystemError,
30
+ merge_checks,
31
+ )
32
+ from .db._db import Database, DatabasePool
33
+ from .qal._qal import SqlQuery
34
+ from .util._errors import StateError
35
+ from .util.jsonize import jsondict
36
+
37
+
38
class OptimizationPipeline(abc.ABC):
    """Smallest common interface shared by all optimization pipelines.

    A pipeline is the main tool to apply optimization strategies to SQL queries. Different concrete implementations exist
    for different scenarios:

    - `MultiStageOptimizationPipeline` models multi-stage optimizer designs, e.g. join ordering followed by a subsequent
      physical operator selection.
    - `IntegratedOptimizationPipeline` targets algorithms that perform join ordering and operator selection in one
      process.
    - `TextBookOptimizationPipeline` is modelled after the traditional interplay of cardinality estimator, cost model and
      plan enumerator.
    - `IncrementalOptimizationPipeline` models approaches that improve query plans step by step by correcting previous
      optimization decisions (e.g. transforming a hash join into a nested loop join).

    Consult the documentation of the individual pipelines for more details. This class only describes the basic
    interface.

    If in doubt which pipeline to pick, it is probably best to start with the `MultiStageOptimizationPipeline` or the
    `TextBookOptimizationPipeline`, since they are the most flexible.
    """

    @abc.abstractmethod
    def query_execution_plan(self, query: SqlQuery) -> QueryPlan:
        """Computes an optimized execution plan for the input query using the current pipeline configuration.

        Parameters
        ----------
        query : SqlQuery
            The query to optimize.

        Returns
        -------
        QueryPlan
            An optimized query execution plan for the input query.

            If the optimization strategies only provide partial optimization decisions (e.g. physical operators for a
            subset of the joins), it is up to the pipeline to fill the gaps in order to provide a complete execution
            plan. A typical approach could be to delegate this task to the optimizer of the target database by providing
            it the partial optimization information.

        Raises
        ------
        UnsupportedQueryError
            If the selected optimization algorithms cannot be applied to the specific query, e.g. because it contains
            unsupported features.
        """
        raise NotImplementedError

    def optimize_query(self, query: SqlQuery) -> SqlQuery:
        """Optimizes the input query and encodes the optimization decisions in a new query.

        The default implementation first obtains the execution plan via `query_execution_plan`. Afterwards, the hinting
        service of the `target_database` generates the optimization information that enforces the selected plan when the
        query is executed on an actual database system.

        Parameters
        ----------
        query : SqlQuery
            The query to optimize.

        Returns
        -------
        SqlQuery
            A transformed query that encapsulates all the optimization decisions made by the pipeline. What this
            actually means depends on the selected optimization strategies, as well as on specifics of the target
            database system:

            Depending on the optimization strategy, the optimization decisions can range from simple operator selections
            (such as "no nested loop join for this join") to entire physical query execution plans (consisting of a
            join order, as well as scan and join operators for all parts of the plan) and anything in between. For
            novel cardinality estimation approaches, the optimization info could also be structured such that the
            default cardinality estimates are overwritten.

            Furthermore, the way the optimization info is expressed depends on the selected database system. Most
            systems do not allow a direct modification of the query optimizer's implementation. Therefore, PostBOUND
            takes an indirect approach: it emits system-specific hints that enable corrections for individual optimizer
            decisions (such as disabling a specific physical operator). For example, PostgreSQL allows to use planner
            options such as ``SET enable_nestloop = 'off'`` to disable nested loop joins for all subsequent queries in
            the current connection. MySQL provides hints like ``BNL(R S)`` to recommend a block-nested loop join or hash
            join (depending on the MySQL version) to the optimizer for a specific join. These hints are inserted into
            comment blocks in the final SQL query. Likewise, some systems treat certain SQL keywords differently or
            provide their own extensions. This also allows to modify the underlying plans. For example, when SQLite
            encounters a *CROSS JOIN* syntax in the *FROM* clause, it does not try to optimize the join order and uses
            the order in which the tables are specified in the relation instead.

            Therefore, the resulting query will differ from the original input query in a number of ways. However, the
            produced result sets should still be equivalent. If this is not the case, something went severely wrong
            during query optimization. Take a look at the `db` module for more details on the database system support
            and the query generation capabilities.

        Raises
        ------
        UnsupportedQueryError
            If the selected optimization algorithms cannot be applied to the specific query, e.g. because it contains
            unsupported features.

        References
        ----------
        - PostgreSQL query planning options: https://www.postgresql.org/docs/15/runtime-config-query.html
        - MySQL optimizer hints: https://dev.mysql.com/doc/refman/8.0/en/optimizer-hints.html
        - SQLite *CROSS JOIN* handling: https://www.sqlite.org/optoverview.html#crossjoin
        """
        # The plan is computed before the database is touched, so a failing optimization never reaches the hinting.
        plan = self.query_execution_plan(query)
        return self.target_database().hinting().generate_hints(query, plan)

    @abc.abstractmethod
    def target_database(self) -> Database:
        """Provides the database that the pipeline currently optimizes for.

        Returns
        -------
        Database
            The database for which the input queries should be optimized.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def describe(self) -> jsondict:
        """Generates a description of the current pipeline configuration.

        The description is intended to transparently document which optimization strategies have been selected and how
        they have been instantiated. It can be JSON-serialized and will be included in the output of the benchmarking
        utilities.

        Returns
        -------
        jsondict
            The actual description.
        """
        raise NotImplementedError
165
+
166
+
167
class IntegratedOptimizationPipeline(OptimizationPipeline):
    """This pipeline is intended for algorithms that calculate the entire query plan in a single process.

    To configure the pipeline, use the `setup_optimization_algorithm` method followed by the `build` method (in line with
    the other pipelines).

    Parameters
    ----------
    target_db : Optional[Database], optional
        The database for which the optimized queries should be generated. If this is not given, the default database is
        extracted from the `DatabasePool`.
    """

    def __init__(self, target_db: Optional[Database] = None) -> None:
        self._target_db = (
            target_db
            if target_db is not None
            else DatabasePool.get_instance().current_database()
        )
        self._optimization_algorithm: Optional[CompleteOptimizationAlgorithm] = None
        # Becomes True once `build()` succeeded; every configuration change resets it.
        self._build = False
        super().__init__()

    @property
    def target_db(self) -> Database:
        """The database for which optimized queries should be generated.

        When assigning a new target database, the pipeline has to be build again.

        Returns
        -------
        Database
            The currently selected database system

        See Also
        --------
        CompleteOptimizationAlgorithm.pre_check
        """
        return self._target_db

    @target_db.setter
    def target_db(self, system: Database) -> None:
        self._build = False
        self._target_db = system

    @property
    def optimization_algorithm(self) -> Optional[CompleteOptimizationAlgorithm]:
        """The optimization algorithm is used each time a query should be optimized.

        Returns
        -------
        Optional[CompleteOptimizationAlgorithm]
            The currently selected optimization algorithm, if any.
        """
        return self._optimization_algorithm

    def setup_optimization_algorithm(
        self, algorithm: CompleteOptimizationAlgorithm
    ) -> Self:
        """Configures the pipeline to use the given optimization algorithm.

        Setting a new algorithm requires the pipeline to be build again.

        Parameters
        ----------
        algorithm : CompleteOptimizationAlgorithm
            The new optimization algorithm to use. No compatibility checks are performed, yet. This is done when building
            the pipeline.

        Returns
        -------
        IntegratedOptimizationPipeline
            The current pipeline to allow for easy method-chaining.
        """
        self._optimization_algorithm = algorithm
        # The database compatibility check of the new algorithm only runs in `build()`. Invalidate the previous build
        # so that the check cannot be skipped (same convention as the `target_db` setter).
        self._build = False
        return self

    def build(self) -> Self:
        """Constructs the optimization pipeline.

        This includes checking the selected optimization algorithm for compatibility with the `target_db`. Afterwards,
        the pipeline is ready to optimize queries.

        Returns
        -------
        IntegratedOptimizationPipeline
            The current pipeline to allow for easy method-chaining.

        Raises
        ------
        StateError
            If no optimization algorithm has been selected yet.
        UnsupportedSystemError
            If the optimization algorithm is not compatible with the current target database system.

        See Also
        --------
        CompleteOptimizationAlgorithm.pre_check
        """
        if self._optimization_algorithm is None:
            raise StateError(
                "No algorithm has been selected. Call `setup_optimization_algorithm()` before `build()`."
            )

        pre_check = self._optimization_algorithm.pre_check()
        if pre_check is not None:
            # The database is passed as context, mirroring `TextBookOptimizationPipeline.build`. Presumably this is
            # what lets a failed check raise the system-specific error documented above - confirm in `_validation`.
            pre_check.check_supported_database_system(
                self._target_db
            ).ensure_all_passed(self._target_db)
        self._build = True
        return self

    def query_execution_plan(self, query: SqlQuery) -> QueryPlan:
        """Optimizes the query with the selected algorithm.

        Parameters
        ----------
        query : SqlQuery
            The query that should be optimized

        Returns
        -------
        QueryPlan
            The plan as computed by the optimization algorithm.

        Raises
        ------
        StateError
            If the pipeline has not been build since its last configuration change.
        UnsupportedQueryError
            If the pre-check of the algorithm rejects the query.
        """
        if not self._build:
            raise StateError(
                "No algorithm has been selected. Don't forget to call `build()` after setting the algorithm."
            )

        algorithm = self._optimization_algorithm
        pre_check = algorithm.pre_check()
        if pre_check is not None:
            # The query is passed as context, in line with `TextBookOptimizationPipeline.query_execution_plan`.
            pre_check.check_supported_query(query).ensure_all_passed(query)

        return algorithm.optimize_query(query)

    def target_database(self) -> Database:
        return self._target_db

    def describe(self) -> jsondict:
        algorithm_description = (
            self._optimization_algorithm.describe()
            if self._optimization_algorithm is not None
            else "no_algorithm"
        )
        return {
            "name": "integrated_pipeline",
            "database_system": self._target_db.describe(),
            "optimization_algorithm": algorithm_description,
        }

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"IntegratedOptimization [{self._optimization_algorithm}]"
303
+
304
+
305
class TextBookOptimizationPipeline(OptimizationPipeline):
    """Pipeline that follows the traditional optimizer architecture used in most real-world systems.

    The optimizer is made up of three components: a cardinality estimator that calculates the size of intermediate
    results, a cost model that quantifies how expensive specific access paths for the intermediates are, and an
    enumerator that generates the intermediates in the first place.

    To configure the pipeline, specific strategies for each of the three components have to be assigned. The constructor
    pre-selects the native cardinality estimator, the native cost model and a dynamic programming enumerator.

    Parameters
    ----------
    target_db : Database
        The database for which the optimized queries should be generated.
    """

    def __init__(self, target_db: Database) -> None:
        # These modules are imported locally rather than at module level - presumably to avoid import cycles.
        from .db.postgres import PostgresInterface
        from .optimizer.dynprog import DynamicProgrammingEnumerator, PostgresDynProg
        from .optimizer.native import NativeCardinalityEstimator, NativeCostModel

        self._target_db = target_db
        self._card_est: CardinalityEstimator = NativeCardinalityEstimator()
        self._cost_model: CostModel = NativeCostModel()

        if not isinstance(target_db, PostgresInterface):
            self._plan_enumerator = DynamicProgrammingEnumerator(target_db=target_db)
        else:
            # Postgres databases get the Postgres-specific enumerator, which infers its settings right away.
            self._plan_enumerator = PostgresDynProg(target_db=target_db)
            self._plan_enumerator.infer_settings()

        self._support_check = EmptyPreCheck()
        self._build = False

    def target_database(self) -> Database:
        return self._target_db

    def setup_cardinality_estimator(self, estimator: CardinalityEstimator) -> Self:
        """Selects the cardinality estimator of the optimizer.

        The pipeline has to be build again after a new estimator has been assigned.

        Parameters
        ----------
        estimator : CardinalityEstimator
            The estimator to be used

        Returns
        -------
        self
            The current pipeline to allow for easy method-chaining.
        """
        self._card_est = estimator
        self._build = False
        return self

    def setup_cost_model(self, cost_model: CostModel) -> Self:
        """Selects the cost model of the optimizer.

        The pipeline has to be build again after a new cost model has been assigned.

        Parameters
        ----------
        cost_model : CostModel
            The cost model to be used

        Returns
        -------
        self
            The current pipeline to allow for easy method-chaining.
        """
        self._cost_model = cost_model
        self._build = False
        return self

    def setup_plan_enumerator(self, plan_enumerator: PlanEnumerator) -> Self:
        """Selects the plan enumerator of the optimizer.

        The pipeline has to be build again after a new enumerator has been assigned.

        Parameters
        ----------
        plan_enumerator : PlanEnumerator
            The enumerator to be used

        Returns
        -------
        self
            The current pipeline to allow for easy method-chaining.
        """
        self._plan_enumerator = plan_enumerator
        self._build = False
        return self

    def use(self, component: PlanEnumerator | CostModel | CardinalityEstimator) -> Self:
        """Shortcut method to setup the pipeline. Delegates to the appropriate setup_XXX method.

        Raises
        ------
        TypeError
            If the component is neither a plan enumerator, nor a cost model, nor a cardinality estimator.
        """
        # The order of the type checks matters for components that implement more than one of the interfaces.
        if isinstance(component, PlanEnumerator):
            return self.setup_plan_enumerator(component)
        if isinstance(component, CostModel):
            return self.setup_cost_model(component)
        if isinstance(component, CardinalityEstimator):
            return self.setup_cardinality_estimator(component)
        raise TypeError(f"Unsupported component type: {type(component)}")

    def build(self) -> Self:
        """Constructs the optimization pipeline.

        The pre-checks of all three components are merged and the resulting check is validated against the `target_db`.
        Afterwards, the pipeline is ready to optimize queries.

        Returns
        -------
        self
            The current pipeline to allow for easy method-chaining.

        Raises
        ------
        StateError
            If one of the three components is missing.
        UnsupportedSystemError
            If any of the selected optimization stages is not compatible with the `target_db`.
        """
        required_components = (
            (self._card_est, "Missing cardinality estimator"),
            (self._cost_model, "Missing cost model"),
            (self._plan_enumerator, "Missing plan enumerator"),
        )
        for component, error_message in required_components:
            if component is None:
                raise StateError(error_message)

        component_checks = [
            self._card_est.pre_check(),
            self._cost_model.pre_check(),
            self._plan_enumerator.pre_check(),
        ]
        self._support_check = merge_checks(component_checks)

        system_check = self._support_check.check_supported_database_system(
            self._target_db
        )
        system_check.ensure_all_passed(self._target_db)

        self._build = True
        return self

    def query_execution_plan(self, query: SqlQuery) -> QueryPlan:
        if not self._build:
            raise StateError("Pipeline has not been build")

        query_check = self._support_check.check_supported_query(query)
        query_check.ensure_all_passed(query)

        return self._plan_enumerator.generate_execution_plan(
            query,
            cost_model=self._cost_model,
            cardinality_estimator=self._card_est,
        )

    def describe(self) -> jsondict:
        def _describe_or_none(component):
            # Components can be unset (None); those are reported as None instead of a description.
            return component.describe() if component is not None else None

        return {
            "name": "textbook_pipeline",
            "database_system": self._target_db.describe(),
            "plan_enumerator": _describe_or_none(self._plan_enumerator),
            "cost_model": _describe_or_none(self._cost_model),
            "cardinality_estimator": _describe_or_none(self._card_est),
        }

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        opt_chain = " + ".join(
            map(str, (self._plan_enumerator, self._cost_model, self._card_est))
        )
        return f"TextBookOptimization [{opt_chain}]"
484
+
485
+
486
+ class MultiStageOptimizationPipeline(OptimizationPipeline):
487
+ """This optimization pipeline performs query optimization in separate phases.
488
+
489
+ The pipeline is organized in two large stages (join ordering and physical operator selection), which are
490
+ accompanied by an initial pre-check and a final plan parameterization step. In total, those four individual steps
491
+ completely specify the optimization settings that should be applied to an incoming query. For each of the steps
492
+ a general interface exists that must be implemented by the selected strategies.
493
+
494
+ The steps are applied in consecutive order and perform the following tasks:
495
+
496
+ 1. the incoming query is checked for unsupported features
497
+ 2. an optimized join order for the query is calculated
498
+ 3. appropriate physical operators are determined, depending on the join order
499
+ 4. the query plan (join order + physical operators) is further parameterized, for example with custom cardinality estimates
500
+
501
+ All steps are optional. If they are not specified, no operation will be performed at the specific stage. Effectively, this
502
+ means that the query optimizer of the target database system needs to step in and "fill the gaps". For example, if no
503
+ join ordering is performed, the native optimizer needs to come up with a join order. But, the native optimizer will use
504
+ the selected physical operators to perform these joins. Likewise, specifying only a join order means that the native
505
+ optimizer will select its own physical operators. If cardinalities are provided, they are used to guide the native
506
+ optimizer. As an extreme case, one can skip join ordering and physical operator selection completely and only compute
507
+ cardinality estimates in the parameterization step. This way, a different cardinality estimator can be simulated without
508
+ using the `TextBookOptimizationPipeline`. This has the advantage that no default strategies for cost estimation and plan
509
+ enumeration need to be simulated and the actual algorithms from the target database are used.
510
+
511
+ Once the optimization settings have been selected via the *setup* methods (or alternatively via the `load_settings`
512
+ functionality), the pipeline has to be build using the `build` method. Afterwards, it is ready to optimize
513
+ input queries.
514
+
515
+ A pipeline depends on a specific database system. This is necessary to produce the appropriate metadata for an
516
+ input query (i.e. to apply the specifics that enforce the optimized query plan during query execution for the
517
+ database system). This field can be changed between optimization calls to use the same pipeline for different
518
+ systems.
519
+
520
+ As a shortcut, `load_settings` can be used to initialize a pipeline with pre-defined optimization strategies.
521
+
522
+ Parameters
523
+ ----------
524
+ target_db : Database
525
+ The database for which the optimized queries should be generated.
526
+
527
+
528
+ Examples
529
+ --------
530
+ >>> pipeline = pb.MultiStageOptimizationPipeline(postgres_db)
531
+ >>> pipeline.load_settings(ues_settings)
532
+ >>> pipeline.build()
533
+ >>> pipeline.optimize_query(join_order_benchmark["1a"])
534
+ """
535
+
536
+ def __init__(self, target_db: Database) -> None:
537
+ self._target_db = target_db
538
+ self._pre_check: OptimizationPreCheck | None = EmptyPreCheck()
539
+ self._join_order_enumerator: JoinOrderOptimization | None = None
540
+ self._physical_operator_selection: PhysicalOperatorSelection | None = None
541
+ self._plan_parameterization: ParameterGeneration | None = None
542
+ self._build = False
543
+
544
+ @property
545
+ def target_db(self) -> Database:
546
+ """The database for which optimized queries should be generated.
547
+
548
+ When assigning a new target database, the pipeline needs to be build again.
549
+
550
+ Returns
551
+ -------
552
+ Database
553
+ The currently selected database system
554
+ """
555
+ return self._target_db
556
+
557
+ @target_db.setter
558
+ def target_db(self, new_db: Database) -> None:
559
+ self._target_db = new_db
560
+ self._build = False
561
+
562
+ @property
563
+ def pre_check(self) -> Optional[OptimizationPreCheck]:
564
+ """An overarching check that should be applied to all queries before they are optimized.
565
+
566
+ This check complements the pre checks of the individual stages and can be used to enforce experiment-specific
567
+ constraints.
568
+
569
+ Returns
570
+ -------
571
+ Optional[OptimizationPreCheck]
572
+ The current check, if any. Can also be an `EmptyPreCheck` instance.
573
+ """
574
+ return self._pre_check
575
+
576
+ @property
577
+ def join_order_enumerator(self) -> Optional[JoinOrderOptimization]:
578
+ """The selected join order optimization algorithm.
579
+
580
+ Returns
581
+ -------
582
+ Optional[JoinOrderOptimization]
583
+ The current algorithm, if any has been selected.
584
+ """
585
+ return self._join_order_enumerator
586
+
587
+ @property
588
+ def physical_operator_selection(self) -> Optional[PhysicalOperatorSelection]:
589
+ """The selected operator selection algorithm.
590
+
591
+ Returns
592
+ -------
593
+ Optional[PhysicalOperatorSelection]
594
+ The current algorithm, if any has been selected.
595
+ """
596
+ return self._physical_operator_selection
597
+
598
+ @property
599
+ def plan_parameterization(self) -> Optional[ParameterGeneration]:
600
+ """The selected parameterization algorithm.
601
+
602
+ Returns
603
+ -------
604
+ Optional[ParameterGeneration]
605
+ The current algorithm, if any has been selected.
606
+ """
607
+ return self._plan_parameterization
608
+
609
+ def setup_query_support_check(self, check: OptimizationPreCheck) -> Self:
610
+ """Configures the pre-check that should be executed for each query.
611
+
612
+ This check will be combined with any additional checks that are required by the actual optimization strategies.
613
+ Setting a new check requires the pipeline to be build again.
614
+
615
+ Parameters
616
+ ----------
617
+ check : OptimizationPreCheck
618
+ The new check
619
+
620
+ Returns
621
+ -------
622
+ self
623
+ The current pipeline to allow for easy method-chaining.
624
+ """
625
+ self._pre_check = check
626
+ self._build = False
627
+ return self
628
+
629
+ def setup_join_order_optimization(self, enumerator: JoinOrderOptimization) -> Self:
630
+ """Configures the pipeline to obtain an optimized join order.
631
+
632
+ The actual strategy can either produce a purely logical join order, or an initial physical query execution plan
633
+ that also specifies how the individual joins should be executed. All later stages are expected to work with
634
+ these two cases.
635
+
636
+ Setting a new algorithm requires the pipeline to be build again.
637
+
638
+ Parameters
639
+ ----------
640
+ enumerator : JoinOrderOptimization
641
+ The new join order optimization algorithm
642
+
643
+ Returns
644
+ -------
645
+ self
646
+ The current pipeline to allow for easy method-chaining.
647
+ """
648
+ self._join_order_enumerator = enumerator
649
+ self._build = False
650
+ return self
651
+
652
+ def setup_physical_operator_selection(
653
+ self, selector: PhysicalOperatorSelection
654
+ ) -> Self:
655
+ """Configures the algorithm to assign physical operators to the query.
656
+
657
+ This algorithm receives the input query as well as the join order (if there is one) as input. In a special
658
+ case, this join order can also provide an initial assignment of physical operators. These settings can then
659
+ be further adapted by the selected algorithm (or completely overwritten).
660
+
661
+ Setting a new algorithm requires the pipeline to be build again.
662
+
663
+ Parameters
664
+ ----------
665
+ selector : PhysicalOperatorSelection
666
+ The new operator selection algorithm
667
+
668
+ Returns
669
+ -------
670
+ self
671
+ The current pipeline to allow for easy method-chaining.
672
+ """
673
+ self._physical_operator_selection = selector
674
+ self._build = False
675
+ return self
676
+
677
+ def setup_plan_parameterization(self, param_generator: ParameterGeneration) -> Self:
678
+ """Configures the algorithm to parameterize the query plan.
679
+
680
+ This algorithm receives the input query as well as the join order and the physical operators (if those have
681
+ been determined yet) as input.
682
+
683
+ Setting a new algorithm requires the pipeline to be build again.
684
+
685
+ Parameters
686
+ ----------
687
+ param_generator : ParameterGeneration
688
+ The new parameterization algorithm
689
+
690
+ Returns
691
+ -------
692
+ self
693
+ The current pipeline to allow for easy method-chaining.
694
+ """
695
+ self._plan_parameterization = param_generator
696
+ self._build = False
697
+ return self
698
+
699
+ def use(
700
+ self,
701
+ component: JoinOrderOptimization
702
+ | PhysicalOperatorSelection
703
+ | ParameterGeneration,
704
+ ) -> Self:
705
+ """Shortcut method to setup the pipeline. Delegates to the appropriate setup_XXX method."""
706
+ match component:
707
+ case JoinOrderOptimization():
708
+ return self.setup_join_order_optimization(component)
709
+ case PhysicalOperatorSelection():
710
+ return self.setup_physical_operator_selection(component)
711
+ case ParameterGeneration():
712
+ return self.setup_plan_parameterization(component)
713
+ case _:
714
+ raise TypeError(f"Unsupported component type: {type(component)}")
715
+
716
def load_settings(self, optimization_settings: OptimizationSettings) -> Self:
    """Installs all components of a pre-defined optimization strategy in the pipeline.

    The factories of the settings are queried one after another (query pre-check, join ordering, operator
    selection, plan parameterization). Each component is handed to the matching *setup* method right away. Factories
    that produce a falsy result (typically ``None``) are skipped, which leaves the corresponding part of the pipeline
    as it currently is. Individual components can still be replaced afterwards using the *setup* methods.

    Loading new settings requires the pipeline to be built again.

    Parameters
    ----------
    optimization_settings : OptimizationSettings
        The settings to apply

    Returns
    -------
    self
        The current pipeline to allow for easy method-chaining.
    """
    if pre_check := optimization_settings.query_pre_check():
        self.setup_query_support_check(pre_check)
    if join_orderer := optimization_settings.build_join_order_optimizer():
        self.setup_join_order_optimization(join_orderer)
    if operator_selector := optimization_settings.build_physical_operator_selection():
        self.setup_physical_operator_selection(operator_selector)
    if parameterizer := optimization_settings.build_plan_parameterization():
        self.setup_plan_parameterization(parameterizer)

    # The pipeline is invalidated even if the settings did not contribute a single component.
    self._build = False
    return self
749
+
750
def build(self) -> Self:
    """Finalizes the optimization pipeline.

    The pre-checks of the pipeline itself and of all configured stages are merged into a single check. This merged
    check is then used to verify that the selected strategies are compatible with the target database. If they are,
    the pipeline is marked as built and can be used to optimize queries.

    Returns
    -------
    self
        The current pipeline to allow for easy method-chaining.

    Raises
    ------
    UnsupportedSystemError
        If the merged pre-check does not support the target database.
    """
    # The pipeline-level check always takes part. Stage checks are only added for stages that have been set up.
    collected_checks = [self.pre_check]
    for stage in (
        self.join_order_enumerator,
        self.physical_operator_selection,
        self.plan_parameterization,
    ):
        if stage is not None:
            collected_checks.append(stage.pre_check())
    self._pre_check = merge_checks(collected_checks)

    system_support = self._pre_check.check_supported_database_system(self._target_db)
    if system_support.passed:
        self._build = True
        return self
    raise UnsupportedSystemError(self.target_db, system_support.failure_reason)
784
+
785
def target_database(self) -> Database:
    """Provides the database system that the pipeline generates optimized queries for.

    Returns
    -------
    Database
        The current value of `target_db`
    """
    database = self.target_db
    return database
787
+
788
def query_execution_plan(self, query: SqlQuery) -> QueryPlan:
    """Optimizes the query and retrieves the execution plan of the optimized query from the target database.

    Parameters
    ----------
    query : SqlQuery
        The query to optimize

    Returns
    -------
    QueryPlan
        The plan that the optimizer interface of the target database reports for the optimized query
    """
    hinted_query = self.optimize_query(query)
    native_optimizer = self.target_db.optimizer()
    return native_optimizer.query_plan(hinted_query)
791
+
792
def optimize_query(self, query: SqlQuery) -> SqlQuery:
    """Runs all configured optimization stages and encodes their decisions in the query.

    The stages are executed in sequence: join ordering, physical operator selection and plan parameterization.
    Each stage receives the results of the earlier ones. Stages that have not been set up are skipped: a missing
    join ordering yields ``None``, while missing operator selection or parameterization yield empty assignments.
    Finally, the hinting service of the target database generates the optimized query.

    Parameters
    ----------
    query : SqlQuery
        The query to optimize

    Returns
    -------
    SqlQuery
        The result of the hint generation for the input query

    Raises
    ------
    UnsupportedQueryError
        If the pre-check of the pipeline rejects the query
    """
    self._assert_is_build()
    query_support = self._pre_check.check_supported_query(query)
    if not query_support.passed:
        raise UnsupportedQueryError(query, query_support.failure_reason)

    enumerator = self.join_order_enumerator
    if enumerator is None:
        join_order = None
    else:
        join_order = enumerator.optimize_join_order(query)

    operator_selector = self.physical_operator_selection
    if operator_selector is None:
        physical_operators = PhysicalOperatorAssignment()
    else:
        physical_operators = operator_selector.select_physical_operators(
            query, join_order
        )

    parameterizer = self.plan_parameterization
    if parameterizer is None:
        plan_parameters = PlanParameterization()
    else:
        plan_parameters = parameterizer.generate_plan_parameters(
            query, join_order, physical_operators
        )

    hint_service = self._target_db.hinting()
    return hint_service.generate_hints(
        query,
        join_order=join_order,
        physical_operators=physical_operators,
        plan_parameters=plan_parameters,
    )
824
+
825
def describe(self) -> jsondict:
    """Summarizes the pipeline configuration as a dictionary.

    Returns
    -------
    jsondict
        The description of the target database along with the descriptions of the pre-check and all stages.
        Components that are not set (or falsy) are represented as ``None``.
    """

    def describe_if_set(component):
        # Deliberately a truthiness test and not an ``is None`` comparison.
        return component.describe() if component else None

    return {
        "name": "multi_stage_pipeline",
        "database_system": self._target_db.describe(),
        "query_pre_check": describe_if_set(self._pre_check),
        "join_ordering": describe_if_set(self._join_order_enumerator),
        "operator_selection": describe_if_set(self._physical_operator_selection),
        "plan_parameterization": describe_if_set(self._plan_parameterization),
    }
842
+
843
def _assert_is_build(self) -> None:
    """Ensures that the pipeline has been built.

    Raises
    ------
    StateError
        If the build flag of the pipeline is not set
    """
    if self._build:
        return
    raise StateError("Pipeline has not been build")
847
+
848
def __repr__(self) -> str:
    """Provides the same text as ``str()`` for the pipeline."""
    text = str(self)
    return text
850
+
851
def __str__(self) -> str:
    """Renders the configured stages as a chain, in the order in which they are applied.

    Stages that have not been set up appear with their plain string representation (e.g. ``None``).
    """
    stages = (
        self._join_order_enumerator,
        self._physical_operator_selection,
        self._plan_parameterization,
    )
    opt_chain = " -> ".join(map(str, stages))
    return f"MultiStageOptimization [{opt_chain}]"
859
+
860
+
861
class IncrementalOptimizationPipeline(OptimizationPipeline):
    """This optimization pipeline can be thought of as a generalization of the `MultiStageOptimizationPipeline`.

    Instead of only operating in a fixed number of stages, an arbitrary amount of optimization steps can be applied.
    During each step an entire physical query execution plan is received as input and also produced as output.
    Therefore, partial operator assignments or cardinality estimates are not supported by this pipeline. The
    incremental nature probably makes it the most useful for optimization strategies that continuously improve query
    plans.

    Parameters
    ----------
    target_db : Database
        The database for which the optimized queries should be generated.
    """

    def __init__(self, target_db: Database) -> None:
        self._target_db = target_db
        self._initial_plan_generator: Optional[CompleteOptimizationAlgorithm] = None
        self._optimization_steps: list[IncrementalOptimizationStep] = []

    @property
    def target_db(self) -> Database:
        """The database for which optimized queries should be generated.

        When a new target database is selected, all optimization steps are checked for support of the new database.

        Returns
        -------
        Database
            The currently selected target database

        Raises
        ------
        UnsupportedSystemError
            If any of the optimization steps or the initial plan generator cannot work with the target database
        """
        return self._target_db

    @target_db.setter
    def target_db(self, database: Database) -> None:
        # Validate first, so that a rejected database leaves the pipeline unchanged.
        self._ensure_pipeline_integrity(database=database)
        self._target_db = database

    @property
    def initial_plan_generator(self) -> Optional[CompleteOptimizationAlgorithm]:
        """Strategy to construct the first physical query execution plan to start the incremental optimization.

        If no initial generator is selected, the initial plan will be derived from the optimizer of the target
        database.

        Returns
        -------
        Optional[CompleteOptimizationAlgorithm]
            The current initial generator.

        Raises
        ------
        UnsupportedSystemError
            If the initial generator does not work with the current `target_db`
        """
        return self._initial_plan_generator

    @initial_plan_generator.setter
    def initial_plan_generator(
        self, plan_generator: Optional[CompleteOptimizationAlgorithm]
    ) -> None:
        # Validate first, so that a rejected generator leaves the pipeline unchanged.
        self._ensure_pipeline_integrity(initial_plan_generator=plan_generator)
        self._initial_plan_generator = plan_generator

    def add_optimization_step(self, next_step: IncrementalOptimizationStep) -> Self:
        """Expands the optimization pipeline by another stage.

        The given step will be applied at the end of the pipeline. The very first optimization step receives an
        initial plan that has either been generated via the `initial_plan_generator` (if it has been set up), or by
        retrieving the query execution plan from the `target_db`.

        Parameters
        ----------
        next_step : IncrementalOptimizationStep
            The next optimization stage

        Returns
        -------
        self
            The current pipeline to allow for easy method-chaining.

        Raises
        ------
        UnsupportedSystemError
            If any of the optimization steps does not work with the target database
        """
        self._ensure_pipeline_integrity(additional_optimization_step=next_step)
        self._optimization_steps.append(next_step)
        return self

    def target_database(self) -> Database:
        """Provides the database system that the pipeline generates optimized queries for."""
        return self.target_db

    def query_execution_plan(self, query: SqlQuery) -> QueryPlan:
        """Computes the query plan by pushing an initial plan through all optimization steps.

        The initial plan is created by the `initial_plan_generator` if one has been set up, and by the optimizer of
        the `target_db` otherwise. Afterwards, each optimization step receives the plan of its predecessor.

        Parameters
        ----------
        query : SqlQuery
            The query to optimize

        Returns
        -------
        QueryPlan
            The plan produced by the last optimization step, or the initial plan if there are no steps

        Raises
        ------
        UnsupportedQueryError
            If one of the optimization algorithms is not compatible with the input query
        """
        self._ensure_supported_query(query)
        current_plan = (
            self.initial_plan_generator.optimize_query(query)
            if self.initial_plan_generator is not None
            else self.target_db.optimizer().query_plan(query)
        )
        for optimization_step in self._optimization_steps:
            current_plan = optimization_step.optimize_query(query, current_plan)
        return current_plan

    def describe(self) -> jsondict:
        """Summarizes the pipeline configuration as a dictionary.

        Returns
        -------
        jsondict
            The descriptions of the target database, the initial plan generator (``"native"`` if none has been
            set up) and all optimization steps in the order in which they are applied.
        """
        return {
            "name": "incremental_pipeline",
            "database_system": self._target_db.describe(),
            "initial_plan": (
                self._initial_plan_generator.describe()
                if self._initial_plan_generator is not None
                else "native"
            ),
            "steps": [step.describe() for step in self._optimization_steps],
        }

    def _ensure_pipeline_integrity(
        self,
        *,
        database: Optional[Database] = None,
        initial_plan_generator: Optional[CompleteOptimizationAlgorithm] = None,
        additional_optimization_step: Optional[IncrementalOptimizationStep] = None,
    ) -> None:
        """Checks that all selected optimization strategies work with the target database.

        This method should be called when individual parts of the pipeline have been updated. The updated parts are
        supplied as parameters. All other parameters are inferred from the current pipeline state.

        Parameters
        ----------
        database : Optional[Database], optional
            The new target database system if it has been updated, by default None
        initial_plan_generator : Optional[CompleteOptimizationAlgorithm], optional
            The new initial plan generator if it has been updated, by default None
        additional_optimization_step : Optional[IncrementalOptimizationStep], optional
            The next optimization step, if a new one has been added, by default None

        Raises
        ------
        UnsupportedSystemError
            If one of the optimization algorithms is not compatible with the target database
        """
        # None means "not updated" for both parameters, so the current pipeline state is used instead.
        database = self.target_db if database is None else database
        if initial_plan_generator is None:
            initial_plan_generator = self._initial_plan_generator

        # Order matters for which failure is reported first: initial generator, new step, existing steps.
        components = [
            initial_plan_generator,
            additional_optimization_step,
            *self._optimization_steps,
        ]
        for component in components:
            if component is None:
                continue
            # The pre-check is requested only once, so the None test and the actual check see the same object.
            pre_check = component.pre_check()
            if pre_check is None:
                continue
            pre_check.check_supported_database_system(database).ensure_all_passed(
                database
            )

    def _ensure_supported_query(self, query: SqlQuery) -> None:
        """Applies all relevant pre-checks to the input query.

        Parameters
        ----------
        query : SqlQuery
            The input query

        Raises
        ------
        UnsupportedQueryError
            If one of the optimization algorithms is not compatible with the input query
        """
        components = [self._initial_plan_generator, *self._optimization_steps]
        for component in components:
            if component is None:
                continue
            # The pre-check is requested only once, so the None test and the actual check see the same object.
            pre_check = component.pre_check()
            if pre_check is None:
                continue
            pre_check.check_supported_query(query).ensure_all_passed(query)

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        opt_chain = " -> ".join(str(comp) for comp in self._optimization_steps)
        # The label used to read "MultiStageOptimization", which made both pipeline types indistinguishable.
        return f"IncrementalOptimization [{opt_chain}]"
1067
+
1068
+
1069
class OptimizationSettings(Protocol):
    """Bundles the components of an optimization scenario so that they can be requested from a single place.

    Every component is optional. The default implementations provided here all return ``None``, which signals that
    the scenario does not provide the respective component.
    """

    def query_pre_check(self) -> Optional[OptimizationPreCheck]:
        """Provides the pre-check that queries have to pass in this scenario.

        Returns
        -------
        Optional[OptimizationPreCheck]
            The required pre-check. ``None`` (the default) if the scenario does not need one.
        """
        return None

    def build_complete_optimizer(self) -> Optional[CompleteOptimizationAlgorithm]:
        """Provides the algorithm that handles the entire optimization on its own.

        Returns
        -------
        Optional[CompleteOptimizationAlgorithm]
            The complete optimization algorithm. ``None`` (the default) if the scenario does not include one.
        """
        return None

    def build_join_order_optimizer(self) -> Optional[JoinOrderOptimization]:
        """Provides the algorithm that determines the join order.

        Returns
        -------
        Optional[JoinOrderOptimization]
            The join ordering strategy. ``None`` (the default) if the scenario does not optimize the join order.
        """
        return None

    def build_physical_operator_selection(self) -> Optional[PhysicalOperatorSelection]:
        """Provides the algorithm that assigns the physical operators.

        Returns
        -------
        Optional[PhysicalOperatorSelection]
            The operator selection strategy. ``None`` (the default) if the scenario does not optimize the physical
            operators.
        """
        return None

    def build_plan_parameterization(self) -> Optional[ParameterGeneration]:
        """Provides the algorithm that further parameterizes the query plan.

        Returns
        -------
        Optional[ParameterGeneration]
            The parameterization strategy. ``None`` (the default) if the scenario does not include such a stage.
        """
        return None

    def build_incremental_optimizer(self) -> Optional[IncrementalOptimizationStep]:
        """Provides the optimization step for incremental optimization.

        Returns
        -------
        Optional[IncrementalOptimizationStep]
            The incremental optimization step. ``None`` (the default) if the scenario does not include one.
        """
        return None