PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. postbound/__init__.py +211 -0
  2. postbound/_base.py +6 -0
  3. postbound/_bench.py +1012 -0
  4. postbound/_core.py +1153 -0
  5. postbound/_hints.py +1373 -0
  6. postbound/_jointree.py +1079 -0
  7. postbound/_pipelines.py +1121 -0
  8. postbound/_qep.py +1986 -0
  9. postbound/_stages.py +876 -0
  10. postbound/_validation.py +734 -0
  11. postbound/db/__init__.py +72 -0
  12. postbound/db/_db.py +2348 -0
  13. postbound/db/_duckdb.py +785 -0
  14. postbound/db/mysql.py +1195 -0
  15. postbound/db/postgres.py +4216 -0
  16. postbound/experiments/__init__.py +12 -0
  17. postbound/experiments/analysis.py +674 -0
  18. postbound/experiments/benchmarking.py +54 -0
  19. postbound/experiments/ceb.py +877 -0
  20. postbound/experiments/interactive.py +105 -0
  21. postbound/experiments/querygen.py +334 -0
  22. postbound/experiments/workloads.py +980 -0
  23. postbound/optimizer/__init__.py +92 -0
  24. postbound/optimizer/__init__.pyi +73 -0
  25. postbound/optimizer/_cardinalities.py +369 -0
  26. postbound/optimizer/_joingraph.py +1150 -0
  27. postbound/optimizer/dynprog.py +1825 -0
  28. postbound/optimizer/enumeration.py +432 -0
  29. postbound/optimizer/native.py +539 -0
  30. postbound/optimizer/noopt.py +54 -0
  31. postbound/optimizer/presets.py +147 -0
  32. postbound/optimizer/randomized.py +650 -0
  33. postbound/optimizer/tonic.py +1479 -0
  34. postbound/optimizer/ues.py +1607 -0
  35. postbound/qal/__init__.py +343 -0
  36. postbound/qal/_qal.py +9678 -0
  37. postbound/qal/formatter.py +1089 -0
  38. postbound/qal/parser.py +2344 -0
  39. postbound/qal/relalg.py +4257 -0
  40. postbound/qal/transform.py +2184 -0
  41. postbound/shortcuts.py +70 -0
  42. postbound/util/__init__.py +46 -0
  43. postbound/util/_errors.py +33 -0
  44. postbound/util/collections.py +490 -0
  45. postbound/util/dataframe.py +71 -0
  46. postbound/util/dicts.py +330 -0
  47. postbound/util/jsonize.py +68 -0
  48. postbound/util/logging.py +106 -0
  49. postbound/util/misc.py +168 -0
  50. postbound/util/networkx.py +401 -0
  51. postbound/util/numbers.py +438 -0
  52. postbound/util/proc.py +107 -0
  53. postbound/util/stats.py +37 -0
  54. postbound/util/system.py +48 -0
  55. postbound/util/typing.py +35 -0
  56. postbound/vis/__init__.py +5 -0
  57. postbound/vis/fdl.py +69 -0
  58. postbound/vis/graphs.py +48 -0
  59. postbound/vis/optimizer.py +538 -0
  60. postbound/vis/plots.py +84 -0
  61. postbound/vis/tonic.py +70 -0
  62. postbound/vis/trees.py +105 -0
  63. postbound-0.19.0.dist-info/METADATA +355 -0
  64. postbound-0.19.0.dist-info/RECORD +67 -0
  65. postbound-0.19.0.dist-info/WHEEL +5 -0
  66. postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
  67. postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/__init__.py ADDED
@@ -0,0 +1,211 @@
1
+ """PostBOUND - A research framework for query optimization in relational database systems.
2
+
3
+ On a high level, PostBOUND is designed for the following workflow: different optimization strategies - so-called optimization
4
+ _stages_ - are implemented according to specific interfaces. For example, there are stages that select the join order for an
5
+ input query, or stages that compute the cardinality of intermediate results. The different stages are combined in an
6
+ `OptimizationPipeline`. Most of the time, the pipeline applies the strategies to each input query in a sequential manner, one
7
+ optimization stage after another. The result of the optimization strategies is an abstract description of the optimization
8
+ decisions, e.g. which join order to use or how a specific join should be performed. Lastly, the pipeline ensures that the
9
+ selected optimization decisions are actually enforced when executing the query and provides an optimized version of the input
10
+ query. Notice that PostBOUND does not interfere with the native optimizer of different database systems directly, since that
11
+ would involve a lot of complicated and error-prone code, if it is possible at all. Instead, PostBOUND follows an indirect
12
+ approach and makes use of the fact that many database systems (and specifically all of the supported systems) incorporate
13
+ proprietary extensions to the SQL standard - mostly in the form of so-called *hint blocks*. These are SQL comments that can be
14
+ placed at specific places in the query and have special syntax which tells the optimizer to modify the query execution plan in
15
+ a certain way. Alternatively, some systems also interpret certain SQL constructs differently and disable certain optimization
16
+ features for them. E.g. by using the explicit *JOIN ON* syntax instead of specifying joins implicitly through *FROM* and
17
+ *WHERE* clauses, join order optimization can be disabled for some systems. These system-specific properties are utilized by
18
+ PostBOUND to enforce the selected query execution plan at runtime.
19
+
20
+ In addition to the optimization pipeline, PostBOUND provides a lot of infrastructure to aid in the common tasks of (research
21
+ in) query optimization. For example, PostBOUND introduces a high-level query abstraction and provides utilities to parse SQL
22
+ queries, apply transformations to them or to access their predicates. Furthermore, a unified interface for different database
23
+ systems, e.g. regarding statistics or query plans, allows optimization algorithms to focus on their actual optimization
24
+ problem. Likewise, utilities for workloads and benchmarking as well as pre-defined optimization strategies ensure that
25
+ evaluations take place on a reproducible foundation.
26
+
27
+ On a high level, the PostBOUND project is structured as follows:
28
+
29
+ - this module contains the actual optimization pipelines and their optimization stages
30
+ - the `optimizer` package provides the basic data structures that encode optimization decisions (e.g. selected physical
31
+ operators) as well as general utilities (e.g. a join graph abstraction). Notice that the `optimizer` package is accessible
32
+ under the `opt` alias.
33
+ - the `qal` package provides the query abstraction used throughout PostBOUND, as well as logic to parse and transform
34
+ query instances
35
+ - the `db` package contains all parts of PostBOUND that concern database interaction. That includes retrieving data
36
+ from different database systems, as well as generating queries for execution based on the optimization decisions
37
+ - the `workloads` package provides utilities to load benchmark queries, measure the execution time of different optimization
38
+ pipelines on those benchmarks and offers quick access to some popular workloads
39
+ - the `util` package contains algorithms and types that do not belong to specific parts of PostBOUND and are more
40
+ general in nature
41
+ - the `vis` package also contains a number of utilities, but with a strict focus on the visualization of different objects that
42
+ are frequently encountered in the optimization context (such as join trees, query execution plans and join orders). This
43
+ package should only be used if PostBOUND has been installed with visualization support and has to be imported explicitly.
44
+
45
+ To get a general idea of how to work with PostBOUND and where to start, please take a look at the README and the example
46
+ scripts.
47
+ Most of the modules are available directly from the main package, so generally you just need to ``import postbound as pb``.
48
+ In some cases (e.g. for pre-defined optimization strategies), explicit imports are required. This is described in detail in the
49
+ documentation of the respective modules.
50
+
51
+
52
+ Optimization pipeline
53
+ ---------------------
54
+
55
+ PostBOUND does not provide a single pipeline implementation. Rather, different pipeline types exist to accommodate
56
+ different use-cases. See the documentation of the general `OptimizationPipeline` base class for details. That class serves as
57
+ the smallest common denominator among all pipeline implementations. Based on the general interface, the most commonly used
58
+ pipelines are the `TextBookOptimizationPipeline` and the `MultiStageOptimizationPipeline`. The former models an optimizer based
59
+ on the traditional architecture of cost-based optimizers (i.e. plan enumerator, cost model and cardinality estimator). The
60
+ latter first computes a join order and afterwards selects the physical operators for this join order. The resulting plan can be
61
+ further parameterized, e.g. using cardinality estimates. Importantly, the `MultiStageOptimizationPipeline` allows leaving some
62
+ of the stages empty, which forces the native query optimizer to "fill the gaps" with its own policies. For example, one might
63
+ only compute a join order along with the cardinality estimates. The target optimizer would then select the physical operators
64
+ based on its own cost model, but using the cardinality estimates in place of its own estimation procedures.
65
+
66
+ To develop custom optimization algorithms and make use of PostBOUND's pipeline abstraction, the optimization stages are the
67
+ interfaces that need to be implemented. They specify the basic interface that pipelines expect and provide additional
68
+ information about the selected optimization strategies. Depending on the specific pipeline type, different stages have to be
69
+ implemented and each pipeline can require a different number of steps that need to be applied in a different order. Refer to
70
+ the documentation of the respective pipelines for details.
71
+
72
+
73
+ General Workflow
74
+ ----------------
75
+
76
+ To implement an optimization strategy, the necessary pipelines as well as its stages need to be identified first. The stages
77
+ are designed as abstract interfaces that need to be implemented by the new algorithm. Secondly, a target database has to be
78
+ chosen. This is necessary for two reasons: database systems provide different functionality. Therefore, PostBOUND provides some
79
+ checks to ensure that the optimization decisions can actually be enforced on the selected database system. Furthermore,
80
+ remember that PostBOUND does not actually interfere with the native optimizer of a database system. Instead, it uses optimizer
81
+ hints to apply the optimization decisions during query execution. These hints are system-specific and the hint generation
82
+ process is also provided by the database system.
83
+
84
+ An end-to-end optimization scenario typically involves the following steps (some are carried out by PostBOUND automatically,
85
+ some require user input):
86
+
87
+ 1. obtaining a working database connection (see the `db` package)
88
+ 2. setting up the optimization pipeline by configuring the different stages (this is done by the user)
89
+ 3. building the pipeline
90
+ 4. loading a workload to optimize (see the `workloads` module)
91
+ 5. optimizing an input query (this is done by the pipeline)
92
+ 6. generating the appropriate plan metadata, mostly query hints (this is done by the pipeline and supported by the `db`
93
+ package)
94
+ 7. executing the input query with the optimization metadata (either manually or using the `executor` module)
95
+
96
+ """
97
+
98
+ from . import _bench as bench
99
+ from . import db, experiments, qal, util
100
+ from . import optimizer as opt
101
+ from ._core import (
102
+ Cardinality,
103
+ ColumnReference,
104
+ Cost,
105
+ IntermediateOperator,
106
+ JoinOperator,
107
+ PhysicalOperator,
108
+ ScanOperator,
109
+ TableReference,
110
+ )
111
+ from ._hints import (
112
+ PhysicalOperatorAssignment,
113
+ PlanParameterization,
114
+ )
115
+ from ._jointree import LogicalJoinTree
116
+ from ._pipelines import (
117
+ IncrementalOptimizationPipeline,
118
+ IntegratedOptimizationPipeline,
119
+ MultiStageOptimizationPipeline,
120
+ OptimizationPipeline,
121
+ OptimizationSettings,
122
+ TextBookOptimizationPipeline,
123
+ )
124
+ from ._qep import (
125
+ PlanEstimates,
126
+ PlanMeasures,
127
+ PlanParams,
128
+ QueryPlan,
129
+ SortKey,
130
+ Subplan,
131
+ )
132
+ from ._stages import (
133
+ CardinalityEstimator,
134
+ CompleteOptimizationAlgorithm,
135
+ CostModel,
136
+ IncrementalOptimizationStep,
137
+ JoinOrderOptimization,
138
+ OptimizationPreCheck,
139
+ OptimizationStage,
140
+ ParameterGeneration,
141
+ PhysicalOperatorSelection,
142
+ PlanEnumerator,
143
+ as_complete_algorithm,
144
+ )
145
+ from ._validation import PreCheckResult, UnsupportedQueryError, UnsupportedSystemError
146
+ from .db import _duckdb as duckdb
147
+ from .db import postgres
148
+ from .db._db import Database
149
+ from .experiments import analysis, workloads
150
+ from .qal import relalg, transform
151
+ from .qal._qal import SqlQuery
152
+ from .qal.parser import parse_query
153
+
154
+ __version__ = "0.19.0"
155
+
156
+ __all__ = [  # Names exported by ``from postbound import *``; each entry should be bound by an import in this module.
157
+ "db",
158
+ "opt",
159
+ "qal",
160
+ "experiments",
161
+ "util",
162
+ "Cost",
163
+ "Cardinality",
164
+ "TableReference",
165
+ "ColumnReference",
166
+ "PhysicalOperator",
167
+ "ScanOperator",
168
+ "JoinOperator",
169
+ "IntermediateOperator",
170
+ "OptimizationPipeline",
171
+ "CompleteOptimizationAlgorithm",
172
+ "IntegratedOptimizationPipeline",
173
+ "CardinalityEstimator",
174
+ "CostModel",
175
+ "PlanEnumerator",
176
+ "TextBookOptimizationPipeline",
177
+ "JoinOrderOptimization",
178
+ "PhysicalOperatorSelection",
179
+ "ParameterGeneration",
180
+ "MultiStageOptimizationPipeline",
181
+ "IncrementalOptimizationStep",
182
+ "IncrementalOptimizationPipeline",
183
+ "as_complete_algorithm",
184
+ "OptimizationStage",
185
+ "UnsupportedQueryError",
186
+ "UnsupportedSystemError",
187
+ "OptimizationPreCheck",
188
+ "PreCheckResult",
189
+ "OptimizationSettings",
190
+ "Database",
191
+ "postgres",
192
+ "duckdb",
193
+ "relalg",
194
+ "transform",
195
+ "SqlQuery",
196
+ "parse_query",
197
+ "PlanEstimates",
198
+ "PlanMeasures",
199
+ "PlanParams",
200
+ "QueryPlan",
201
+ "SortKey",
202
+ "Subplan",
203
+ "LogicalJoinTree",
204
+ "PhysicalOperatorAssignment",
205
+ "PlanParameterization",
206
+ "_cardinalities",  # NOTE(review): no import in this module binds `_cardinalities` -- confirm star-imports do not fail on it
207
+ "validation",  # NOTE(review): the imports above only pull names from `._validation`; no `validation` name is bound -- TODO confirm
208
+ "analysis",
209
+ "workloads",
210
+ "bench",
211
+ ]
postbound/_base.py ADDED
@@ -0,0 +1,6 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TypeVar
4
+
5
+ T = TypeVar("T")
6
+ """Typed expressions use this generic type variable."""