PyPI - PostBOUND - Versions diffs - 0.19.0__py3-none-any.whl - Mend

PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

postbound/__init__.py +211 -0
postbound/_base.py +6 -0
postbound/_bench.py +1012 -0
postbound/_core.py +1153 -0
postbound/_hints.py +1373 -0
postbound/_jointree.py +1079 -0
postbound/_pipelines.py +1121 -0
postbound/_qep.py +1986 -0
postbound/_stages.py +876 -0
postbound/_validation.py +734 -0
postbound/db/__init__.py +72 -0
postbound/db/_db.py +2348 -0
postbound/db/_duckdb.py +785 -0
postbound/db/mysql.py +1195 -0
postbound/db/postgres.py +4216 -0
postbound/experiments/__init__.py +12 -0
postbound/experiments/analysis.py +674 -0
postbound/experiments/benchmarking.py +54 -0
postbound/experiments/ceb.py +877 -0
postbound/experiments/interactive.py +105 -0
postbound/experiments/querygen.py +334 -0
postbound/experiments/workloads.py +980 -0
postbound/optimizer/__init__.py +92 -0
postbound/optimizer/__init__.pyi +73 -0
postbound/optimizer/_cardinalities.py +369 -0
postbound/optimizer/_joingraph.py +1150 -0
postbound/optimizer/dynprog.py +1825 -0
postbound/optimizer/enumeration.py +432 -0
postbound/optimizer/native.py +539 -0
postbound/optimizer/noopt.py +54 -0
postbound/optimizer/presets.py +147 -0
postbound/optimizer/randomized.py +650 -0
postbound/optimizer/tonic.py +1479 -0
postbound/optimizer/ues.py +1607 -0
postbound/qal/__init__.py +343 -0
postbound/qal/_qal.py +9678 -0
postbound/qal/formatter.py +1089 -0
postbound/qal/parser.py +2344 -0
postbound/qal/relalg.py +4257 -0
postbound/qal/transform.py +2184 -0
postbound/shortcuts.py +70 -0
postbound/util/__init__.py +46 -0
postbound/util/_errors.py +33 -0
postbound/util/collections.py +490 -0
postbound/util/dataframe.py +71 -0
postbound/util/dicts.py +330 -0
postbound/util/jsonize.py +68 -0
postbound/util/logging.py +106 -0
postbound/util/misc.py +168 -0
postbound/util/networkx.py +401 -0
postbound/util/numbers.py +438 -0
postbound/util/proc.py +107 -0
postbound/util/stats.py +37 -0
postbound/util/system.py +48 -0
postbound/util/typing.py +35 -0
postbound/vis/__init__.py +5 -0
postbound/vis/fdl.py +69 -0
postbound/vis/graphs.py +48 -0
postbound/vis/optimizer.py +538 -0
postbound/vis/plots.py +84 -0
postbound/vis/tonic.py +70 -0
postbound/vis/trees.py +105 -0
postbound-0.19.0.dist-info/METADATA +355 -0
postbound-0.19.0.dist-info/RECORD +67 -0
postbound-0.19.0.dist-info/WHEEL +5 -0
postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
postbound-0.19.0.dist-info/top_level.txt +1 -0

postbound/__init__.py ADDED Viewed

@@ -0,0 +1,211 @@
+"""PostBOUND - A research framework for query optimization in relational database systems.
+On a high level, PostBOUND is designed for the following workflow: different optimization strategies - so-called optimization
+_stages_ - are implemented according to specific interfaces. For example, there are stages that select the join order for an
+input query, or stages that compute the cardinality of intermediate results. The different stages are combined in an
+`OptimizationPipeline`. Most of the time, the pipeline applies the strategies to each input query in a sequential manner, one
+optimization stage after another. The result of the optimization strategies is an abstract description of the optimization
+decisions, e.g. which join order to use or how a specific join should be performed. Lastly, the pipeline ensures that the
+selected optimization decisions are actually enforced when executing the query and provides an optimized version of the input
+query.  Notice that PostBOUND does not interfere with the native optimizer of different database systems directly, since that
+would involve a lot of complicated and error-prone code, if it is possible at all. Instead, PostBOUND follows an indirect
+approach and makes use of the fact that many database systems (and specifically all of the supported systems) incorporate
+proprietary extensions to the SQL standard - mostly in the form of so-called *hint blocks*. These are SQL comments that can be
+placed at specific places in the query and have special syntax which tells the optimizer to modify the query execution plan in
+a certain way. Alternatively, some systems also interpret certain SQL constructs differently and disable certain optimization
+features for them. E.g. by using the explicit *JOIN ON* syntax instead of specifying joins implicitly through *FROM* and
+*WHERE* clauses, join order optimization can be disabled for some systems. These system-specific properties are utilized by
+PostBOUND to enforce the selected query execution plan at runtime.
+In addition to the optimization pipeline, PostBOUND provides a lot of infrastructure to aid in the common tasks of (research
+in) query optimization. For example, PostBOUND introduces a high-level query abstraction and provides utilities to parse SQL
+queries, apply transformations to them or to access their predicates. Furthermore, a unified interface for different database
+systems, e.g. regarding statistics or query plans, allows optimization algorithms to focus on their actual optimization
+problem. Likewise, utilities for workloads and benchmarking as well as pre-defined optimization strategies ensure that
+evaluations take place on a reproducible foundation.
+On a high level, the PostBOUND project is structured as follows:
+- this module contains the actual optimization pipelines and their optimization stages
+- the `optimizer` package provides the basic data structures that encode optimization decisions (e.g. selected physical
+  operators) as well as general utilities (e.g. a join graph abstraction). Notice that the `optimizer` package is accessible
+  under the `opt` alias.
+- the `qal` package provides the query abstraction used throughout PostBOUND, as well as logic to parse and transform
+  query instances
+- the `db` package contains all parts of PostBOUND that concern database interaction. That includes retrieving data
+  from different database systems, as well as generating queries for execution based on the optimization decisions
+- the `workloads` package provides utilities to load benchmark queries, measure the execution time of different optimization
+  pipelines on those benchmarks and offers quick access to some popular workloads
+- the `util` package contains algorithms and types that do not belong to specific parts of PostBOUND and are more
+  general in nature
+- the `vis` package also contains a number of utilities, but with a strict focus on the visualization of different objects that
+  are frequently encountered in the optimization context (such as join trees, query execution plans and join orders). This
+  package should only be used if PostBOUND has been installed with visualization support and has to be imported explicitly.
+To get a general idea of how to work with PostBOUND and where to start, please take a look at the README and the example
+scripts.
+Most of the modules are available directly from the main package, so generally you just need to ``import postbound as pb``.
+In some cases (e.g. for pre-defined optimization strategies), explicit imports are required. This is described in detail in the
+documentation of the respective modules.
+Optimization pipeline
+---------------------
+PostBOUND does not provide a single pipeline implementation. Rather, different pipeline types exist to accommodate
+different use-cases. See the documentation of the general `OptimizationPipeline` base class for details. That class serves as
+the smallest common denominator among all pipeline implementations. Based on the general interface, the most commonly used
+pipelines are the `TextBookOptimizationPipeline` and the `MultiStageOptimizationPipeline`. The former models an optimizer based
+on the traditional architecture of cost-based optimizers (i.e. plan enumerator, cost model and cardinality estimator). The
+latter first computes a join order and afterwards selects the physical operators for this join order. The resulting plan can be
+further parameterized, e.g. using cardinality estimates. Importantly, the `MultiStageOptimizationPipeline` allows leaving some
+of the stages empty, which forces the native query optimizer to "fill the gaps" with its own policies. For example, one might
+only compute a join order along with the cardinality estimates. The target optimizer would then select the physical operators
+based on its own cost model, but using the cardinality estimates in place of its own estimation procedures.
+To develop custom optimization algorithms and make use of PostBOUND's pipeline abstraction, the optimization stages are the
+interfaces that need to be implemented. They specify the basic interface that pipelines expect and provide additional
+information about the selected optimization strategies. Depending on the specific pipeline type, different stages have to be
+implemented and each pipeline can require a different number of steps that need to be applied in a different order. Refer to
+the documentation of the respective pipelines for details.
+General Workflow
+----------------
+To implement an optimization strategy, the necessary pipelines as well as its stages need to be identified first. The stages
+are designed as abstract interfaces that need to be implemented by the new algorithm. Secondly, a target database has to be
+chosen. This is necessary for two reasons: database systems provide different functionality. Therefore, PostBOUND provides some
+checks to ensure that the optimization decisions can actually be enforced on the selected database system. Furthermore,
+remember that PostBOUND does not actually interfere with the native optimizer of a database system. Instead, it uses optimizer
+hints to apply the optimization decisions during query execution. These hints are system-specific and the hint generation
+process is also provided by the database system.
+An end-to-end optimization scenario typically involves the following steps (some are carried out by PostBOUND automatically,
+some require user input):
+1. obtaining a working database connection (see the `db` package)
+2. setting up the optimization pipeline by configuring the different stages (this is done by the user)
+3. building the pipeline
+4. loading a workload to optimize (see the `workloads` module)
+5. optimizing an input query (this is done by the pipeline)
+6. generating the appropriate plan metadata, mostly query hints (this is done by the pipeline and supported by the `db`
+   package)
+7. executing the input query with the optimization metadata (either manually or using the `executor` module)
+"""
+from . import _bench as bench
+from . import db, experiments, qal, util
+from . import optimizer as opt
+from ._core import (
+    Cardinality,
+    ColumnReference,
+    Cost,
+    IntermediateOperator,
+    JoinOperator,
+    PhysicalOperator,
+    ScanOperator,
+    TableReference,
+)
+from ._hints import (
+    PhysicalOperatorAssignment,
+    PlanParameterization,
+)
+from ._jointree import LogicalJoinTree
+from ._pipelines import (
+    IncrementalOptimizationPipeline,
+    IntegratedOptimizationPipeline,
+    MultiStageOptimizationPipeline,
+    OptimizationPipeline,
+    OptimizationSettings,
+    TextBookOptimizationPipeline,
+)
+from ._qep import (
+    PlanEstimates,
+    PlanMeasures,
+    PlanParams,
+    QueryPlan,
+    SortKey,
+    Subplan,
+)
+from ._stages import (
+    CardinalityEstimator,
+    CompleteOptimizationAlgorithm,
+    CostModel,
+    IncrementalOptimizationStep,
+    JoinOrderOptimization,
+    OptimizationPreCheck,
+    OptimizationStage,
+    ParameterGeneration,
+    PhysicalOperatorSelection,
+    PlanEnumerator,
+    as_complete_algorithm,
+)
+from ._validation import PreCheckResult, UnsupportedQueryError, UnsupportedSystemError
+from .db import _duckdb as duckdb
+from .db import postgres
+from .db._db import Database
+from .experiments import analysis, workloads
+from .qal import relalg, transform
+from .qal._qal import SqlQuery
+from .qal.parser import parse_query
+__version__ = "0.19.0"
+__all__ = [
+    "db",
+    "opt",
+    "qal",
+    "experiments",
+    "util",
+    "Cost",
+    "Cardinality",
+    "TableReference",
+    "ColumnReference",
+    "PhysicalOperator",
+    "ScanOperator",
+    "JoinOperator",
+    "IntermediateOperator",
+    "OptimizationPipeline",
+    "CompleteOptimizationAlgorithm",
+    "IntegratedOptimizationPipeline",
+    "CardinalityEstimator",
+    "CostModel",
+    "PlanEnumerator",
+    "TextBookOptimizationPipeline",
+    "JoinOrderOptimization",
+    "PhysicalOperatorSelection",
+    "ParameterGeneration",
+    "MultiStageOptimizationPipeline",
+    "IncrementalOptimizationStep",
+    "IncrementalOptimizationPipeline",
+    "as_complete_algorithm",
+    "OptimizationStage",
+    "UnsupportedQueryError",
+    "UnsupportedSystemError",
+    "OptimizationPreCheck",
+    "PreCheckResult",
+    "OptimizationSettings",
+    "Database",
+    "postgres",
+    "duckdb",
+    "relalg",
+    "transform",
+    "SqlQuery",
+    "parse_query",
+    "PlanEstimates",
+    "PlanMeasures",
+    "PlanParams",
+    "QueryPlan",
+    "SortKey",
+    "Subplan",
+    "LogicalJoinTree",
+    "PhysicalOperatorAssignment",
+    "PlanParameterization",
+    "_cardinalities",
+    "validation",
+    "analysis",
+    "workloads",
+    "bench",
+]

postbound/_base.py ADDED Viewed

@@ -0,0 +1,6 @@
+from __future__ import annotations
+from typing import TypeVar
+T = TypeVar("T")  # declared without a bound or constraints
+"""Typed expressions use this generic type variable."""