PostBOUND 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- postbound/__init__.py +211 -0
- postbound/_base.py +6 -0
- postbound/_bench.py +1012 -0
- postbound/_core.py +1153 -0
- postbound/_hints.py +1373 -0
- postbound/_jointree.py +1079 -0
- postbound/_pipelines.py +1121 -0
- postbound/_qep.py +1986 -0
- postbound/_stages.py +876 -0
- postbound/_validation.py +734 -0
- postbound/db/__init__.py +72 -0
- postbound/db/_db.py +2348 -0
- postbound/db/_duckdb.py +785 -0
- postbound/db/mysql.py +1195 -0
- postbound/db/postgres.py +4216 -0
- postbound/experiments/__init__.py +12 -0
- postbound/experiments/analysis.py +674 -0
- postbound/experiments/benchmarking.py +54 -0
- postbound/experiments/ceb.py +877 -0
- postbound/experiments/interactive.py +105 -0
- postbound/experiments/querygen.py +334 -0
- postbound/experiments/workloads.py +980 -0
- postbound/optimizer/__init__.py +92 -0
- postbound/optimizer/__init__.pyi +73 -0
- postbound/optimizer/_cardinalities.py +369 -0
- postbound/optimizer/_joingraph.py +1150 -0
- postbound/optimizer/dynprog.py +1825 -0
- postbound/optimizer/enumeration.py +432 -0
- postbound/optimizer/native.py +539 -0
- postbound/optimizer/noopt.py +54 -0
- postbound/optimizer/presets.py +147 -0
- postbound/optimizer/randomized.py +650 -0
- postbound/optimizer/tonic.py +1479 -0
- postbound/optimizer/ues.py +1607 -0
- postbound/qal/__init__.py +343 -0
- postbound/qal/_qal.py +9678 -0
- postbound/qal/formatter.py +1089 -0
- postbound/qal/parser.py +2344 -0
- postbound/qal/relalg.py +4257 -0
- postbound/qal/transform.py +2184 -0
- postbound/shortcuts.py +70 -0
- postbound/util/__init__.py +46 -0
- postbound/util/_errors.py +33 -0
- postbound/util/collections.py +490 -0
- postbound/util/dataframe.py +71 -0
- postbound/util/dicts.py +330 -0
- postbound/util/jsonize.py +68 -0
- postbound/util/logging.py +106 -0
- postbound/util/misc.py +168 -0
- postbound/util/networkx.py +401 -0
- postbound/util/numbers.py +438 -0
- postbound/util/proc.py +107 -0
- postbound/util/stats.py +37 -0
- postbound/util/system.py +48 -0
- postbound/util/typing.py +35 -0
- postbound/vis/__init__.py +5 -0
- postbound/vis/fdl.py +69 -0
- postbound/vis/graphs.py +48 -0
- postbound/vis/optimizer.py +538 -0
- postbound/vis/plots.py +84 -0
- postbound/vis/tonic.py +70 -0
- postbound/vis/trees.py +105 -0
- postbound-0.19.0.dist-info/METADATA +355 -0
- postbound-0.19.0.dist-info/RECORD +67 -0
- postbound-0.19.0.dist-info/WHEEL +5 -0
- postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
- postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/db/_db.py
ADDED
@@ -0,0 +1,2348 @@
"""This module provides PostBOUND's basic interaction with databases.

More specifically, this includes

- an interface to interact with databases (the `Database` interface)
- an interface to retrieve schema information (the `DatabaseSchema` interface)
- an interface to obtain different table-level and column-level statistics (the `DatabaseStatistics` interface)
- an interface to modify queries such that optimization decisions are respected during the actual query execution (the
  `HintService` interface)
- an interface to access information of the native optimizer of the database system (the `OptimizerInterface` class)
- a utility to easily obtain database connections (the `DatabasePool` singleton class).

Take a look at the central `Database` class for more details. All concrete database systems need to implement this
interface.
"""

from __future__ import annotations

import abc
import atexit
import collections
import json
import os
import textwrap
import warnings
from collections.abc import Iterable, Sequence
from datetime import date, datetime, time, timedelta
from typing import Any, Optional, Protocol, Type, runtime_checkable

import networkx as nx

from .. import util
from .._core import (
    Cardinality,
    ColumnReference,
    Cost,
    TableReference,
    UnboundColumnError,
    VirtualTableError,
)
from .._hints import (
    HintType,
    PhysicalOperator,
    PhysicalOperatorAssignment,
    PlanParameterization,
)
from .._jointree import JoinTree
from .._qep import QueryPlan
from ..qal._qal import SqlQuery

ResultRow = tuple
"""Simple type alias to denote a single tuple from a result set."""

ResultSet = Sequence[ResultRow]
"""Simple type alias to denote the result relation of a query."""

class Cursor(Protocol):
    """Interface for database cursors that adhere to the Python Database API specification.

    This is not a complete representation and only focuses on the parts of the specification that are important for
    PostBOUND right now. In the future, additional methods might get added.

    This type is only intended to denote the expected return type of certain methods; the cursors themselves are
    supplied by the respective database integrations. There should be no need to implement one manually and all cursors
    should be compatible with this interface by default (since they are DB API 2.0 cursor objects).

    See PEP 249 for details (https://peps.python.org/pep-0249/)
    """

    @abc.abstractmethod
    def close(self) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def execute(
        self, operation: str, parameters: Optional[dict | Sequence] = None
    ) -> Optional[Cursor]:
        raise NotImplementedError

    @abc.abstractmethod
    def fetchone(self) -> Optional[ResultRow]:
        raise NotImplementedError

    @abc.abstractmethod
    def fetchall(self) -> Optional[ResultSet]:
        raise NotImplementedError


class Connection(Protocol):
    """Interface for database connections that adhere to the Python Database API specification.

    This is not a complete representation and only focuses on the parts of the specification that are important for
    PostBOUND right now. In the future, additional methods might get added.

    This type is only intended to denote the expected return type of certain methods; the connections themselves are
    supplied by the respective database integrations. There should be no need to implement one manually and all
    connections should be compatible with this interface by default (since they are DB API 2.0 connection objects).

    See PEP 249 for details (https://peps.python.org/pep-0249/)
    """

    @abc.abstractmethod
    def close(self) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def cursor(self) -> Cursor:
        raise NotImplementedError

@runtime_checkable
class PrewarmingSupport(Protocol):
    """Some databases might support adding specific tables to their shared buffer.

    If so, they should implement this protocol to allow other parts of the framework to exploit this feature.
    """

    @abc.abstractmethod
    def prewarm_tables(
        self,
        tables: Optional[TableReference | Iterable[TableReference]] = None,
        *more_tables: TableReference,
        exclude_table_pages: bool = False,
        include_primary_index: bool = True,
        include_secondary_indexes: bool = True,
    ) -> None:
        """Prepares the database buffer pool with tuples from specific tables.

        Parameters
        ----------
        tables : Optional[TableReference | Iterable[TableReference]], optional
            The tables that should be placed into the buffer pool
        *more_tables : TableReference
            More tables that should be placed into the buffer pool, enabling a more convenient usage of this method.
            See examples for details on the usage.
        exclude_table_pages : bool, optional
            Whether the table data (i.e. pages containing the actual tuples) should *not* be prewarmed. This is off by default,
            meaning that prewarming is applied to the data pages. This can be toggled on to only prewarm index pages (see
            `include_primary_index` and `include_secondary_indexes`).
        include_primary_index : bool, optional
            Whether the pages of the primary key index should also be prewarmed. Enabled by default.
        include_secondary_indexes : bool, optional
            Whether the pages for secondary indexes should also be prewarmed. Enabled by default.

        Notes
        -----
        If the database should prewarm more table pages than can be contained in the shared buffer, the actual contents of the
        pool are not specified. All prewarming tasks might happen sequentially, in which case the first prewarmed relations
        will typically be evicted and only the last relations (tables or indexes) are retained in the shared buffer. The
        precise order in which the prewarming tasks are executed is not specified and depends on the actual relations.

        Examples
        --------
        >>> database.prewarm_tables([table1, table2])
        >>> database.prewarm_tables(table1, table2)
        """
        ...

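# Illustrative sketch, not part of the package source: because the protocol is
# decorated with @runtime_checkable, prewarming support can be probed with a
# plain isinstance() test ("db", "table1" and "table2" are placeholders for a
# connected Database instance and two TableReferences).
if isinstance(db, PrewarmingSupport):
    db.prewarm_tables(table1, table2, include_secondary_indexes=False)
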
@runtime_checkable
class TimeoutSupport(Protocol):
    """Marks database systems that support executing queries with a timeout."""

    def execute_with_timeout(
        self, query: SqlQuery | str, *, timeout: float = 60.0
    ) -> Optional[ResultSet]:
        """Executes a query with a specific timeout.

        For query execution, we use the following rules in contrast to `Database.execute_query`:

        1. We never make use of the database interface's cache, even if the query is contained in the cache
        2. We never attempt to simplify the result set, even if this would be possible (e.g., for single-row result sets).
           This is more of a pragmatic decision to be able to indicate a timeout with *None* and to distinguish it from a
           valid result set of a single *NULL* tuple. Otherwise, we would have to resort to raising *TimeoutError* or similar
           strategies, which complicates the control flow for the caller.

        Parameters
        ----------
        query : SqlQuery | str
            The query to execute. If this contains hints or other special features, those will be treated normally.
        timeout : float, optional
            The timeout in seconds. If the query takes longer (including all special treatment of the database interface),
            it will be cancelled. Defaults to 60 seconds.

        Returns
        -------
        Optional[ResultSet]
            The result set of the query. If the query was cancelled, this will be *None*.
        """
        ...

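# Illustrative sketch, not part of the package source: a *None* return value is
# the timeout signal, so it has to be checked before the result set is used.
if isinstance(db, TimeoutSupport):
    result = db.execute_with_timeout(query, timeout=30.0)
    if result is None:
        print("Query was cancelled after 30 seconds")
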
@runtime_checkable
class StopwatchSupport(Protocol):
    """Marks the database systems that support measurement of query execution times."""

    def time_query(
        self, query: SqlQuery | str, *, timeout: Optional[float] = None
    ) -> float:
        """Determines the execution time of a query.

        The execution time is measured from the moment the query is passed to the internal cursor (i.e. including sending the
        query to the database server), until the execution is finished. Therefore, it does not include the time required to
        transfer the result set back to the client.

        Parameters
        ----------
        query : SqlQuery | str
            The query to execute.
        timeout : Optional[float], optional
            Cancels the query execution if it takes longer than this number (in seconds). Notice that this parameter requires
            timeout support from the database system.

        Returns
        -------
        float
            The runtime of the query in seconds. The result set is ignored.

        Raises
        ------
        UnsupportedDatabaseFeatureError
            If the database system does not support timeouts. You can use the `TimeoutSupport` protocol to check this
            beforehand.
        """
        ...

    def last_query_runtime(self) -> float:
        """Get the runtime of the last executed query.

        The execution time is measured from the moment the query is passed to the internal cursor (i.e. including sending the
        query to the database server), until the execution is finished. Therefore, it does not include the time required to
        transfer the result set back to the client.

        Returns
        -------
        float
            The runtime of the last executed query in seconds. If no query has been executed before, *NaN* is returned.
        """
        ...

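# Illustrative sketch, not part of the package source: timing a query discards
# its result set; the measurement remains available via last_query_runtime().
if isinstance(db, StopwatchSupport):
    runtime = db.time_query(query, timeout=300.0)
    print(f"Server-side runtime: {runtime:.3f}s (last: {db.last_query_runtime():.3f}s)")
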
class QueryCacheWarning(UserWarning):
    """Warning to indicate that the query result cache was not found."""

    def __init__(self, msg: str) -> None:
        super().__init__(msg)


def simplify_result_set(result_set: list[tuple[Any]]) -> Any:
    """Default implementation of the result set simplification logic outlined in `Database.execute_query`.

    Parameters
    ----------
    result_set : list[tuple[Any]]
        Result set to simplify: each entry in the list corresponds to one row in the result set and each component of the
        tuples corresponds to one column in the result set

    Returns
    -------
    Any
        The simplified result set: if the result set consists just of a single row, this row is unwrapped from the list. If the
        result set contains just a single column, this is unwrapped from the tuple. Both simplifications are also combined,
        such that a result set of a single row of a single column is turned into the single value.
    """
    # simplify the query result as much as possible: [(42, 24)] becomes (42, 24) and [(1,), (2,)] becomes [1, 2]
    # [(42, 24), (4.2, 2.4)] is left as-is
    if not result_set:
        return []

    result_structure = result_set[0]  # what do the result tuples look like?
    if len(result_structure) == 1:  # do we have just one column?
        result_set = [
            row[0] for row in result_set
        ]  # if it is just one column, unwrap it

    if len(result_set) == 1:  # if it is just one row, unwrap it
        return result_set[0]
    return result_set

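# Illustrative sketch, not part of the package source: the simplification rules
# applied to the possible result-set shapes.
assert simplify_result_set([(42,)]) == 42  # single row, single column
assert simplify_result_set([(42, 24)]) == (42, 24)  # single row
assert simplify_result_set([(1,), (2,)]) == [1, 2]  # single column
assert simplify_result_set([(42, 24), (4.2, 2.4)]) == [(42, 24), (4.2, 2.4)]
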
class _DBCacheJsonEncoder(json.JSONEncoder):
    def default(self, obj: Any) -> Any:
        if isinstance(obj, datetime):
            return {"$datetime": obj.isoformat()}
        elif isinstance(obj, date):
            return {"$date": obj.isoformat()}
        elif isinstance(obj, time):
            return {"$time": obj.isoformat()}
        elif isinstance(obj, timedelta):
            return {"$timedelta": obj.total_seconds()}
        return super().default(obj)


class _DBCacheJsonDecoder(json.JSONDecoder):
    def __init__(self, *args, **kwargs):
        # pop the hook so it is not passed to the superclass a second time
        self._second_hook = kwargs.pop("object_hook", None)
        super().__init__(object_hook=self.object_hook, *args, **kwargs)

    def object_hook(self, obj: Any) -> Any:
        if self._second_hook:
            return self._second_hook(obj)

        if "$datetime" in obj:
            return datetime.fromisoformat(obj["$datetime"])
        elif "$date" in obj:
            return date.fromisoformat(obj["$date"])
        elif "$time" in obj:
            return time.fromisoformat(obj["$time"])
        elif "$timedelta" in obj:
            return timedelta(seconds=obj["$timedelta"])
        return obj

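# Illustrative sketch, not part of the package source: temporal values survive a
# cache round trip through the encoder/decoder pair (tuples come back as lists,
# which is irrelevant for the cached result sets).
_stamp = datetime(2024, 1, 1, 12, 30)
_encoded = json.dumps({"q": [[_stamp]]}, cls=_DBCacheJsonEncoder)
assert json.loads(_encoded, cls=_DBCacheJsonDecoder)["q"][0][0] == _stamp
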
class Database(abc.ABC):
    """A `Database` is PostBOUND's logical abstraction of physical database management systems.

    It provides high-level access to internal functionality provided by such systems. More specifically, each
    `Database` instance supports the following functionality:

    - executing arbitrary SQL queries
    - retrieving schema information, most importantly about primary keys and foreign keys
    - accessing statistical information, such as most common values or the number of rows in a table
    - query formatting and generation of query hints to enforce optimizer decisions (join orders, operators, etc.)
    - introspection of the query optimizer to retrieve query execution plans, cost estimates, etc.

    Notice that all this information is by design read-only and functionality to write queries is intentionally not
    implemented (although one could issue `INSERT`/`UPDATE`/`DELETE` queries via the query execution functionality).

    This restriction to read-only information enables the caching of query results to provide them without running a
    query over and over again. This is achieved by storing the results of past queries in a special JSON file, which is
    read upon creation of the `Database` instance. If this behavior is not desired, it can simply be turned off
    globally via the `cache_enabled` property, or on a per-method-call basis by setting the corresponding parameter. If
    no such parameter is available, the specific method does not make use of the caching mechanic.

    Each database management system will need to implement this basic interface to enable PostBOUND to access the
    necessary information.

    Parameters
    ----------
    system_name : str
        The name of the database system for which the connection is established. This is only really important to
        distinguish different instances of the interface in a convenient manner.
    cache_enabled : bool, optional
        Whether complex queries that are executed against the database system should be cached. This is especially useful to
        emulate certain statistics that are not maintained by the specific database system (see `DatabaseStatistics` for
        details). If this is *False*, the query cache will not be loaded either. Defaults to *True*.

    Notes
    -----
    When the `__init__` method is called, the connection to the specific database system has to be established already,
    i.e. calling any of the public methods should provide a valid result. This is particularly important because this
    method takes care of the cache initialization. This initialization in turn relies on identifying the correct
    cache file, which in turn depends on the system name, system version and database name of the connection.
    """

    def __init__(self, system_name: str, *, cache_enabled: bool = True) -> None:
        self.system_name = system_name

        self._cache_enabled = cache_enabled
        self._query_cache: dict[str, ResultSet] = {}
        if self._cache_enabled:
            self._inflate_query_cache()
        atexit.register(self.close)

    @abc.abstractmethod
    def schema(self) -> DatabaseSchema:
        """Provides access to the underlying schema information of the database.

        Returns
        -------
        DatabaseSchema
            An object implementing the schema interface for the actual database system. This should normally be
            completely stateless.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def statistics(self) -> DatabaseStatistics:
        """Provides access to the current statistics of the database.

        Implementing generalized statistics for a framework that supports multiple different physical database systems
        is much more complicated than it might seem at first. Therefore, different modes for the statistics
        provisioning exist. These modes can be changed by setting the properties of the interface. See the
        documentation of `DatabaseStatistics` for more details.

        Repeated calls to this method are guaranteed to provide the same object. Therefore, changes to the statistics
        interface configuration are guaranteed to be persisted across multiple accesses to the statistics system.

        Returns
        -------
        DatabaseStatistics
            The statistics interface. Repeated calls to this method are guaranteed to provide the same object.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def hinting(self) -> HintService:
        """Provides access to the hint generation facilities for the current database system.

        Returns
        -------
        HintService
            The hinting service. This should normally be completely stateless.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def optimizer(self) -> OptimizerInterface:
        """Provides access to optimizer-related functionality of the database system.

        Returns
        -------
        OptimizerInterface
            The optimizer interface. This should normally be completely stateless.

        Raises
        ------
        UnsupportedDatabaseFeatureError
            If the database system does not provide any sort of external access to the optimizer.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def execute_query(
        self,
        query: SqlQuery | str,
        *,
        cache_enabled: Optional[bool] = None,
        raw: bool = False,
    ) -> Any:
        """Executes the given query and returns the associated result set.

        Parameters
        ----------
        query : SqlQuery | str
            The query to execute. If it contains a `Hint` with `preparatory_statements`, these will be executed
            beforehand. Notice that such statements are never subject to caching.
        cache_enabled : Optional[bool], optional
            Controls the caching behavior for just this one query. The default value of *None* indicates that the
            "global" configuration of the database system should be used. Setting this parameter to a boolean value
            forces or deactivates caching for the specific query for the specific execution no matter what the "global"
            configuration is.
        raw : bool, optional
            Whether the result set should be returned as-is. By default, the result set is simplified. Raw mode skips this
            step.

        Returns
        -------
        Any
            Result set of the input query. This is a list of equal-length tuples in the most general case. Each
            component of the tuple corresponds to a specific column of the result set and each tuple corresponds to a
            row in the result set. However, many queries do not provide a 2-dimensional result set (e.g. *COUNT(\\*)*
            queries). In such cases, the nested structure of the result set makes it quite cumbersome to use.
            Therefore, this method tries to simplify the return value of the query for more convenient use (if `raw` mode is
            disabled). More specifically, if the query returns just a single row, this row is returned directly as a tuple.
            Furthermore, if the query returns just a single column, the values of that column are returned directly in
            a list. Both simplifications will also be combined, such that a result set of a single row of a single
            value will be returned as that single value directly. In all other cases, the result will be a list
            consisting of the different result tuples.

        Notes
        -----
        This method is mainly intended to execute read-only SQL queries. In fact, the only types of SQL queries that
        can be modelled by the query abstraction layer are precisely such read-only queries. However, if one really
        needs to execute mutating queries, they can be issued as plain text. Just remember that this behavior is
        heavily discouraged!

        The precise behavior of this method depends on whether caching is enabled or not. If it is, the query will
        only be executed against the live database system if it is not in the cache. Otherwise, the result will simply
        be retrieved. Caching can be enabled/disabled for just this one query via the `cache_enabled` switch. If this
        is not specified, caching depends on the `cache_enabled` property.

        If caching should be used for this method, but is disabled at the database level, the current cache will still
        be read and persisted. This ensures that all cached queries are properly saved and none of the previous cache
        content is lost.
        """
        raise NotImplementedError

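# Illustrative sketch, not part of the package source: per-call cache control
# and result simplification on a concrete Database instance ("title" is an
# illustrative IMDB-style table).
n_movies = db.execute_query("SELECT COUNT(*) FROM title")  # simplified to an int
raw_rows = db.execute_query("SELECT COUNT(*) FROM title", raw=True)  # [(n,)]
fresh = db.execute_query(query, cache_enabled=False)  # bypass the cache once
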
    @abc.abstractmethod
    def database_name(self) -> str:
        """Provides the name of the (physical) database that the database interface is connected to.

        Returns
        -------
        str
            The database name, e.g. *imdb* or *tpc-h*
        """
        raise NotImplementedError

    def database_system_name(self) -> str:
        """Provides the name of the database management system that this interface is connected to.

        Returns
        -------
        str
            The database system name, e.g. *PostgreSQL*
        """
        return self.system_name

    @abc.abstractmethod
    def database_system_version(self) -> util.Version:
        """Returns the release version of the database management system that this interface is connected to.

        Returns
        -------
        util.Version
            The version
        """
        raise NotImplementedError

    @abc.abstractmethod
    def describe(self) -> dict:
        """Provides a representation of the current database connection as well as its system settings.

        This description is intended to transparently document which customizations have been applied, thereby giving
        an idea of how the default query execution might have been affected. It can be JSON-serialized and will be
        included in most of the output of the utilities in the `runner` module of the `experiments` package.

        Returns
        -------
        dict
            The actual description
        """
        raise NotImplementedError

    @abc.abstractmethod
    def reset_connection(self) -> None:
        """Obtains a new network connection for the database. Useful for debugging purposes or in case of crashes.

        Notice that resetting the connection can have unintended side-effects if other methods rely on the cursor
        object. After resetting, the former cursor object will probably no longer be valid. Therefore, this method
        should be used with caution.

        See Also
        --------
        Database.cursor
        """
        raise NotImplementedError

    def reset_cache(self) -> None:
        """Removes all results from the query cache. Useful for debugging purposes."""
        self._query_cache = {}

    @abc.abstractmethod
    def cursor(self) -> Cursor:
        """Provides a cursor to execute queries and iterate over result sets manually.

        Returns
        -------
        Cursor
            A cursor compatible with the Python DB API specification 2.0 (PEP 249). The specific cursor type depends on
            the concrete database implementation, however.

        References
        ----------

        .. Python DB API specification 2.0 (PEP 249): https://peps.python.org/pep-0249/
        """
        raise NotImplementedError

    @abc.abstractmethod
    def close(self) -> None:
        """Shuts down all currently open connections to the database."""
        raise NotImplementedError

    def provides(self, support: Type) -> bool:
        """Checks whether the database interface supports a specific protocol."""
        return isinstance(self, support)

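# Illustrative sketch, not part of the package source: provides() is a thin
# isinstance() wrapper over the optional capability protocols defined above.
capabilities = {
    proto.__name__: db.provides(proto)
    for proto in (PrewarmingSupport, TimeoutSupport, StopwatchSupport)
}
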
    def _get_cache_enabled(self) -> bool:
        """Getter for the `cache_enabled` property.

        Returns
        -------
        bool
            Whether caching is currently enabled
        """
        return self._cache_enabled

    def _set_cache_enabled(self, enabled: bool) -> None:
        """Setter for the `cache_enabled` property. Inflates the query cache if necessary.

        If the cache should be enabled now, but no cached data exists, the cache will be inflated from disk.

        Parameters
        ----------
        enabled : bool
            Whether caching should be enabled
        """
        if enabled and not self._query_cache:
            self._inflate_query_cache()
        self._cache_enabled = enabled

    cache_enabled = property(_get_cache_enabled, _set_cache_enabled)
    """Controls whether the results of executed queries should be cached to prevent future re-execution.

    If caching should be enabled later on and no cached data exists, the cache will be inflated from disk.
    """

    def _inflate_query_cache(self) -> None:
        """Tries to read the query cache for this database.

        This reads a JSON file that contains all cached queries and their result sets. It should not be edited
        manually.
        """
        if self._query_cache:
            return
        query_cache_name = self._query_cache_name()
        if os.path.isfile(query_cache_name):
            with open(query_cache_name, "r") as cache_file:
                try:
                    self._query_cache = json.load(cache_file, cls=_DBCacheJsonDecoder)
                except json.JSONDecodeError as e:
                    warnings.warn(
                        "Could not read query cache: " + str(e),
                        category=QueryCacheWarning,
                    )
                    self._query_cache = {}
        else:
            warnings.warn(
                f"Could not read query cache: File {query_cache_name} does not exist",
                category=QueryCacheWarning,
            )
            self._query_cache = {}
        atexit.register(self._store_query_cache, query_cache_name)

    def _store_query_cache(self, query_cache_name: str) -> None:
        """Stores the query cache into a JSON file.

        Parameters
        ----------
        query_cache_name : str
            The path to write the file to. If it exists, it will be overwritten.
        """
        with open(query_cache_name, "w") as cache_file:
            json.dump(self._query_cache, cache_file, cls=_DBCacheJsonEncoder)

    def _query_cache_name(self) -> str:
        """Provides a normalized file name for the query cache.

        Returns
        -------
        str
            The cache file name. It consists of the database system name, system version and the name of the database.
        """
        identifier = "_".join(
            [
                self.database_system_name(),
                self.database_system_version().formatted(prefix="v", separator="_"),
                self.database_name(),
            ]
        )
        return f".query_cache_{identifier}.json"

    def __hash__(self) -> int:
        return hash(self._query_cache_name())

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, type(self))
            and self._query_cache_name() == other._query_cache_name()
        )

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"{self.database_name()} @ {self.database_system_name()} ({self.database_system_version()})"

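# Illustrative sketch, not part of the package source: __eq__ and __hash__ both
# derive from the cache file name (system name, version, and database), so
# equivalent connections can serve as interchangeable dictionary keys.
benchmark_results: dict[Database, float] = {db: 1.5}
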
ForeignKeyRef = collections.namedtuple("ForeignKeyRef", ["fk_col", "referenced_col"])
"""
A foreign key reference has a foreign key column `fk_col` (the first element) that requires a matching value in the
`referenced_col` (the second element) of the target table.
"""


class DatabaseSchema(abc.ABC):
    """This interface provides access to different information about the logical structure of a database.

    In contrast to database statistics, schema information is much more standardized. PostBOUND therefore only takes on
    the role of a mediator to delegate requests for different parts of the schema to the appropriate - and sometimes
    system-specific - metadata catalogs of the database systems. For each kind of schema information a dedicated query
    method exists. Take a look at these methods to understand the functionality provided by the database schema
    interface.

    Parameters
    ----------
    db : Database
        The database for which the schema information should be read. This is required to obtain cursors that request
        the desired data.
    prep_placeholder : str, optional
        The placeholder that is used for prepared statements. Some systems use `?` as a placeholder, while others use *%s*
        (the default). This needs to be specified to ensure that the information_schema queries are correctly formatted.

    Notes
    -----
    **Hint for implementors:** the database schema contains no abstract methods that need to be overridden. All methods come
    with a default implementation that uses the *information_schema* to retrieve the necessary information. However, if the
    target database system does not support specific features of the information_schema, the corresponding methods need to be
    overridden to provide the necessary functionality. The documentation of each method details which parts of the
    information_schema it needs.
    """

    def __init__(self, db: Database, *, prep_placeholder: str = "%s"):
        self._db = db
        self._prep_placeholder = prep_placeholder

    def tables(self) -> set[TableReference]:
        """Fetches all user-defined tables that are contained in the current database.

        Returns
        -------
        set[TableReference]
            All tables in the current schema, including materialized views, etc.

        Notes
        -----
        **Hint for implementors:** the default implementation of this method relies on the *information_schema.tables* view.
        """
        query_template = textwrap.dedent(f"""
            SELECT table_name
            FROM information_schema.tables
            WHERE table_catalog = {self._prep_placeholder}
                AND table_schema = current_schema()
            """)
        self._db.cursor().execute(query_template, (self._db.database_name(),))
        result_set = self._db.cursor().fetchall()
        assert result_set is not None
        return set(TableReference(row[0]) for row in result_set)

    def columns(self, table: TableReference | str) -> Sequence[ColumnReference]:
        """Fetches all columns of the given table.

        Parameters
        ----------
        table : TableReference | str
            A table in the current schema

        Returns
        -------
        Sequence[ColumnReference]
            All columns for the given table. Columns are ordered according to their position in the table.
            Will be empty if the table is not found or does not contain any columns.

        Raises
        ------
        postbound.qal.VirtualTableError
            If the given table is virtual (e.g. subquery or CTE)

        Notes
        -----
        **Hint for implementors:** the default implementation of this method relies on the *information_schema.columns* view.
        """

        # The documentation of lookup_column() references an implementation detail of this method.
        # Make sure to keep the two in sync.

        table = table if isinstance(table, TableReference) else TableReference(table)
        if table.virtual:
            raise VirtualTableError(table)
        schema_placeholder = (
            self._prep_placeholder if table.schema else "current_schema()"
        )
        query_template = textwrap.dedent(f"""
            SELECT column_name
            FROM information_schema.columns
            WHERE table_name = {self._prep_placeholder}
                AND table_catalog = current_database()
                AND table_schema = {schema_placeholder}
            ORDER BY ordinal_position
            """)
        params = [table.full_name]
        if table.schema:
            params.append(table.schema)
        self._db.cursor().execute(query_template, params)
        result_set = self._db.cursor().fetchall()
        assert result_set is not None
        return [ColumnReference(row[0], table) for row in result_set]

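# Illustrative sketch, not part of the package source: enumerating a schema
# through the generic information_schema-based defaults ("db" is a placeholder
# for a connected Database instance).
schema = db.schema()
for table in sorted(schema.tables(), key=lambda t: t.full_name):
    print(table.full_name, [col.name for col in schema.columns(table)])
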
    def is_view(self, table: TableReference | str) -> bool:
        """Checks whether a specific table actually is a view.

        Parameters
        ----------
        table : TableReference | str
            The table to check. May not be a virtual table.

        Returns
        -------
        bool
            Whether the table is a view

        Raises
        ------
        ValueError
            If the table was not found in the current database

        Notes
        -----
        **Hint for implementors:** the default implementation of this method relies on the *information_schema.tables* view.
        """
        if isinstance(table, TableReference) and table.virtual:
            raise VirtualTableError(table)
        table = table if isinstance(table, str) else table.full_name
        db_name = self._db.database_name()

        query_template = textwrap.dedent(f"""
            SELECT table_type
            FROM information_schema.tables
            WHERE table_catalog = {self._prep_placeholder}
                AND table_name = {self._prep_placeholder}
                AND table_catalog = current_database()
            """)
        self._db.cursor().execute(query_template, (db_name, table))
        result_set = self._db.cursor().fetchall()

        assert result_set is not None
        if not result_set:
            raise ValueError(f"Table '{table}' not found in database '{db_name}'")
        table_type = result_set[0][0]
        return table_type == "VIEW"

    def lookup_column(
        self,
        column: ColumnReference | str,
        candidate_tables: Iterable[TableReference],
        *,
        expect_match: bool = False,
    ) -> Optional[TableReference]:
        """Searches for a table that owns the given column.

        Parameters
        ----------
        column : ColumnReference | str
            The column that is being looked up
        candidate_tables : Iterable[TableReference]
            Tables that could possibly own the given column
        expect_match : bool, optional
            If enabled, an error is raised whenever no table is found. Otherwise *None* is returned. By default, this is
            disabled.

        Returns
        -------
        TableReference
            The first of the `candidate_tables` that has a column of similar name.

        Raises
        ------
        ValueError
            If `expect_match` is enabled and none of the candidate tables has a column of the given name.

        Notes
        -----
        **Hint for implementors:** the default implementation of this method (transitively) relies on the
        *information_schema.columns* view.
        """
        for candidate in candidate_tables:
            candidate_cols = self.columns(candidate)
            if column in candidate_cols:
                return candidate

        if expect_match:
            raise ValueError(
                f"Column '{column}' not found in any of the candidate tables: {candidate_tables}"
            )
        return None

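# Illustrative sketch, not part of the package source: resolving which candidate
# table owns an unqualified column name ("movies" and "actors" are made-up
# TableReferences).
owner = schema.lookup_column("production_year", [movies, actors], expect_match=True)
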
    def is_primary_key(self, column: ColumnReference) -> bool:
        """Checks whether a column is the primary key for its associated table.

        Parameters
        ----------
        column : ColumnReference
            The column to check

        Returns
        -------
        bool
            Whether the column is the primary key of its table. If it is part of a compound primary key, this is *False*.

        Raises
        ------
        postbound.qal.UnboundColumnError
            If the column is not associated with any table
        postbound.qal.VirtualTableError
            If the table associated with the column is a virtual table (e.g. subquery or CTE)

        Notes
        -----
        **Hint for implementors:** the default implementation of this method relies on the
        *information_schema.table_constraints* and *information_schema.constraint_column_usage* views.
        """
        if not column.is_bound():
            raise UnboundColumnError(
                f"Cannot check primary key status for column {column}: Column is not bound to any table."
            )

        schema_placeholder = (
            self._prep_placeholder if column.table.schema else "current_schema()"
        )
        query_template = textwrap.dedent(f"""
            SELECT ccu.column_name
            FROM information_schema.table_constraints tc
                JOIN information_schema.constraint_column_usage ccu
                ON tc.constraint_name = ccu.constraint_name
                    AND tc.table_catalog = ccu.table_catalog
                    AND tc.table_schema = ccu.table_schema
                    AND tc.table_name = ccu.table_name
                    AND tc.constraint_catalog = ccu.constraint_catalog
            WHERE tc.table_name = {self._prep_placeholder}
                AND ccu.column_name = {self._prep_placeholder}
                AND tc.constraint_type = 'PRIMARY KEY'
                AND tc.table_catalog = current_database()
                AND tc.table_schema = {schema_placeholder};
            """)

        params = [column.table.full_name, column.name]
        if column.table.schema:
            params.append(column.table.schema)

        self._db.cursor().execute(query_template, params)
        result_set = self._db.cursor().fetchone()

        return result_set is not None

    def primary_key_column(
        self, table: TableReference | str
    ) -> Optional[ColumnReference]:
        """Determines the primary key column of a specific table.

        Parameters
        ----------
        table : TableReference | str
            The table to check

        Returns
        -------
        Optional[ColumnReference]
            The primary key if it exists, or *None* otherwise.

        Notes
        -----
        **Hint for implementors:** the default implementation of this method relies on the
        *information_schema.table_constraints* and *information_schema.constraint_column_usage* views.
        """
        table = table if isinstance(table, TableReference) else TableReference(table)
        schema_placeholder = (
            self._prep_placeholder if table.schema else "current_schema()"
        )
        query_template = textwrap.dedent(f"""
            SELECT ccu.column_name
            FROM information_schema.table_constraints tc
                JOIN information_schema.constraint_column_usage ccu
                ON tc.constraint_name = ccu.constraint_name
                    AND tc.table_catalog = ccu.table_catalog
                    AND tc.table_schema = ccu.table_schema
                    AND tc.table_name = ccu.table_name
                    AND tc.constraint_catalog = ccu.constraint_catalog
            WHERE tc.table_name = {self._prep_placeholder}
                AND tc.constraint_type = 'PRIMARY KEY'
                AND tc.table_catalog = current_database()
                AND tc.table_schema = {schema_placeholder};
            """)

        params = [table.full_name]
        if table.schema:
            params.append(table.schema)

        self._db.cursor().execute(query_template, params)
        result_set = self._db.cursor().fetchall()

        if not result_set:
            return None
        elif len(result_set) > 1:
            raise ValueError(
                f"Table {table} has multiple primary key columns: {result_set}"
            )
        col = result_set[0][0]
        return ColumnReference(col, table)

    def has_secondary_index(self, column: ColumnReference) -> bool:
        """Checks whether a secondary index is available for a specific column.

        Parameters
        ----------
        column : ColumnReference
            The column to check

        Returns
        -------
        bool
            Whether a secondary index of any kind was created for the column. Compound indexes and primary key indexes
            fail this test.

        Raises
        ------
        postbound.qal.UnboundColumnError
            If the column is not associated with any table
        postbound.qal.VirtualTableError
            If the table associated with the column is a virtual table (e.g. subquery or CTE)

        Notes
        -----
        **Hints for implementors:**
        The default implementation of this method assumes that each foreign key column and each column with a UNIQUE constraint
        has an associated index. If this should not be the case, a custom implementation needs to be supplied.
        Furthermore, the implementation relies on the *information_schema.table_constraints*,
        *information_schema.constraint_column_usage* and *information_schema.key_column_usage* views.
        """

        # The documentation of has_index() references an implementation detail of this method.
        # Make sure to keep the two in sync.

        if not column.is_bound():
            raise UnboundColumnError(
                f"Cannot check index status for column {column}: Column is not bound to any table."
            )

        schema_placeholder = (
            self._prep_placeholder if column.table.schema else "current_schema()"
        )

        # The query template is much more complicated here, due to the different semantics of the constraint_column_usage
        # view. For UNIQUE constraints, the column is the column that is constrained. However, for foreign keys, the column
        # is the column that is being referenced.
        query_template = textwrap.dedent(f"""
            SELECT ccu.column_name
            FROM information_schema.table_constraints tc
                JOIN information_schema.constraint_column_usage ccu
                ON tc.constraint_name = ccu.constraint_name
                    AND tc.table_catalog = ccu.table_catalog
                    AND tc.table_schema = ccu.table_schema
                    AND tc.table_name = ccu.table_name
                    AND tc.constraint_catalog = ccu.constraint_catalog
            WHERE tc.table_name = {self._prep_placeholder}
                AND ccu.column_name = {self._prep_placeholder}
                AND tc.constraint_type = 'UNIQUE'
                AND tc.table_catalog = current_database()
                AND tc.table_schema = {schema_placeholder}
            UNION
            SELECT kcu.column_name
            FROM information_schema.table_constraints tc
                JOIN information_schema.key_column_usage kcu
                ON tc.constraint_name = kcu.constraint_name
                    AND tc.table_catalog = kcu.table_catalog
                    AND tc.table_schema = kcu.table_schema
                    AND tc.table_name = kcu.table_name
                    AND tc.constraint_catalog = kcu.constraint_catalog
            WHERE tc.table_name = {self._prep_placeholder}
                AND kcu.column_name = {self._prep_placeholder}
                AND tc.constraint_type = 'FOREIGN KEY'
                AND tc.table_catalog = current_database()
                AND tc.table_schema = {schema_placeholder};
            """)

        # Due to the UNION query, we need to repeat the placeholders. While the implementation is definitely not elegant,
        # this solution is arguably better than relying on named parameters which might or might not be supported by the
        # target database.
        params = [column.table.full_name, column.name]
        if column.table.schema:
            params.append(column.table.schema)
        params.extend([column.table.full_name, column.name])
        if column.table.schema:
            params.append(column.table.schema)

        self._db.cursor().execute(query_template, params)
        result_set = self._db.cursor().fetchone()

        return result_set is not None

    def foreign_keys_on(self, column: ColumnReference) -> set[ColumnReference]:
        """Fetches all foreign key constraints that are specified on a specific column.

        The provided columns are the target columns that are referenced by the foreign key constraint. E.g., suppose there are
        tables A and B with columns x and y. We specify a foreign key constraint on column y to ensure that all values in y
        reference a value in x. Then, calling this method on column y will return column x. If there are multiple foreign key
        constraints on the same column, all of them will be returned.

        Parameters
        ----------
        column : ColumnReference
            The column to check. All foreign keys that are "pointing from" this column to another column are returned.

        Returns
        -------
        set[ColumnReference]
            The columns that are "pointed to" by foreign key constraints on the given column. If no such foreign keys exist,
            an empty set is returned.

        Raises
        ------
        postbound.qal.UnboundColumnError
            If the column is not associated with any table
        postbound.qal.VirtualTableError
            If the table associated with the column is a virtual table (e.g. subquery or CTE)
        """
        if not column.is_bound():
            raise UnboundColumnError(
                f"Cannot check foreign keys for column {column}: Column is not bound to any table."
            )

        schema_placeholder = (
            self._prep_placeholder if column.table.schema else "current_schema()"
        )
        query_template = textwrap.dedent(f"""
            SELECT ccu.table_name, ccu.column_name
            FROM information_schema.table_constraints tc
                JOIN information_schema.key_column_usage kcu
                ON tc.constraint_name = kcu.constraint_name
                    AND tc.table_schema = kcu.table_schema
                    AND tc.table_name = kcu.table_name
                JOIN information_schema.constraint_column_usage ccu
                ON tc.constraint_name = ccu.constraint_name
                    AND tc.table_schema = ccu.table_schema
                    AND tc.table_catalog = ccu.table_catalog
            WHERE tc.table_name = {self._prep_placeholder}
                AND kcu.column_name = {self._prep_placeholder}
                AND tc.constraint_type = 'FOREIGN KEY'
                AND tc.table_schema = {schema_placeholder}
                AND tc.table_catalog = current_database();
            """)
        params = [column.table.full_name, column.name]
        if column.table.schema:
            params.append(column.table.schema)

        self._db.cursor().execute(query_template, params)
        result_set = self._db.cursor().fetchall()

        return {
            ColumnReference(row[1], TableReference(row[0], schema=column.table.schema))
            for row in result_set
        }

    def has_index(self, column: ColumnReference) -> bool:
        """Checks whether there is any index structure available on a column.

        Parameters
        ----------
        column : ColumnReference
            The column to check

        Returns
        -------
        bool
            Whether any kind of index (primary or secondary) is available for the column. Only compound indexes will
            fail this test.

        Raises
        ------
        postbound.qal.UnboundColumnError
            If the column is not associated with any table
        postbound.qal.VirtualTableError
            If the table associated with the column is a virtual table (e.g. subquery or CTE)

        Notes
        -----
        **Hints for implementors:** the default implementation of this method (transitively) relies on the
        **information_schema.table_constraints** and **information_schema.constraint_column_usage** views. It assumes
        that primary keys, foreign keys and unique constraints are all associated with an index structure. If this is
        not the case, a custom implementation needs to be supplied.
        """
        return self.is_primary_key(column) or self.has_secondary_index(column)

    def indexes_on(self, column: ColumnReference) -> set[str]:
        """Retrieves the names of all indexes of a specific column.

        Parameters
        ----------
        column : ColumnReference
            The column to check.

        Returns
        -------
        set[str]
            The indexes. If no indexes are available, the set will be empty.

        Raises
        ------
        postbound.qal.UnboundColumnError
            If the column is not associated with any table
        postbound.qal.VirtualTableError
            If the table associated with the column is a virtual table (e.g. subquery or CTE)

        Notes
        -----
        **Hints for implementors:** the default implementation of this method assumes that primary keys, foreign keys
        and unique constraints are all associated with an index structure. It provides the names of the corresponding
        constraints. The implementation relies on the *information_schema.table_constraints*,
        *information_schema.constraint_column_usage* and *information_schema.key_column_usage* views.
        """
        if not column.is_bound():
            raise UnboundColumnError(
                f"Cannot retrieve indexes for column {column}: Column is not bound to any table."
            )

        schema_placeholder = (
            self._prep_placeholder if column.table.schema else "current_schema()"
        )

        # The query template is much more complicated here, due to the different semantics of the
        # constraint_column_usage view. For UNIQUE constraints, the column is the column that is constrained.
        # However, for foreign keys, the column is the column that is being referenced.
        query_template = textwrap.dedent(f"""
            SELECT tc.constraint_name
            FROM information_schema.table_constraints tc
                JOIN information_schema.constraint_column_usage ccu
                    ON tc.constraint_name = ccu.constraint_name
                        AND tc.table_catalog = ccu.table_catalog
                        AND tc.table_schema = ccu.table_schema
                        AND tc.table_name = ccu.table_name
                        AND tc.constraint_catalog = ccu.constraint_catalog
            WHERE tc.table_name = {self._prep_placeholder}
                AND ccu.column_name = {self._prep_placeholder}
                AND tc.constraint_type IN ('PRIMARY KEY', 'UNIQUE')
                AND tc.table_catalog = current_database()
                AND tc.table_schema = {schema_placeholder}
            UNION
            SELECT tc.constraint_name
            FROM information_schema.table_constraints tc
                JOIN information_schema.key_column_usage kcu
                    ON tc.constraint_name = kcu.constraint_name
                        AND tc.table_catalog = kcu.table_catalog
                        AND tc.table_schema = kcu.table_schema
                        AND tc.table_name = kcu.table_name
                        AND tc.constraint_catalog = kcu.constraint_catalog
            WHERE tc.table_name = {self._prep_placeholder}
                AND kcu.column_name = {self._prep_placeholder}
                AND tc.constraint_type = 'FOREIGN KEY'
                AND tc.table_catalog = current_database()
                AND tc.table_schema = {schema_placeholder};
            """)

        # Due to the UNION query, we need to repeat the placeholders. While the implementation is definitely not
        # elegant, this solution is arguably better than relying on named parameters which might or might not be
        # supported by the target database.
        params = [column.table.full_name, column.name]
        if column.table.schema:
            params.append(column.table.schema)
        params.extend([column.table.full_name, column.name])
        if column.table.schema:
            params.append(column.table.schema)

        cursor = self._db.cursor()
        cursor.execute(query_template, params)
        result_set = cursor.fetchall()

        return {row[0] for row in result_set}

    def datatype(self, column: ColumnReference) -> str:
        """Retrieves the (physical) data type of a column.

        The provided type can be a standardized SQL type, but it can be a type specific to the concrete database
        system just as well. It is up to the user to figure this out and to react accordingly.

        Parameters
        ----------
        column : ColumnReference
            The column to check

        Returns
        -------
        str
            The datatype. Will never be empty.

        Raises
        ------
        postbound.qal.UnboundColumnError
            If the column is not associated with any table
        postbound.qal.VirtualTableError
            If the table associated with the column is a virtual table (e.g. subquery or CTE)

        Notes
        -----
        **Hint for implementors:** the default implementation of this method relies on the
        *information_schema.columns* view.
        """
        if not column.is_bound():
            raise UnboundColumnError(
                f"Cannot check datatype for column {column}: Column is not bound to any table."
            )

        schema_placeholder = (
            self._prep_placeholder if column.table.schema else "current_schema()"
        )
        query_template = textwrap.dedent(f"""
            SELECT data_type
            FROM information_schema.columns
            WHERE table_name = {self._prep_placeholder}
                AND column_name = {self._prep_placeholder}
                AND table_catalog = current_database()
                AND table_schema = {schema_placeholder};
            """)

        params = [column.table.full_name, column.name]
        if column.table.schema:
            params.append(column.table.schema)

        cursor = self._db.cursor()
        cursor.execute(query_template, params)
        result_set = cursor.fetchone()
        assert result_set

        return result_set[0]

    def is_nullable(self, column: ColumnReference) -> bool:
        """Checks whether a specific column may contain NULL values.

        Parameters
        ----------
        column : ColumnReference
            The column to check

        Returns
        -------
        bool
            Whether the column may contain NULL values

        Raises
        ------
        postbound.qal.UnboundColumnError
            If the column is not associated with any table
        postbound.qal.VirtualTableError
            If the table associated with the column is a virtual table (e.g. subquery or CTE)

        Notes
        -----
        **Hint for implementors:** the default implementation of this method relies on the
        *information_schema.columns* view.
        """
        if not column.is_bound():
            raise UnboundColumnError(
                f"Cannot check nullability for column {column}: Column is not bound to any table."
            )

        schema_placeholder = (
            self._prep_placeholder if column.table.schema else "current_schema()"
        )
        query_template = textwrap.dedent(f"""
            SELECT is_nullable
            FROM information_schema.columns
            WHERE table_name = {self._prep_placeholder}
                AND column_name = {self._prep_placeholder}
                AND table_catalog = current_database()
                AND table_schema = {schema_placeholder};
            """)

        params = [column.table.full_name, column.name]
        if column.table.schema:
            params.append(column.table.schema)

        cursor = self._db.cursor()
        cursor.execute(query_template, params)
        result_set = cursor.fetchone()
        assert result_set

        return result_set[0] == "YES"

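    # Usage sketch for the column-level accessors above (the schema() accessor and the
    # table/column names are assumptions for illustration):
    #
    #   schema = current_database().schema()
    #   col = ColumnReference("production_year", TableReference("title"))
    #   schema.datatype(col)      # e.g. "integer" (spelling is system-specific)
    #   schema.is_nullable(col)   # True unless a NOT NULL constraint exists
    #   schema.has_index(col)     # True if a PK/FK/UNIQUE-backed index covers the column
    #   schema.indexes_on(col)    # e.g. {"title_pkey"}
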
    def as_graph(self) -> nx.DiGraph:
        """Constructs a compact representation of the database schema.

        The schema is expressed as a directed graph. Each table is represented as a node. Nodes contain the following
        attributes:

        - `columns`: a list of all columns in the table
        - `data_type`: a dictionary mapping each column to its data type
        - `primary_key`: the primary key of the table (if it exists, otherwise *None*)

        In addition, edges are used to model foreign key constraints. Each edge points from the table that contains
        the foreign key (column *y* in the example in `foreign_keys_on`) to the table that is referenced by the
        foreign key (*x* in the example in `foreign_keys_on`). Edges contain an attribute `foreign_keys` with a list
        of the foreign key relationships. Each such constraint is described by a `ForeignKeyRef`.
        """
        g = nx.DiGraph()
        all_columns: set[ColumnReference] = set()

        for table in self.tables():
            if self.is_view(table):
                continue

            cols = self.columns(table)
            dtypes = {col: self.datatype(col) for col in cols}
            pkey = self.primary_key_column(table)
            g.add_node(table, columns=cols, data_type=dtypes, primary_key=pkey)

            all_columns |= set(cols)

        for col in all_columns:
            foreign_keys = self.foreign_keys_on(col)
            for fk_target in foreign_keys:
                fk_constraint = ForeignKeyRef(fk_target, col)
                current_edge = g.edges.get((col.table, fk_target.table))

                if current_edge:
                    current_edge["foreign_keys"].append(fk_constraint)
                else:
                    g.add_edge(col.table, fk_target.table, foreign_keys=[fk_constraint])

        return g

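    # Sketch: inspecting the schema graph built by `as_graph` using plain networkx accessors
    # (the `schema` object is an assumption for illustration):
    #
    #   g = schema.as_graph()
    #   for referencing, referenced, attrs in g.edges(data=True):
    #       for fk in attrs["foreign_keys"]:
    #           print(f"{referencing} -> {referenced} via {fk}")
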
    def join_equivalence_keys(self) -> dict[ColumnReference, set[ColumnReference]]:
        """Calculates the equivalence classes of joinable columns in the database schema.

        Two columns are considered joinable if they are linked by a foreign key constraint.
        For example, consider a schema with three tables R, S and T with foreign keys R.a -> S.b and S.b -> T.c.
        Then, the columns R.a, S.b and T.c are all joinable and form an equivalence class.
        Likewise, the constraints R.a -> T.c and S.b -> T.c would establish the same equivalence class.
        On the other hand, the constraints R.a -> S.b and S.c -> T.d create two different equivalence classes.

        Returns
        -------
        dict[ColumnReference, set[ColumnReference]]
            A mapping from each column to its equivalence class, i.e. the set of all columns that are joinable with
            it (including itself).
        """
        columns = util.flatten(self.columns(table) for table in self.tables())
        g = nx.Graph()
        for col in columns:
            edges = [(col, fk_target) for fk_target in self.foreign_keys_on(col)]
            g.add_edges_from(edges)

        eq_keys: dict[ColumnReference, set[ColumnReference]] = {}
        for component in nx.connected_components(g):
            for key in component:
                eq_keys[key] = component

        return eq_keys

    def join_equivalence_classes(self) -> Iterable[set[ColumnReference]]:
        """Calculates the equivalence classes of joinable columns in the database schema.

        This method is similar to `join_equivalence_keys`, but returns the different equivalence classes instead of
        a mapping. See its documentation for more details.

        See Also
        --------
        join_equivalence_keys
        """
        columns = util.flatten(self.columns(table) for table in self.tables())
        g = nx.Graph()
        for col in columns:
            edges = [(col, fk_target) for fk_target in self.foreign_keys_on(col)]
            g.add_edges_from(edges)
        return list(nx.connected_components(g))

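    # The R/S/T example from `join_equivalence_keys`, replayed with plain networkx to show
    # why chained FK constraints collapse into a single equivalence class:
    #
    #   import networkx as nx
    #   g = nx.Graph()
    #   g.add_edges_from([("R.a", "S.b"), ("S.b", "T.c")])  # R.a -> S.b, S.b -> T.c
    #   list(nx.connected_components(g))                    # [{"R.a", "S.b", "T.c"}]
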
    def __hash__(self) -> int:
        return hash(self._db)

    def __eq__(self, other: object) -> bool:
        return isinstance(other, type(self)) and self._db == other._db

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"Database schema of {self._db}"


class DatabaseStatistics(abc.ABC):
    """The statistics interface provides unified access to table-level and column-level statistics.

    There are two main challenges when implementing a generalized statistics interface for different database
    systems. The first one is the non-deterministic creation and maintenance of statistics by most database systems.
    This means that creating two identical databases on the same database system on the same machine might still
    yield different statistical values. This is because database systems oftentimes create statistics from random
    samples of column values to speed up computation. However, such variability hurts our efforts to enable
    reproducible experiments, since different performance metrics might not be due to differences in the
    optimization algorithms but due to bad luck when creating the statistics (whether it is a good sign if an
    algorithm is that fragile to deviations in statistics is another question). The second main challenge is that
    different database systems maintain different statistics. Even though many statistics are considered quite
    "basic" by the research community, not all system developers deemed all statistics necessary for their
    optimizer. Once again, this can severely hinder the application of an optimization algorithm if it relies on a
    basic statistic that just happens to not be available on the desired target database system.

    To address both of these issues, the statistics interface operates in two different modes: in *native* mode it
    simply delegates all requests for statistical information to the corresponding catalogs of the database systems.
    Alternatively, the statistics interface can create the illusion of a normalized and standardized statistics
    catalogue. This so-called *emulated* mode does not rely on the statistics catalogs and issues equivalent SQL
    queries instead. For example, if a statistic on the number of distinct values of a column is requested, this is
    emulated by running a *SELECT COUNT(DISTINCT column) FROM table* query.

    The current mode can be customized using the boolean `emulated` property. If the statistics interface operates
    in native mode (i.e. based on the actual statistics catalog) and the user requests a statistic that is not
    available in the selected database system, the behavior depends on another attribute:
    `enable_emulation_fallback`. If this boolean attribute is *True*, an emulated statistic will be calculated
    instead. Otherwise, an `UnsupportedDatabaseFeatureError` is raised.

    Since the live computation of emulated statistics can be costly, the statistics interface has its own
    `cache_enabled` attribute. It can be set to `None` to use the default caching behavior of the database system.
    However, if this attribute is set to `True` or `False` directly, caching will be used accordingly for all
    compute-intensive statistics operations (and only such operations). Once again, this only works because
    PostBOUND assumes the database to be immutable.

    Parameters
    ----------
    db : Database
        The database for which the statistics should be provided. This is required to hook into the database cache
        and to obtain the cursors to actually execute queries.
    emulated : bool, optional
        Whether the statistics interface should operate in emulation mode. To enable reproducibility, this is *True*
        by default
    enable_emulation_fallback : bool, optional
        Whether emulation should be used for unsupported statistics when running in native mode, by default True
    cache_enabled : Optional[bool], optional
        Whether emulated statistics queries should be subject to caching, by default True. Set to *None* to use the
        caching behavior of the `db`

    See Also
    --------
    postbound.postbound.OptimizationPipeline : The basic optimization process applied by PostBOUND
    """

    def __init__(
        self,
        db: Database,
        *,
        emulated: bool = True,
        enable_emulation_fallback: bool = True,
        cache_enabled: Optional[bool] = True,
    ) -> None:
        self.emulated = emulated
        self.enable_emulation_fallback = enable_emulation_fallback
        self.cache_enabled = cache_enabled
        self._db = db

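    # Mode-selection sketch (the concrete `stats` object would come from a Database
    # implementation, e.g. via a hypothetical db.statistics() accessor):
    #
    #   stats.emulated = True    # live SQL queries: reproducible, but potentially slow
    #   stats.emulated = False   # system catalogs: fast, but values vary between instances
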
    def total_rows(
        self,
        table: TableReference,
        *,
        emulated: Optional[bool] = None,
        cache_enabled: Optional[bool] = None,
    ) -> Optional[int]:
        """Provides (an estimate of) the total number of rows in a table.

        Parameters
        ----------
        table : TableReference
            The table to check
        emulated : Optional[bool], optional
            Whether to force emulation mode for this single call. Defaults to *None* which indicates that the
            emulation setting of the statistics interface should be used.
        cache_enabled : Optional[bool], optional
            Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
            setting of the statistics interface should be used.

        Returns
        -------
        Optional[int]
            The total number of rows in the table. If no such statistic exists, but the database system in principle
            maintains the statistic, *None* is returned. For example, this situation can occur if the database
            system only maintains a row count if the table has at least a certain size and the table in question did
            not reach that size yet.

        Raises
        ------
        VirtualTableError
            If the given table is virtual (e.g. subquery or CTE)
        """
        if table.virtual:
            raise VirtualTableError(table)
        if emulated or (emulated is None and self.emulated):
            return self._calculate_total_rows(
                table, cache_enabled=self._determine_caching_behavior(cache_enabled)
            )
        else:
            return self._retrieve_total_rows_from_stats(table)

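    # Per-call override sketch: the keyword arguments shadow the interface-wide settings
    # for a single request (the table name is a hypothetical example):
    #
    #   t = TableReference("title")
    #   stats.total_rows(t)                                       # honors stats.emulated
    #   stats.total_rows(t, emulated=True, cache_enabled=False)   # exact COUNT(*), uncached
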
    def distinct_values(
        self,
        column: ColumnReference,
        *,
        emulated: Optional[bool] = None,
        cache_enabled: Optional[bool] = None,
    ) -> Optional[int]:
        """Provides (an estimate of) the total number of different column values of a specific column.

        Parameters
        ----------
        column : ColumnReference
            The column to check
        emulated : Optional[bool], optional
            Whether to force emulation mode for this single call. Defaults to *None* which indicates that the
            emulation setting of the statistics interface should be used.
        cache_enabled : Optional[bool], optional
            Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
            setting of the statistics interface should be used.

        Returns
        -------
        Optional[int]
            The number of distinct values in the column. If no such statistic exists, but the database system in
            principle maintains the statistic, *None* is returned. For example, this situation can occur if the
            database system only maintains a distinct value count if the column values are distributed in a
            sufficiently diverse way.

        Raises
        ------
        postbound.qal.UnboundColumnError
            If the column is not associated with any table
        postbound.qal.VirtualTableError
            If the table associated with the column is a virtual table (e.g. subquery or CTE)
        """
        if not column.table:
            raise UnboundColumnError(column)
        elif column.table.virtual:
            raise VirtualTableError(column.table)
        if emulated or (emulated is None and self.emulated):
            return self._calculate_distinct_values(
                column, cache_enabled=self._determine_caching_behavior(cache_enabled)
            )
        else:
            return self._retrieve_distinct_values_from_stats(column)

    def min_max(
        self,
        column: ColumnReference,
        *,
        emulated: Optional[bool] = None,
        cache_enabled: Optional[bool] = None,
    ) -> Optional[tuple[Any, Any]]:
        """Provides (an estimate of) the minimum and maximum values in a column.

        Parameters
        ----------
        column : ColumnReference
            The column to check
        emulated : Optional[bool], optional
            Whether to force emulation mode for this single call. Defaults to *None* which indicates that the
            emulation setting of the statistics interface should be used.
        cache_enabled : Optional[bool], optional
            Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
            setting of the statistics interface should be used.

        Returns
        -------
        Optional[tuple[Any, Any]]
            A tuple of minimum and maximum value. If no such statistic exists, but the database system in principle
            maintains the statistic, *None* is returned. For example, this situation can occur if the database
            system only maintains the min/max value if they are sufficiently far apart.

        Raises
        ------
        postbound.qal.UnboundColumnError
            If the column is not associated with any table
        postbound.qal.VirtualTableError
            If the table associated with the column is a virtual table (e.g. subquery or CTE)
        """
        if not column.table:
            raise UnboundColumnError(column)
        elif column.table.virtual:
            raise VirtualTableError(column.table)
        if emulated or (emulated is None and self.emulated):
            return self._calculate_min_max_values(
                column, cache_enabled=self._determine_caching_behavior(cache_enabled)
            )
        else:
            return self._retrieve_min_max_values_from_stats(column)

    def most_common_values(
        self,
        column: ColumnReference,
        *,
        k: int = 10,
        emulated: Optional[bool] = None,
        cache_enabled: Optional[bool] = None,
    ) -> Sequence[tuple[Any, int]]:
        """Provides (an estimate of) the total number of occurrences of the `k` most frequent values of a column.

        Parameters
        ----------
        column : ColumnReference
            The column to check
        k : int, optional
            The maximum number of most common values to return. Defaults to 10. If fewer values are available, all
            of the available values will be returned.
        emulated : Optional[bool], optional
            Whether to force emulation mode for this single call. Defaults to *None* which indicates that the
            emulation setting of the statistics interface should be used.
        cache_enabled : Optional[bool], optional
            Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
            setting of the statistics interface should be used.

        Returns
        -------
        Sequence[tuple[Any, int]]
            The most common values in pairs of (value, frequency), starting with the highest frequency. Notice that
            this sequence can be empty if no values are available. This can happen if the database system in
            principle maintains this statistic but considers the value distribution too uniform to make the
            maintenance worthwhile. Likewise, if fewer common values exist than the requested `k`, only the
            available values will be returned (and the sequence will be shorter than `k` in that case).

        Raises
        ------
        postbound.qal.UnboundColumnError
            If the column is not associated with any table
        postbound.qal.VirtualTableError
            If the table associated with the column is a virtual table (e.g. subquery or CTE)
        """
        if not column.table:
            raise UnboundColumnError(column)
        elif column.table.virtual:
            raise VirtualTableError(column.table)
        if emulated or (emulated is None and self.emulated):
            return self._calculate_most_common_values(
                column, k, cache_enabled=self._determine_caching_behavior(cache_enabled)
            )
        else:
            return self._retrieve_most_common_values_from_stats(column, k)

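    # Sketch: a classic use of these statistics is estimating the selectivity of an equality
    # predicate col = literal. Frequent values use their recorded frequency; the remainder is
    # assumed to be distributed uniformly over the non-MCV distinct values (names hypothetical):
    #
    #   n_rows = stats.total_rows(col.table)
    #   mcvs = dict(stats.most_common_values(col, k=10))
    #   if literal in mcvs:
    #       selectivity = mcvs[literal] / n_rows
    #   else:
    #       remaining = n_rows - sum(mcvs.values())
    #       selectivity = remaining / (stats.distinct_values(col) - len(mcvs)) / n_rows
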
    def _calculate_total_rows(
        self, table: TableReference, *, cache_enabled: Optional[bool] = None
    ) -> int:
        """Retrieves the total number of rows of a table by issuing a *COUNT(\\*)* query against the live database.

        The table is assumed to be non-virtual.

        Parameters
        ----------
        table : TableReference
            The table to check
        cache_enabled : Optional[bool], optional
            Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
            setting of the statistics interface should be used.

        Returns
        -------
        int
            The total number of rows in the table.
        """
        query_template = "SELECT COUNT(*) FROM {tab}".format(tab=table.full_name)
        return self._db.execute_query(
            query_template,
            cache_enabled=self._determine_caching_behavior(cache_enabled),
        )

    def _calculate_distinct_values(
        self, column: ColumnReference, *, cache_enabled: Optional[bool] = None
    ) -> int:
        """Retrieves the number of distinct column values by issuing a *COUNT(DISTINCT)* query over that column
        against the live database.

        The column is assumed to be bound to a (non-virtual) table.

        Parameters
        ----------
        column : ColumnReference
            The column to check
        cache_enabled : Optional[bool], optional
            Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
            setting of the statistics interface should be used.

        Returns
        -------
        int
            The number of distinct values in the column
        """
        query_template = "SELECT COUNT(DISTINCT {col}) FROM {tab}".format(
            col=column.name, tab=column.table.full_name
        )
        return self._db.execute_query(
            query_template,
            cache_enabled=self._determine_caching_behavior(cache_enabled),
        )

    def _calculate_min_max_values(
        self, column: ColumnReference, *, cache_enabled: Optional[bool] = None
    ) -> tuple[Any, Any]:
        """Retrieves the minimum/maximum values in a column by issuing an aggregation query for that column against
        the live database.

        The column is assumed to be bound to a (non-virtual) table.

        Parameters
        ----------
        column : ColumnReference
            The column to check
        cache_enabled : Optional[bool], optional
            Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
            setting of the statistics interface should be used.

        Returns
        -------
        tuple[Any, Any]
            A tuple of *(min, max)*
        """
        query_template = "SELECT MIN({col}), MAX({col}) FROM {tab}".format(
            col=column.name, tab=column.table.full_name
        )
        return self._db.execute_query(
            query_template,
            cache_enabled=self._determine_caching_behavior(cache_enabled),
        )

    def _calculate_most_common_values(
        self, column: ColumnReference, k: int, *, cache_enabled: Optional[bool] = None
    ) -> Sequence[tuple[Any, int]]:
        """Retrieves the `k` most frequent values of a column along with their frequencies by issuing a query over
        that column against the live database.

        The actual query combines a *COUNT(\\*)* aggregation with a grouping over the column values, followed by a
        count-based ordering and limit.

        The column is assumed to be bound to a (non-virtual) table.

        Parameters
        ----------
        column : ColumnReference
            The column to check
        k : int
            The number of most frequent values to retrieve. If fewer values are available (because the column does
            not contain that many distinct values), the frequencies of all values are returned.
        cache_enabled : Optional[bool], optional
            Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
            setting of the statistics interface should be used.

        Returns
        -------
        Sequence[tuple[Any, int]]
            The most common values in *(value, frequency)* pairs, ordered by largest frequency first. Can be smaller
            than the requested `k` value if the column contains fewer distinct values.
        """
        query_template = textwrap.dedent(
            """
            SELECT {col}, COUNT(*) AS n
            FROM {tab}
            GROUP BY {col}
            ORDER BY n DESC, {col}
            LIMIT {k}""".format(col=column.name, tab=column.table.full_name, k=k)
        )
        return self._db.execute_query(
            query_template,
            cache_enabled=self._determine_caching_behavior(cache_enabled),
        )

    @abc.abstractmethod
    def _retrieve_total_rows_from_stats(self, table: TableReference) -> Optional[int]:
        """Queries the DBMS-internal metadata for the number of rows in a table.

        The table is assumed to be non-virtual.

        Parameters
        ----------
        table : TableReference
            The table to check

        Returns
        -------
        Optional[int]
            The total number of rows in the table. If no such statistic exists, but the database system in principle
            maintains the statistic, *None* is returned. For example, this situation can occur if the database
            system only maintains a row count if the table has at least a certain size and the table in question did
            not reach that size yet.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def _retrieve_distinct_values_from_stats(
        self, column: ColumnReference
    ) -> Optional[int]:
        """Queries the DBMS-internal metadata for the number of distinct values of the column.

        The column is assumed to be bound to a (non-virtual) table.

        Parameters
        ----------
        column : ColumnReference
            The column to check

        Returns
        -------
        Optional[int]
            The number of distinct values in the column. If no such statistic exists, but the database system in
            principle maintains the statistic, *None* is returned. For example, this situation can occur if the
            database system only maintains a distinct value count if the column values are distributed in a
            sufficiently diverse way.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def _retrieve_min_max_values_from_stats(
        self, column: ColumnReference
    ) -> Optional[tuple[Any, Any]]:
        """Queries the DBMS-internal metadata for the minimum / maximum value in a column.

        The column is assumed to be bound to a (non-virtual) table.

        Parameters
        ----------
        column : ColumnReference
            The column to check

        Returns
        -------
        Optional[tuple[Any, Any]]
            A tuple of minimum and maximum value. If no such statistic exists, but the database system in principle
            maintains the statistic, *None* is returned. For example, this situation can occur if the database
            system only maintains the min/max value if they are sufficiently far apart.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def _retrieve_most_common_values_from_stats(
        self, column: ColumnReference, k: int
    ) -> Sequence[tuple[Any, int]]:
        """Queries the DBMS-internal metadata for the `k` most common values of the `column`.

        The column is assumed to be bound to a (non-virtual) table.

        Parameters
        ----------
        column : ColumnReference
            The column to check
        k : int
            The maximum number of most common values to return. If fewer values are available, all of the available
            values will be returned.

        Returns
        -------
        Sequence[tuple[Any, int]]
            The most common values in pairs of (value, frequency), starting with the highest frequency. Notice that
            this sequence can be empty if no values are available. This can happen if the database system in
            principle maintains this statistic but considers the value distribution too uniform to make the
            maintenance worthwhile. Likewise, if fewer common values exist than the requested `k`, only the
            available values will be returned (and the sequence will be shorter than `k` in that case).
        """
        raise NotImplementedError

    def _determine_caching_behavior(
        self, local_cache_enabled: Optional[bool]
    ) -> Optional[bool]:
        """Utility to quickly figure out which caching behavior to use.

        This method is intended to be called by the top-level methods that provide statistics and enable a selective
        caching which overrides the caching behavior of the statistics interface.

        Parameters
        ----------
        local_cache_enabled : Optional[bool]
            The caching setting selected by the caller / user.

        Returns
        -------
        Optional[bool]
            Whether caching should be enabled, or whether this should be determined by the actual database interface.
        """
        return (
            self.cache_enabled if local_cache_enabled is None else local_cache_enabled
        )

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"Database statistics of {self._db}"


class HintWarning(UserWarning):
    """Custom warning category for hinting-related problems."""

    def __init__(self, msg: str) -> None:
        super().__init__(msg)


class HintService(abc.ABC):
    """Provides the necessary tools to generate system-specific query instances based on optimizer decisions.

    Hints are PostBOUND's way to enforce that decisions made in the optimization pipeline are respected by the
    native query optimizer once the query is executed in an actual database system. The general documentation
    provides much more information about why this is necessary and how PostBOUND approaches query optimization and
    query generation.

    Each database system has to implement this interface to be usable as part of an optimization pipeline.

    See Also
    --------
    OptimizationPipeline.optimize_query : For a general introduction into the query optimization process
    """

    @abc.abstractmethod
    def generate_hints(
        self,
        query: SqlQuery,
        plan: Optional[QueryPlan] = None,
        *,
        join_order: Optional[JoinTree] = None,
        physical_operators: Optional[PhysicalOperatorAssignment] = None,
        plan_parameters: Optional[PlanParameterization] = None,
    ) -> SqlQuery:
        """Transforms the input query such that the given optimization decisions are respected during query
        execution.

        In the most common case this involves building a `Hint` clause that encodes the optimization decisions in a
        system-specific way. However, depending on the concrete database system, this might also involve a
        restructuring of certain parts of the query, e.g. the usage of specific join statements, the introduction of
        non-standard SQL statements, or a reordering of the *FROM* clause.

        Notice that all optimization information is optional. If individual parameters are set to *None*, nothing
        has been enforced by PostBOUND's optimization process and the native optimizer of the database system should
        "fill the gaps".

        Implementations of this method are required to respect the supplied join and scan operators as much as
        possible. However, there is no requirement to represent auxiliary nodes (e.g. sorts) if this is not possible
        or meaningful for the plan. As a rule of thumb, implementations should value the integrity of the plan in
        the database higher than a perfect representation of the input data.

        Parameters
        ----------
        query : SqlQuery
            The query that should be transformed
        plan : Optional[QueryPlan], optional
            The query execution plan. If this is given, all other parameters should be *None*. This essentially
            enforces the given query plan.
        join_order : Optional[JoinTree], optional
            The sequence in which individual joins should be executed.
        physical_operators : Optional[PhysicalOperatorAssignment], optional
            The physical operators that should be used for the query execution. In addition to selecting specific
            operators for specific joins or scans, this can also include disabling certain operators for the entire
            query.
        plan_parameters : Optional[PlanParameterization], optional
            Additional parameters and metadata for the native optimizer of the database system. Probably the most
            important use-case of these parameters is the supply of cardinality estimates for different joins and
            scans. For example, these can be combined with a join order to influence the physical operators that the
            native optimizer chooses. Another scenario is to only supply such cardinality estimates and leave the
            `join_order` and `physical_operators` completely empty, which essentially simulates a different
            cardinality estimation algorithm for the query. Notice, however, that in this scenario cardinality
            estimates for all possible intermediate results of the query have to be supplied. Otherwise, the native
            optimizer once again "fills the gaps" and uses its own estimates for the remaining intermediate results
            that it explores during plan enumeration. This would probably effectively break the estimation
            algorithm.

        Returns
        -------
        SqlQuery
            The transformed query. It contains all necessary information to enforce the optimization decisions as
            best as possible. Notice that whether the native optimizer of the database system is obliged to respect
            the optimization decisions depends on the specific system. For example, for MySQL hints are really just
            hints and the optimizer is only encouraged to use specific operators but not forced to do so.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def format_query(self, query: SqlQuery) -> str:
        """Transforms the query into a database-specific string, mostly to incorporate deviations from standard SQL.

        This method is necessary because the query abstraction layer is focused on modelling and unifying different
        parts of an SQL query. However, some database systems (cough .. MySQL .. cough) deviate from standard SQL
        syntax and express different parts of a query differently. The most prominent example are older versions of
        MySQL that used double quotes for string values rather than the SQL-standard single quotes. Therefore, the
        `format_query` method takes an abstract representation of an SQL query as input and turns it into a string
        representation that accounts for all such deviations.

        Parameters
        ----------
        query : SqlQuery
            The query that should be adapted for the database system

        Returns
        -------
        str
            An equivalent notation of the query that incorporates system-specific deviations from standard SQL.
            Notice that this query possibly can no longer be parsed by the query abstraction layer. It is a one-way
            process.

        See Also
        --------
        postbound.qal : the query abstraction layer provided by PostBOUND
        """
        raise NotImplementedError

    @abc.abstractmethod
    def supports_hint(self, hint: PhysicalOperator | HintType) -> bool:
        """Checks whether the database system is capable of using the specified hint or operator.

        Parameters
        ----------
        hint : PhysicalOperator | HintType
            The hint/feature to check

        Returns
        -------
        bool
            Indicates whether the feature is supported by the specific database system.
        """
        raise NotImplementedError


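# Implementation sketch for a comment-based hint service (in the style of Postgres with the
# pg_hint_plan extension). The helper methods and the hint rendering are illustrative
# assumptions and do not mirror the actual PostBOUND backends:
#
#   class CommentHintService(HintService):
#       def generate_hints(self, query, plan=None, *, join_order=None,
#                          physical_operators=None, plan_parameters=None):
#           hint_block = self._render_hints(plan, join_order, physical_operators,
#                                           plan_parameters)   # e.g. "/*+ Leading((a b)) */"
#           return self._attach_hint_block(query, hint_block)  # hypothetical transformation
#
#       def format_query(self, query):
#           return str(query)  # no syntax deviations for this system
#
#       def supports_hint(self, hint):
#           return hint in self._supported_hints
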
class OptimizerInterface(abc.ABC):
    """Provides high-level access to internal optimizer-related data for the database system.

    Each functionality is available through a dedicated method. Notice that not all database systems necessarily
    support all of these functions.
    """

    @abc.abstractmethod
    def query_plan(self, query: SqlQuery | str) -> QueryPlan:
        """Obtains the query execution plan for a specific query.

        This respects all hints that potentially influence the optimization process.

        Parameters
        ----------
        query : SqlQuery | str
            The input query

        Returns
        -------
        QueryPlan
            The corresponding execution plan. This will never be an *ANALYZE* plan, but contain as much meaningful
            information as can be derived for the specific database system (e.g. regarding cardinality and cost
            estimates)
        """
        raise NotImplementedError

    @abc.abstractmethod
    def analyze_plan(self, query: SqlQuery) -> QueryPlan:
        """Executes a specific query and provides the query execution plan supplemented with runtime information.

        This respects all hints that potentially influence the optimization process.

        Parameters
        ----------
        query : SqlQuery
            The input query

        Returns
        -------
        QueryPlan
            The corresponding execution plan. This plan will be an *ANALYZE* plan and contain all information that
            can be derived for the specific database system (e.g. cardinality estimates as well as true cardinality
            counts)
        """
        raise NotImplementedError

    @abc.abstractmethod
    def cardinality_estimate(self, query: SqlQuery | str) -> Cardinality:
        """Queries the DBMS query optimizer for its cardinality estimate, instead of executing the query.

        The cardinality estimate will correspond to the estimate for the final node. Therefore, running this method
        with aggregate queries is not particularly meaningful.

        Parameters
        ----------
        query : SqlQuery | str
            The input query

        Returns
        -------
        Cardinality
            The cardinality estimate of the native optimizer for the database system.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def cost_estimate(self, query: SqlQuery | str) -> Cost:
        """Queries the DBMS query optimizer for the estimated cost of executing the query.

        The cost estimate will correspond to the estimate for the final node. Typically, this cost includes the cost
        of all sub-operators as well.

        Parameters
        ----------
        query : SqlQuery | str
            The input query

        Returns
        -------
        Cost
            The cost estimate of the native optimizer for the database system.
        """
        raise NotImplementedError


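# Usage sketch for the optimizer interface: contrasting the native estimate with the true
# cardinality of a query. The optimizer() accessor is an assumption; the q-error is the
# standard symmetric estimation-quality metric:
#
#   opt = db.optimizer()
#   sql = "SELECT * FROM title WHERE production_year > 2000"
#   estimated = opt.cardinality_estimate(sql)
#   actual = ...  # e.g. read from the root node of opt.analyze_plan(parsed_query)
#   q_error = max(estimated / actual, actual / estimated)
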
_DB_POOL: DatabasePool | None = None
"""Private variable that captures the current singleton instance of the `DatabasePool`."""


class DatabasePool:
    """The database pool allows different parts of the code base to easily obtain access to a database.

    This is achieved by maintaining one global pool of database connections which is shared by the entire system.
    New database instances can be registered and retrieved via unique keys. As long as there is just a single
    database instance, it can be accessed via the `current_database` method.

    The database pool implementation follows the singleton pattern. Use the static `get_instance` method to retrieve
    the database pool instance. All other functionality is provided based on that pool instance.

    References
    ----------

    .. Singleton pattern: https://en.wikipedia.org/wiki/Singleton_pattern
    """

    @staticmethod
    def get_instance() -> DatabasePool:
        """Provides access to the singleton database pool, creating a new pool instance if necessary.

        Returns
        -------
        DatabasePool
            The current pool instance
        """
        global _DB_POOL
        if _DB_POOL is None:
            _DB_POOL = DatabasePool()
        return _DB_POOL

    def __init__(self):
        self._pool: dict[str, Database] = {}

    def current_database(self) -> Database:
        """Provides the database that is currently stored in the pool, provided there is just one.

        Returns
        -------
        Database
            The only database in the pool

        Raises
        ------
        ValueError
            If there are multiple database instances registered in the pool
        """
        return util.dicts.value(self._pool)

    def register_database(self, key: str, db: Database) -> None:
        """Stores a new database in the pool.

        This method is typically called by the connect methods of the respective database system implementations.

        Parameters
        ----------
        key : str
            A unique identifier under which the database can be retrieved
        db : Database
            The database to store
        """
        self._pool[key] = db

    def retrieve_database(self, key: str) -> Database:
        """Provides the database that is registered under a specific key.

        Parameters
        ----------
        key : str
            The key that was previously used to register the database

        Returns
        -------
        Database
            The corresponding database

        Raises
        ------
        KeyError
            If no database was registered under the given key.
        """
        return self._pool[key]

    def empty(self) -> bool:
        """Checks whether the database pool is currently empty (i.e. no databases are registered).

        Returns
        -------
        bool
            *True* if the pool is empty.
        """
        return len(self._pool) == 0

    def clear(self) -> None:
        """Removes all currently registered databases from the pool."""
        self._pool.clear()

    def __contains__(self, key: str) -> bool:
        return key in self._pool

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"DatabasePool {self._pool}"


def current_database() -> Database:
    """Provides the current database from the `DatabasePool`.

    Returns
    -------
    Database
        The current database instance. If there is not exactly one database in the pool, a `ValueError` is raised.

    See Also
    --------
    DatabasePool.current_database
    """
    return DatabasePool.get_instance().current_database()


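# Usage sketch for the database pool (the key and the db object are placeholders; the
# connect methods of the concrete backends normally perform the registration themselves):
#
#   pool = DatabasePool.get_instance()
#   pool.register_database("imdb", db)
#   assert pool.retrieve_database("imdb") is db
#   assert current_database() is db   # works while "imdb" is the only registered entry
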
class UnsupportedDatabaseFeatureError(RuntimeError):
    """Indicates that some requested feature is not supported by the database.

    For example, PostgreSQL (at least up to version 15) does not capture minimum or maximum column values in its
    system statistics. Therefore, forcing the DBS to retrieve such information from its metadata could result in
    this error.

    Parameters
    ----------
    database : Database
        The database that was requested to provide the problematic feature
    feature : str
        A textual description for the requested feature
    """

    def __init__(self, database: Database, feature: str) -> None:
        super().__init__(
            f"Database {database.system_name} does not support feature {feature}"
        )
        self.database = database
        self.feature = feature


class DatabaseServerError(RuntimeError):
    """Indicates that an error occurred on the database server while executing a database operation.

    The error was **not** due to a mistake in the user input (such as an SQL syntax error or access privilege
    violation), but an implementation issue instead (such as running out of memory during query execution).

    Parameters
    ----------
    message : str, optional
        A textual description of the error, e.g. *out of memory*. Can be left empty by default.
    context : Optional[object], optional
        Additional context information for when the error occurred, e.g. the query that caused the error. Mainly
        intended for debugging purposes.
    """

    def __init__(self, message: str = "", context: Optional[object] = None) -> None:
        super().__init__(message)
        self.ctx = context


class DatabaseUserError(RuntimeError):
    """Indicates that a database operation failed due to an error on the user's end.

    The error could be due to an SQL syntax error, an access privilege violation, etc.

    Parameters
    ----------
    message : str, optional
        A textual description of the error, e.g. *no such table*. Can be left empty by default.
    context : Optional[object], optional
        Additional context information for when the error occurred, e.g. the query that caused the error. Mainly
        intended for debugging purposes.
    """

    def __init__(self, message: str = "", context: Optional[object] = None) -> None:
        super().__init__(message)
        self.ctx = context