PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. postbound/__init__.py +211 -0
  2. postbound/_base.py +6 -0
  3. postbound/_bench.py +1012 -0
  4. postbound/_core.py +1153 -0
  5. postbound/_hints.py +1373 -0
  6. postbound/_jointree.py +1079 -0
  7. postbound/_pipelines.py +1121 -0
  8. postbound/_qep.py +1986 -0
  9. postbound/_stages.py +876 -0
  10. postbound/_validation.py +734 -0
  11. postbound/db/__init__.py +72 -0
  12. postbound/db/_db.py +2348 -0
  13. postbound/db/_duckdb.py +785 -0
  14. postbound/db/mysql.py +1195 -0
  15. postbound/db/postgres.py +4216 -0
  16. postbound/experiments/__init__.py +12 -0
  17. postbound/experiments/analysis.py +674 -0
  18. postbound/experiments/benchmarking.py +54 -0
  19. postbound/experiments/ceb.py +877 -0
  20. postbound/experiments/interactive.py +105 -0
  21. postbound/experiments/querygen.py +334 -0
  22. postbound/experiments/workloads.py +980 -0
  23. postbound/optimizer/__init__.py +92 -0
  24. postbound/optimizer/__init__.pyi +73 -0
  25. postbound/optimizer/_cardinalities.py +369 -0
  26. postbound/optimizer/_joingraph.py +1150 -0
  27. postbound/optimizer/dynprog.py +1825 -0
  28. postbound/optimizer/enumeration.py +432 -0
  29. postbound/optimizer/native.py +539 -0
  30. postbound/optimizer/noopt.py +54 -0
  31. postbound/optimizer/presets.py +147 -0
  32. postbound/optimizer/randomized.py +650 -0
  33. postbound/optimizer/tonic.py +1479 -0
  34. postbound/optimizer/ues.py +1607 -0
  35. postbound/qal/__init__.py +343 -0
  36. postbound/qal/_qal.py +9678 -0
  37. postbound/qal/formatter.py +1089 -0
  38. postbound/qal/parser.py +2344 -0
  39. postbound/qal/relalg.py +4257 -0
  40. postbound/qal/transform.py +2184 -0
  41. postbound/shortcuts.py +70 -0
  42. postbound/util/__init__.py +46 -0
  43. postbound/util/_errors.py +33 -0
  44. postbound/util/collections.py +490 -0
  45. postbound/util/dataframe.py +71 -0
  46. postbound/util/dicts.py +330 -0
  47. postbound/util/jsonize.py +68 -0
  48. postbound/util/logging.py +106 -0
  49. postbound/util/misc.py +168 -0
  50. postbound/util/networkx.py +401 -0
  51. postbound/util/numbers.py +438 -0
  52. postbound/util/proc.py +107 -0
  53. postbound/util/stats.py +37 -0
  54. postbound/util/system.py +48 -0
  55. postbound/util/typing.py +35 -0
  56. postbound/vis/__init__.py +5 -0
  57. postbound/vis/fdl.py +69 -0
  58. postbound/vis/graphs.py +48 -0
  59. postbound/vis/optimizer.py +538 -0
  60. postbound/vis/plots.py +84 -0
  61. postbound/vis/tonic.py +70 -0
  62. postbound/vis/trees.py +105 -0
  63. postbound-0.19.0.dist-info/METADATA +355 -0
  64. postbound-0.19.0.dist-info/RECORD +67 -0
  65. postbound-0.19.0.dist-info/WHEEL +5 -0
  66. postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
  67. postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/db/_db.py ADDED
@@ -0,0 +1,2348 @@
1
+ """This module provides PostBOUNDs basic interaction with databases.
2
+
3
+ More specifically, this includes
4
+
5
+ - an interface to interact with databases (the `Database` interface)
6
+ - an interface to retrieve schema information (the `DatabaseSchema` interface)
7
+ - an interface to obtain different table-level and column-level statistics (the `DatabaseStatistics` interface)
8
+ - an interface to modify queries such that optimization decisions are respected during the actual query execution (the
9
+ `HintService` interface)
10
+ - an interface to access information of the native optimizer of the database system (the `OptimizerInterface` class)
11
+ - a utility to easily obtain database connections (the `DatabasePool` singleton class).
12
+
13
+ Take a look at the central `Database` class for more details. All concrete database systems need to implement this
14
+ interface.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import abc
20
+ import atexit
21
+ import collections
22
+ import json
23
+ import os
24
+ import textwrap
25
+ import warnings
26
+ from collections.abc import Iterable, Sequence
27
+ from datetime import date, datetime, time, timedelta
28
+ from typing import Any, Optional, Protocol, Type, runtime_checkable
29
+
30
+ import networkx as nx
31
+
32
+ from .. import util
33
+ from .._core import (
34
+ Cardinality,
35
+ ColumnReference,
36
+ Cost,
37
+ TableReference,
38
+ UnboundColumnError,
39
+ VirtualTableError,
40
+ )
41
+ from .._hints import (
42
+ HintType,
43
+ PhysicalOperator,
44
+ PhysicalOperatorAssignment,
45
+ PlanParameterization,
46
+ )
47
+ from .._jointree import JoinTree
48
+ from .._qep import QueryPlan
49
+ from ..qal._qal import SqlQuery
50
+
51
ResultRow = tuple
"""Simple type alias to denote a single tuple from a result set."""

ResultSet = Sequence[ResultRow]
"""Simple type alias to denote the result relation of a query, i.e. a sequence of result rows."""
56
+
57
+
58
class Cursor(Protocol):
    """Interface for database cursors that adhere to the Python Database API specification.

    This is not a complete representation and only focuses on the parts of the specification that are important for
    PostBOUND right now. In the future, additional methods might get added.

    This type is only intended to denote the expected return type of certain methods, the cursors themselves are
    supplied by the respective database integrations. There should be no need to implement one manually and all cursors
    should be compatible with this interface by default (since they are DB API 2.0 cursor objects).

    See PEP 249 for details (https://peps.python.org/pep-0249/)
    """

    @abc.abstractmethod
    def close(self) -> None:
        """Closes the cursor and releases any resources associated with it (see PEP 249)."""
        raise NotImplementedError

    @abc.abstractmethod
    def execute(
        self, operation: str, parameters: Optional[dict | Sequence] = None
    ) -> Optional[Cursor]:
        """Executes a database operation, optionally binding the given parameters (see PEP 249)."""
        raise NotImplementedError

    @abc.abstractmethod
    def fetchone(self) -> Optional[ResultRow]:
        """Fetches the next row of the current result set, if there is one (see PEP 249)."""
        raise NotImplementedError

    @abc.abstractmethod
    def fetchall(self) -> Optional[ResultSet]:
        """Fetches all (remaining) rows of the current result set (see PEP 249)."""
        raise NotImplementedError
88
+
89
+
90
class Connection(Protocol):
    """Interface for database connections that adhere to the Python Database API specification.

    This is not a complete representation and only focuses on the parts of the specification that are important for
    PostBOUND right now. In the future, additional methods might get added.

    This type is only intended to denote the expected return type of certain methods, the connections themselves are
    supplied by the respective database integrations. There should be no need to implement one manually and all
    connections should be compatible with this interface by default (since they are DB API 2.0 connection objects).

    See PEP 249 for details (https://peps.python.org/pep-0249/)
    """

    @abc.abstractmethod
    def close(self) -> None:
        """Closes the connection to the database (see PEP 249)."""
        raise NotImplementedError

    @abc.abstractmethod
    def cursor(self) -> Cursor:
        """Provides a new cursor object operating on this connection (see PEP 249)."""
        raise NotImplementedError
110
+
111
+
112
@runtime_checkable
class PrewarmingSupport(Protocol):
    """Some databases might support adding specific tables to their shared buffer.

    If so, they should implement this protocol to allow other parts of the framework to exploit this feature.
    """

    @abc.abstractmethod
    def prewarm_tables(
        self,
        tables: Optional[TableReference | Iterable[TableReference]] = None,
        *more_tables: TableReference,
        exclude_table_pages: bool = False,
        include_primary_index: bool = True,
        include_secondary_indexes: bool = True,
    ) -> None:
        """Prepares the database buffer pool with tuples from specific tables.

        Parameters
        ----------
        tables : Optional[TableReference | Iterable[TableReference]], optional
            The tables that should be placed into the buffer pool
        *more_tables : TableReference
            More tables that should be placed into the buffer pool, enabling a more convenient usage of this method.
            See examples for details on the usage.
        exclude_table_pages : bool, optional
            Whether the table data (i.e. pages containing the actual tuples) should *not* be prewarmed. This is off by
            default, meaning that prewarming is applied to the data pages. This can be toggled on to only prewarm index
            pages (see `include_primary_index` and `include_secondary_indexes`).
        include_primary_index : bool, optional
            Whether the pages of the primary key index should also be prewarmed. Enabled by default.
        include_secondary_indexes : bool, optional
            Whether the pages for secondary indexes should also be prewarmed. Enabled by default.

        Notes
        -----
        If the database should prewarm more table pages than can be contained in the shared buffer, the actual contents
        of the pool are not specified. All prewarming tasks might happen sequentially, in which case the first prewarmed
        relations will typically be evicted and only the last relations (tables or indexes) are retained in the shared
        buffer. The precise order in which the prewarming tasks are executed is not specified and depends on the actual
        relations.

        Examples
        --------
        >>> database.prewarm_tables([table1, table2])
        >>> database.prewarm_tables(table1, table2)
        """
        ...
159
+
160
+
161
@runtime_checkable
class TimeoutSupport(Protocol):
    """Marks database systems that support executing queries with a timeout."""

    def execute_with_timeout(
        self, query: SqlQuery | str, *, timeout: float = 60.0
    ) -> Optional[ResultSet]:
        """Executes a query with a specific timeout.

        For query execution, we use the following rules in contrast to `Database.execute_query`:

        1. We never make use of the database interfaces' cache, even if the query is contained in the cache
        2. We never attempt to simplify the result set, even if this would be possible (e.g., for single-row result
           sets). This is more of a pragmatic decision to be able to indicate a timeout with *None* and distinguishing
           it from a valid result set of a single *NULL* tuple. Otherwise, we would have to resort to raising
           *TimeoutError* or similar strategies, which complicates the control flow for the caller.

        Parameters
        ----------
        query : SqlQuery | str
            The query to execute. If this contains hints or other special features, those will be treated normally.
        timeout : float, optional
            The timeout in seconds. If the query takes longer (including all special treatment of the database
            interface), it will be cancelled. Defaults to 60 seconds.

        Returns
        -------
        Optional[ResultSet]
            The result set of the query. If the query was cancelled, this will be *None*.
        """
        ...
192
+
193
+
194
@runtime_checkable
class StopwatchSupport(Protocol):
    """Marks the database systems that support measurement of query execution times."""

    def time_query(
        self, query: SqlQuery | str, *, timeout: Optional[float] = None
    ) -> float:
        """Determines the execution time of a query.

        The execution time is measured from the moment the query is passed to the internal cursor (i.e. including
        sending the query to the database server), until the execution is finished. Therefore, it does not include the
        time required to transfer the result set back to the client.

        Parameters
        ----------
        query : SqlQuery | str
            The query to execute.
        timeout : Optional[float], optional
            Cancels the query execution if it takes longer than this number (in seconds). Notice that this parameter
            requires timeout support from the database system.

        Returns
        -------
        float
            The runtime of the query in seconds. The result set is ignored.

        Raises
        ------
        UnsupportedDatabaseFeatureError
            If the database system does not support timeouts. You can use the `TimeoutSupport` protocol to check this
            beforehand.
        """
        ...

    def last_query_runtime(self) -> float:
        """Get the runtime of the last executed query.

        The execution time is measured from the moment the query is passed to the internal cursor (i.e. including
        sending the query to the database server), until the execution is finished. Therefore, it does not include the
        time required to transfer the result set back to the client.

        Returns
        -------
        float
            The runtime of the last executed query in seconds. If no query has been executed before, *NaN* is returned.
        """
        ...
241
+
242
+
243
class QueryCacheWarning(UserWarning):
    """Warning to indicate that the query result cache was not found.

    Notes
    -----
    The original implementation defined an ``__init__`` that only forwarded its message to
    ``super().__init__`` -- that is exactly what the inherited constructor already does, so the
    redundant override has been removed. Construction (``QueryCacheWarning("msg")``) is unchanged.
    """
248
+
249
+
250
def simplify_result_set(result_set: list[tuple[Any]]) -> Any:
    """Default implementation of the result set simplification logic outlined in `Database.execute_query`.

    Two simplifications are applied, both individually and in combination: a single-column result set is
    unwrapped from its row tuples (``[(1,), (2,)]`` becomes ``[1, 2]``), and a single-row result set is
    unwrapped from the surrounding list (``[(42, 24)]`` becomes ``(42, 24)``). Consequently, a one-row,
    one-column result collapses to the bare value. Multi-row, multi-column results are left untouched.

    Parameters
    ----------
    result_set : list[tuple[Any]]
        Result set to simplify: each entry in the list corresponds to one row in the result set and each
        component of the tuples corresponds to one column in the result set

    Returns
    -------
    Any
        The simplified result set as described above.
    """
    if not result_set:
        return []

    # Inspect the first row to learn the column count shared by all rows.
    single_column = len(result_set[0]) == 1
    rows = [row[0] for row in result_set] if single_column else result_set

    # A lone row is handed back directly instead of wrapped in a one-element list.
    return rows[0] if len(rows) == 1 else rows
280
+
281
+
282
+ class _DBCacheJsonEncoder(json.JSONEncoder):
283
+ def default(self, obj: Any) -> Any:
284
+ if isinstance(obj, datetime):
285
+ return {"$datetime": obj.isoformat()}
286
+ elif isinstance(obj, date):
287
+ return {"$date": obj.isoformat()}
288
+ elif isinstance(obj, time):
289
+ return {"$time": obj.isoformat()}
290
+ elif isinstance(obj, timedelta):
291
+ return {"$timedelta": obj.total_seconds()}
292
+ return super().default(obj)
293
+
294
+
295
+ class _DBCacheJsonDecoder(json.JSONDecoder):
296
+ def __init__(self, *args, **kwargs):
297
+ self._second_hook = kwargs.get("object_hook")
298
+ super().__init__(object_hook=self.object_hook, *args, **kwargs)
299
+
300
+ def object_hook(self, obj: Any) -> Any:
301
+ if self._second_hook:
302
+ return self._second_hook(obj)
303
+
304
+ if "$datetime" in obj:
305
+ return datetime.fromisoformat(obj["$datetime"])
306
+ elif "$date" in obj:
307
+ return date.fromisoformat(obj["$date"])
308
+ elif "$time" in obj:
309
+ return time.fromisoformat(obj["$time"])
310
+ elif "$timedelta" in obj:
311
+ return timedelta(seconds=obj["$timedelta"])
312
+ return obj
313
+
314
+
315
class Database(abc.ABC):
    """A `Database` is PostBOUND's logical abstraction of physical database management systems.

    It provides high-level access to internal functionality provided by such systems. More specifically, each
    `Database` instance supports the following functionality:

    - executing arbitrary SQL queries
    - retrieving schema information, most importantly about primary keys and foreign keys
    - accessing statistical information, such as most common values or the number of rows in a table
    - query formatting and generation of query hints to enforce optimizer decisions (join orders, operators, etc.)
    - introspection of the query optimizer to retrieve query execution plans, cost estimates, etc.

    Notice, that all this information is by design read-only and functionality to write queries is intentionally not
    implemented (although one could issue `INSERT`/`UPDATE`/`DELETE` queries via the query execution functionality).

    This restriction to read-only information enables the caching of query results to provide them without running a
    query over and over again. This is achieved by storing the results of past queries in a special JSON file, which is
    read upon creation of the `Database` instance. If this behavior is not desired, it can simply be turned off
    globally via the `cache_enabled` property, or on a per-method-call basis by setting the corresponding parameter. If
    no such parameter is available, the specific method does not make use of the caching mechanic.

    Each database management system will need to implement this basic interface to enable PostBOUND to access the
    necessary information.

    Parameters
    ----------
    system_name : str
        The name of the database system for which the connection is established. This is only really important to
        distinguish different instances of the interface in a convenient manner.
    cache_enabled : bool, optional
        Whether complex queries that are executed against the database system should be cached. This is especially
        useful to emulate certain statistics that are not maintained by the specific database system (see
        `DatabaseStatistics` for details). If this is *False*, the query cache will not be loaded as well. Defaults to
        *True*.

    Notes
    -----
    When the `__init__` method is called, the connection to the specific database system has to be established already,
    i.e. calling any of the public methods should provide a valid result. This is particularly important, because this
    method takes care of the cache initialization. This initialization in turn relies on identifying the correct
    cache file, which in turn depends on the system name, system version and database name of the connection.
    """

    def __init__(self, system_name: str, *, cache_enabled: bool = True) -> None:
        self.system_name = system_name

        self._cache_enabled = cache_enabled
        self._query_cache: dict[str, ResultSet] = {}
        if self._cache_enabled:
            # Requires a live connection: the cache file name is derived from system name, version and database name.
            self._inflate_query_cache()
        # Make sure open connections are shut down when the interpreter exits.
        atexit.register(self.close)

    @abc.abstractmethod
    def schema(self) -> DatabaseSchema:
        """Provides access to the underlying schema information of the database.

        Returns
        -------
        DatabaseSchema
            An object implementing the schema interface for the actual database system. This should normally be
            completely stateless.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def statistics(self) -> DatabaseStatistics:
        """Provides access to the current statistics of the database.

        Implementing generalized statistics for a framework that supports multiple different physical database systems
        is much more complicated than it might seem at first. Therefore, different modes for the statistics
        provisioning exist. These modes can be changed by setting the properties of the interface. See the
        documentation of `DatabaseStatistics` for more details.

        Repeated calls to this method are guaranteed to provide the same object. Therefore, changes to the statistics
        interface configuration are guaranteed to be persisted across multiple accesses to the statistics system.

        Returns
        -------
        DatabaseStatistics
            The statistics interface. Repeated calls to this method are guaranteed to provide the same object.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def hinting(self) -> HintService:
        """Provides access to the hint generation facilities for the current database system.

        Returns
        -------
        HintService
            The hinting service. This should normally be completely stateless.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def optimizer(self) -> OptimizerInterface:
        """Provides access to optimizer-related functionality of the database system.

        Returns
        -------
        OptimizerInterface
            The optimizer interface. This should normally be completely stateless.

        Raises
        ------
        UnsupportedDatabaseFeatureError
            If the database system does not provide any sort of external access to the optimizer.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def execute_query(
        self,
        query: SqlQuery | str,
        *,
        cache_enabled: Optional[bool] = None,
        raw: bool = False,
    ) -> Any:
        """Executes the given query and returns the associated result set.

        Parameters
        ----------
        query : SqlQuery | str
            The query to execute. If it contains a `Hint` with `preparatory_statements`, these will be executed
            beforehand. Notice that such statements are never subject to caching.
        cache_enabled : Optional[bool], optional
            Controls the caching behavior for just this one query. The default value of *None* indicates that the
            "global" configuration of the database system should be used. Setting this parameter to a boolean value
            forces or deactivates caching for the specific query for the specific execution no matter what the "global"
            configuration is.
        raw : bool, optional
            Whether the result set should be returned as-is. By default, the result set is simplified. Raw mode skips
            this step.

        Returns
        -------
        Any
            Result set of the input query. This is a list of equal-length tuples in the most general case. Each
            component of the tuple corresponds to a specific column of the result set and each tuple corresponds to a
            row in the result set. However, many queries do not provide a 2-dimensional result set (e.g. *COUNT(\\*)*
            queries). In such cases, the nested structure of the result set makes it quite cumbersome to use.
            Therefore, this method tries to simplify the return value of the query for more convenient use (if `raw`
            mode is disabled). More specifically, if the query returns just a single row, this row is returned directly
            as a tuple. Furthermore, if the query returns just a single column, the values of that column are returned
            directly in a list. Both simplifications will also be combined, such that a result set of a single row of a
            single value will be returned as that single value directly. In all other cases, the result will be a list
            consisting of the different result tuples.

        Notes
        -----
        This method is mainly intended to execute read-only SQL queries. In fact, the only types of SQL queries that
        can be modelled by the query abstraction layer are precisely such read-only queries. However, if one really
        needs to execute mutating queries, they can be issued as plain text. Just remember that this behavior is
        heavily discouraged!

        The precise behavior of this method depends on whether caching is enabled or not. If it is, the query will
        only be executed against the live database system, if it is not in the cache. Otherwise, the result will simply
        be retrieved. Caching can be enabled/disabled for just this one query via the `cache_enabled` switch. If this
        is not specified, caching depends on the `cache_enabled` property.

        If caching should be used for this method, but is disabled at a database-level, the current cache will still
        be read and persisted. This ensures that all cached queries are properly saved and none of the previous cache
        content is lost.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def database_name(self) -> str:
        """Provides the name of the (physical) database that the database interface is connected to.

        Returns
        -------
        str
            The database name, e.g. *imdb* or *tpc-h*
        """
        raise NotImplementedError

    def database_system_name(self) -> str:
        """Provides the name of the database management system that this interface is connected to.

        Returns
        -------
        str
            The database system name, e.g. *PostgreSQL*
        """
        return self.system_name

    @abc.abstractmethod
    def database_system_version(self) -> util.Version:
        """Returns the release version of the database management system that this interface is connected to.

        Returns
        -------
        util.Version
            The version
        """
        raise NotImplementedError

    @abc.abstractmethod
    def describe(self) -> dict:
        """Provides a representation of the current database connection as well as its system settings.

        This description is intended to transparently document which customizations have been applied, thereby giving
        an idea of how the default query execution might have been affected. It can be JSON-serialized and will be
        included by most of the output of the utilities in the `runner` module of the `experiments` package.

        Returns
        -------
        dict
            The actual description
        """
        raise NotImplementedError

    @abc.abstractmethod
    def reset_connection(self) -> None:
        """Obtains a new network connection for the database. Useful for debugging purposes or in case of crashes.

        Notice that resetting the connection can have unintended side-effects if other methods rely on the cursor
        object. After resetting, the former cursor object will probably no longer be valid. Therefore, this method
        should be used with caution.

        See Also
        --------
        Database.cursor
        """
        raise NotImplementedError

    def reset_cache(self) -> None:
        """Removes all results from the query cache. Useful for debugging purposes."""
        self._query_cache = {}

    @abc.abstractmethod
    def cursor(self) -> Cursor:
        """Provides a cursor to execute queries and iterate over result sets manually.

        Returns
        -------
        Cursor
            A cursor compatible with the Python DB API specification 2.0 (PEP 249). The specific cursor type depends on
            the concrete database implementation however.

        References
        ----------

        .. Python DB API specification 2.0 (PEP 249): https://peps.python.org/pep-0249/
        """
        raise NotImplementedError

    @abc.abstractmethod
    def close(self) -> None:
        """Shuts down all currently open connections to the database."""
        raise NotImplementedError

    def provides(self, support: Type) -> bool:
        """Checks, whether the database interface supports a specific protocol.

        This is a simple `isinstance` check, i.e. it works with the `runtime_checkable` protocols defined in this
        module (e.g. `PrewarmingSupport`, `TimeoutSupport`, `StopwatchSupport`).
        """
        return isinstance(self, support)

    def _get_cache_enabled(self) -> bool:
        """Getter for the `cache_enabled` property.

        Returns
        -------
        bool
            Whether caching is currently enabled
        """
        return self._cache_enabled

    def _set_cache_enabled(self, enabled: bool) -> None:
        """Setter for the `cache_enabled` property. Inflates the query cache if necessary.

        If the cache should be enabled now, but no cached data exists, the cache will be inflated from disk.

        Parameters
        ----------
        enabled : bool
            Whether caching should be enabled
        """
        if enabled and not self._query_cache:
            self._inflate_query_cache()
        self._cache_enabled = enabled

    cache_enabled = property(_get_cache_enabled, _set_cache_enabled)
    """Controls, whether the results of executed queries should be cached to prevent future re-execution.

    If caching should be enabled later on and no cached data exists, the cache will be inflated from disk.
    """

    def _inflate_query_cache(self) -> None:
        """Tries to read the query cache for this database.

        This reads a JSON file that contains all cached queries and their result sets. It should not be edited
        manually.
        """
        if self._query_cache:
            # The cache was already loaded earlier -- don't clobber in-memory results.
            return
        query_cache_name = self._query_cache_name()
        if os.path.isfile(query_cache_name):
            with open(query_cache_name, "r") as cache_file:
                try:
                    self._query_cache = json.load(cache_file, cls=_DBCacheJsonDecoder)
                except json.JSONDecodeError as e:
                    warnings.warn(
                        "Could not read query cache: " + str(e),
                        category=QueryCacheWarning,
                    )
                    self._query_cache = {}
        else:
            warnings.warn(
                f"Could not read query cache: File {query_cache_name} does not exist",
                category=QueryCacheWarning,
            )
            self._query_cache = {}
        # Persist whatever ends up in the cache once the interpreter shuts down.
        atexit.register(self._store_query_cache, query_cache_name)

    def _store_query_cache(self, query_cache_name: str) -> None:
        """Stores the query cache into a JSON file.

        Parameters
        ----------
        query_cache_name : str
            The path where to write the file to. If it exists, it will be overwritten.
        """
        with open(query_cache_name, "w") as cache_file:
            json.dump(self._query_cache, cache_file, cls=_DBCacheJsonEncoder)

    def _query_cache_name(self) -> str:
        """Provides a normalized file name for the query cache.

        Returns
        -------
        str
            The cache file name. It consists of the database system name, system version and the name of the database
        """
        identifier = "_".join(
            [
                self.database_system_name(),
                self.database_system_version().formatted(prefix="v", separator="_"),
                self.database_name(),
            ]
        )
        return f".query_cache_{identifier}.json"

    def __hash__(self) -> int:
        # Two interfaces are interchangeable iff they would use the same cache file (system + version + database).
        return hash(self._query_cache_name())

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, type(self))
            and self._query_cache_name() == other._query_cache_name()
        )

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"{self.database_name()} @ {self.database_system_name()} ({self.database_system_version()})"
670
+
671
+
672
ForeignKeyRef = collections.namedtuple("ForeignKeyRef", ["fk_col", "referenced_col"])
"""
A foreign key reference has a foreign key column `fk_col` (the first element) that requires a matching value in the
`referenced_col` (the second element) of the target table.
"""
677
+
678
+
679
+ class DatabaseSchema(abc.ABC):
680
+ """This interface provides access to different information about the logical structure of a database.
681
+
682
+ In contrast to database statistics, schema information is much more standardized. PostBOUND therefore only takes on
683
+ the role of a mediator to delegate requests to different parts of the schema to the approapriate - and sometimes
684
+ system specific - metadata catalogs of the database systems. For each kind of schema information a dedicated query
685
+ method exists. Take a look at these methods to understand the functionality provided by the database schema
686
+ interface.
687
+
688
+ Parameters
689
+ ----------
690
+ db : Database
691
+ The database for which the schema information should be read. This is required to obtain cursors that request
692
+ the desired data.
693
+ prep_placeholder : str, optional
694
+ The placeholder that is used for prepared statements. Some systems use `?` as a placeholder, while others use *%s*
695
+ (the default). This needs to be specified to ensure that the information_schema queries are correctly formatted.
696
+
697
+ Notes
698
+ -----
699
+ **Hint for implementors:** the database schema contains no abstract methods that need to be overridden. All methods come
700
+ with a default implementation that uses the *information_schema* to retrieve the necessary information. However, if the
701
+ target database system does not support specific features of the information_schema, the corresponding methods need to be
702
+ overridden to provide the necessary functionality. The documentation of each method details which parts of the
703
+ information_schema it needs.
704
+ """
705
+
706
+ def __init__(self, db: Database, *, prep_placeholder: str = "%s"):
707
+ self._db = db
708
+ self._prep_placeholder = prep_placeholder
709
+
710
+ def tables(self) -> set[TableReference]:
711
+ """Fetches all user-defined tables that are contained in the current database.
712
+
713
+ Returns
714
+ -------
715
+ set[TableReference]
716
+ All tables in the current schema, including materialized views, etc.
717
+
718
+ Notes
719
+ -----
720
+ **Hint for implementors:** the default implementation of this method relies on the *information_schema.tables* view.
721
+ """
722
+ query_template = textwrap.dedent(f"""
723
+ SELECT table_name
724
+ FROM information_schema.tables
725
+ WHERE table_catalog = {self._prep_placeholder}
726
+ AND table_schema = current_schema()
727
+ """)
728
+ self._db.cursor().execute(query_template, (self._db.database_name(),))
729
+ result_set = self._db.cursor().fetchall()
730
+ assert result_set is not None
731
+ return set(TableReference(row[0]) for row in result_set)
732
+
733
+ def columns(self, table: TableReference | str) -> Sequence[ColumnReference]:
734
+ """Fetches all columns of the given table.
735
+
736
+ Parameters
737
+ ----------
738
+ table : TableReference | str
739
+ A table in the current schema
740
+
741
+ Returns
742
+ -------
743
+ Sequence[ColumnReference]
744
+ All columns for the given table. Columns are ordered according to their position in the table.
745
+ Will be empty if the table is not found or does not contain any columns.
746
+
747
+ Raises
748
+ ------
749
+ postbound.qal.VirtualTableError
750
+ If the given table is virtual (e.g. subquery or CTE)
751
+
752
+ Notes
753
+ -----
754
+ **Hint for implementors:** the default implementation of this method relies on the *information_schema.columns* view.
755
+ """
756
+
757
+ # The documentation of lookup_column() reference an implementation detail of this method.
758
+ # Make sure to keep the two in sync.
759
+
760
+ table = table if isinstance(table, TableReference) else TableReference(table)
761
+ if table.virtual:
762
+ raise VirtualTableError(table)
763
+ schema_placeholder = (
764
+ self._prep_placeholder if table.schema else "current_schema()"
765
+ )
766
+ query_template = textwrap.dedent(f"""
767
+ SELECT column_name
768
+ FROM information_schema.columns
769
+ WHERE table_name = {self._prep_placeholder}
770
+ AND table_catalog = current_database()
771
+ AND table_schema = {schema_placeholder}
772
+ ORDER BY ordinal_position
773
+ """)
774
+ params = [table.full_name]
775
+ if table.schema:
776
+ params.append(table.schema)
777
+ self._db.cursor().execute(query_template, params)
778
+ result_set = self._db.cursor().fetchall()
779
+ assert result_set is not None
780
+ return [ColumnReference(row[0], table) for row in result_set]
781
+
782
+ def is_view(self, table: TableReference | str) -> bool:
783
+ """Checks, whether a specific table is actually is a view.
784
+
785
+ Parameters
786
+ ----------
787
+ table : TableReference | str
788
+ The table to check. May not be a virtual table.
789
+
790
+ Returns
791
+ -------
792
+ bool
793
+ Whether the table is a view
794
+
795
+ Raises
796
+ ------
797
+ ValueError
798
+ If the table was not found in the current database
799
+
800
+ Notes
801
+ -----
802
+ **Hint for implementors:** the default implementation of this method relies on the *information_schema.tables* view.
803
+ """
804
+ if isinstance(table, TableReference) and table.virtual:
805
+ raise VirtualTableError(table)
806
+ table = table if isinstance(table, str) else table.full_name
807
+ db_name = self._db.database_name()
808
+
809
+ query_template = textwrap.dedent(f"""
810
+ SELECT table_type
811
+ FROM information_schema.tables
812
+ WHERE table_catalog = {self._prep_placeholder}
813
+ AND table_name = {self._prep_placeholder}
814
+ AND table_catalog = current_database()
815
+ """)
816
+ self._db.cursor().execute(query_template, (db_name, table))
817
+ result_set = self._db.cursor().fetchall()
818
+
819
+ assert result_set is not None
820
+ if not result_set:
821
+ raise ValueError(f"Table '{table}' not found in database '{db_name}'")
822
+ table_type = result_set[0][0]
823
+ return table_type == "VIEW"
824
+
825
+ def lookup_column(
826
+ self,
827
+ column: ColumnReference | str,
828
+ candidate_tables: Iterable[TableReference],
829
+ *,
830
+ expect_match: bool = False,
831
+ ) -> Optional[TableReference]:
832
+ """Searches for a table that owns the given column.
833
+
834
+ Parameters
835
+ ----------
836
+ column : ColumnReference | str
837
+ The column that is being looked up
838
+ candidate_tables : Iterable[TableReference]
839
+ Tables that could possibly own the given column
840
+ expect_match : bool, optional
841
+ If enabled, an error is raised whenever no table is found. Otherwise *None* is returned. By default, this is
842
+ disabled.
843
+
844
+ Returns
845
+ -------
846
+ TableReference
847
+ The first of the `candidate_tables` that has a column of similar name.
848
+
849
+ Raises
850
+ ------
851
+ ValueError
852
+ If `expect_match` is enabled and none of the candidate tables has a column of the given name.
853
+
854
+ Notes
855
+ -----
856
+ **Hint for implementors:** the default implementation of this method (transitively) relies on the
857
+ *information_schema.columns* view.
858
+ """
859
+ for candidate in candidate_tables:
860
+ candidate_cols = self.columns(candidate)
861
+ if column in candidate_cols:
862
+ return candidate
863
+
864
+ if expect_match:
865
+ raise ValueError(
866
+ f"Column '{column}' not found in any of the candidate tables: {candidate_tables}"
867
+ )
868
+ return None
869
+
870
+ def is_primary_key(self, column: ColumnReference) -> bool:
871
+ """Checks, whether a column is the primary key for its associated table.
872
+
873
+ Parameters
874
+ ----------
875
+ column : ColumnReference
876
+ The column to check
877
+
878
+ Returns
879
+ -------
880
+ bool
881
+ Whether the column is the primary key of its table. If it is part of a compound primary key, this is *False*.
882
+
883
+ Raises
884
+ ------
885
+ postbound.qal.UnboundColumnError
886
+ If the column is not associated with any table
887
+ postbound.qal.VirtualTableError
888
+ If the table associated with the column is a virtual table (e.g. subquery or CTE)
889
+
890
+ Notes
891
+ -----
892
+ **Hint for implementors:** the default implementation of this method relies on the
893
+ *information_schema.table_constraints* and *information_schema.constraint_column_usage* views.
894
+ """
895
+ if not column.is_bound():
896
+ raise UnboundColumnError(
897
+ f"Cannot check primary key status for column {column}: Column is not bound to any table."
898
+ )
899
+
900
+ schema_placeholder = (
901
+ self._prep_placeholder if column.table.schema else "current_schema()"
902
+ )
903
+ query_template = textwrap.dedent(f"""
904
+ SELECT ccu.column_name
905
+ FROM information_schema.table_constraints tc
906
+ JOIN information_schema.constraint_column_usage ccu
907
+ ON tc.constraint_name = ccu.constraint_name
908
+ AND tc.table_catalog = ccu.table_catalog
909
+ AND tc.table_schema = ccu.table_schema
910
+ AND tc.table_name = ccu.table_name
911
+ AND tc.constraint_catalog = ccu.constraint_catalog
912
+ WHERE tc.table_name = {self._prep_placeholder}
913
+ AND ccu.column_name = {self._prep_placeholder}
914
+ AND tc.constraint_type = 'PRIMARY KEY'
915
+ AND tc.table_catalog = current_database()
916
+ AND tc.table_schema = {schema_placeholder};
917
+ """)
918
+
919
+ params = [column.table.full_name, column.name]
920
+ if column.table.schema:
921
+ params.append(column.table.schema)
922
+
923
+ self._db.cursor().execute(query_template, params)
924
+ result_set = self._db.cursor().fetchone()
925
+
926
+ return result_set is not None
927
+
928
+ def primary_key_column(
929
+ self, table: TableReference | str
930
+ ) -> Optional[ColumnReference]:
931
+ """Determines the primary key column of a specific table.
932
+
933
+ Parameters
934
+ ----------
935
+ table : TableReference | str
936
+ The table to check
937
+
938
+ Returns
939
+ -------
940
+ Optional[ColumnReference]
941
+ The primary key if it exists, or *None* otherwise.
942
+
943
+ Notes
944
+ -----
945
+ **Hint for implementors:** the default implementation of this method relies on the
946
+ *information_schema.table_constraints* and *information_schema.constraint_column_usage* views.
947
+ """
948
+ schema_placeholder = (
949
+ self._prep_placeholder if table.schema else "current_schema()"
950
+ )
951
+ query_template = textwrap.dedent(f"""
952
+ SELECT ccu.column_name
953
+ FROM information_schema.table_constraints tc
954
+ JOIN information_schema.constraint_column_usage ccu
955
+ ON tc.constraint_name = ccu.constraint_name
956
+ AND tc.table_catalog = ccu.table_catalog
957
+ AND tc.table_schema = ccu.table_schema
958
+ AND tc.table_name = ccu.table_name
959
+ AND tc.constraint_catalog = ccu.constraint_catalog
960
+ WHERE tc.table_name = {self._prep_placeholder}
961
+ AND tc.constraint_type = 'PRIMARY KEY'
962
+ AND tc.table_catalog = current_database()
963
+ AND tc.table_schema = {schema_placeholder};
964
+ """)
965
+
966
+ params = [table.full_name]
967
+ if table.schema:
968
+ params.append(table.schema)
969
+
970
+ self._db.cursor().execute(query_template, params)
971
+ result_set = self._db.cursor().fetchall()
972
+
973
+ if not result_set:
974
+ return None
975
+ elif len(result_set) > 1:
976
+ raise ValueError(
977
+ f"Table {table} has multiple primary key columns: {result_set}"
978
+ )
979
+ col = result_set[0][0]
980
+ return ColumnReference(col, table)
981
+
982
+ def has_secondary_index(self, column: ColumnReference) -> bool:
983
+ """Checks, whether a secondary index is available for a specific column.
984
+
985
+ Parameters
986
+ ----------
987
+ column : ColumnReference
988
+ The column to check
989
+
990
+ Returns
991
+ -------
992
+ bool
993
+ Whether a secondary index of any kind was created for the column. Compound indexes and primary key indexes
994
+ fail this test.
995
+
996
+ Raises
997
+ ------
998
+ postbound.qal.UnboundColumnError
999
+ If the column is not associated with any table
1000
+ postbound.qal.VirtualTableError
1001
+ If the table associated with the column is a virtual table (e.g. subquery or CTE)
1002
+
1003
+ Notes
1004
+ -----
1005
+ **Hints for implementors:**
1006
+ The default implementation of this method assumes that each foreign key column and each column with a UNIQUE constraint
1007
+ has an associated index. If this should not be the case, a custom implementation needs to be supplied.
1008
+ Furthermore, the implementation relies on the *information_schema.table_constraints*,
1009
+ *information_schema.constraint_column_usage* and *information_schema.key_column_usage* views.
1010
+ """
1011
+
1012
+ # The documentation of has_index() references an implementation detail of this method.
1013
+ # Make sure to keep the two in sync.
1014
+
1015
+ if not column.is_bound():
1016
+ raise UnboundColumnError(
1017
+ f"Cannot check index status for column {column}: Column is not bound to any table."
1018
+ )
1019
+
1020
+ schema_placeholder = (
1021
+ self._prep_placeholder if column.table.schema else "current_schema()"
1022
+ )
1023
+
1024
+ # The query template is much more complicated here, due to the different semantics of the constraint_column_usage
1025
+ # view. For UNIQUE constraints, the column is the column that is constrained. However, for foreign keys, the column
1026
+ # is the column that is being referenced.
1027
+ query_template = textwrap.dedent(f"""
1028
+ SELECT ccu.column_name
1029
+ FROM information_schema.table_constraints tc
1030
+ JOIN information_schema.constraint_column_usage ccu
1031
+ ON tc.constraint_name = ccu.constraint_name
1032
+ AND tc.table_catalog = ccu.table_catalog
1033
+ AND tc.table_schema = ccu.table_schema
1034
+ AND tc.table_name = ccu.table_name
1035
+ AND tc.constraint_catalog = ccu.constraint_catalog
1036
+ WHERE tc.table_name = {self._prep_placeholder}
1037
+ AND ccu.column_name = {self._prep_placeholder}
1038
+ AND tc.constraint_type = 'UNIQUE'
1039
+ AND tc.table_catalog = current_database()
1040
+ AND tc.table_schema = {schema_placeholder}
1041
+ UNION
1042
+ SELECT kcu.column_name
1043
+ FROM information_schema.table_constraints tc
1044
+ JOIN information_schema.key_column_usage kcu
1045
+ ON tc.constraint_name = kcu.constraint_name
1046
+ AND tc.table_catalog = kcu.table_catalog
1047
+ AND tc.table_schema = kcu.table_schema
1048
+ AND tc.table_name = kcu.table_name
1049
+ AND tc.constraint_catalog = kcu.constraint_catalog
1050
+ WHERE tc.table_name = {self._prep_placeholder}
1051
+ AND kcu.column_name = {self._prep_placeholder}
1052
+ AND tc.constraint_type = 'FOREIGN KEY'
1053
+ AND tc.table_catalog = current_database()
1054
+ AND tc.table_schema = {schema_placeholder};
1055
+ """)
1056
+
1057
+ # Due to the UNION query, we need to repeat the placeholders. While the implementation is definitely not elegant,
1058
+ # this solution is arguably better than relying on named parameters which might or might not be supported by the
1059
+ # target database.
1060
+ params = [column.table.full_name, column.name]
1061
+ if column.table.schema:
1062
+ params.append(column.table.schema)
1063
+ params.extend([column.table.full_name, column.name])
1064
+ if column.table.schema:
1065
+ params.append(column.table.schema)
1066
+
1067
+ self._db.cursor().execute(query_template, params)
1068
+ result_set = self._db.cursor().fetchone()
1069
+
1070
+ return result_set is not None
1071
+
1072
+ def foreign_keys_on(self, column: ColumnReference) -> set[ColumnReference]:
1073
+ """Fetches all foreign key constraints that are specified on a specific column.
1074
+
1075
+ The provided columns are the target columns that are referenced by the foreign key constraint. E.g., suppose there are
1076
+ tables A and B with columns x and y. We specify a foreign key constraint on column y to ensure that all values in y
1077
+ reference a value in x. Then, calling this method on column y will return column x. If there are multiple foreign key
1078
+ constraints on the same column, all of them will be returned.
1079
+
1080
+ Parameters
1081
+ ----------
1082
+ column : ColumnReference
1083
+ The column to check. All foreign keys that are "pointing from" this column to another column are returned.
1084
+
1085
+ Returns
1086
+ -------
1087
+ set[ColumnReference]
1088
+ The columns that are "pointed to" by foreign key constraints on the given column. If no such foreign keys exist,
1089
+ an empty set is returned.
1090
+
1091
+ Raises
1092
+ ------
1093
+ postbound.qal.UnboundColumnError
1094
+ If the column is not associated with any table
1095
+ postbound.qal.VirtualTableError
1096
+ If the table associated with the column is a virtual table (e.g. subquery or CTE)
1097
+ """
1098
+ if not column.is_bound():
1099
+ raise UnboundColumnError(
1100
+ f"Cannot check foreign keys for column {column}: Column is not bound to any table."
1101
+ )
1102
+
1103
+ schema_placeholder = (
1104
+ self._prep_placeholder if column.table.schema else "current_schema()"
1105
+ )
1106
+ query_template = textwrap.dedent(f"""
1107
+ SELECT ccu.table_name, ccu.column_name
1108
+ FROM information_schema.table_constraints tc
1109
+ JOIN information_schema.key_column_usage kcu
1110
+ ON tc.constraint_name = kcu.constraint_name
1111
+ AND tc.table_schema = kcu.table_schema
1112
+ AND tc.table_name = kcu.table_name
1113
+ JOIN information_schema.constraint_column_usage ccu
1114
+ ON tc.constraint_name = ccu.constraint_name
1115
+ AND tc.table_schema = ccu.table_schema
1116
+ AND tc.table_catalog = ccu.table_catalog
1117
+ WHERE tc.table_name = {self._prep_placeholder}
1118
+ AND kcu.column_name = {self._prep_placeholder}
1119
+ AND tc.constraint_type = 'FOREIGN KEY'
1120
+ AND tc.table_schema = {schema_placeholder}
1121
+ AND tc.table_catalog = current_database();
1122
+ """)
1123
+ params = [column.table.full_name, column.name]
1124
+ if column.table.schema:
1125
+ params.append(column.table.schema)
1126
+
1127
+ self._db.cursor().execute(query_template, params)
1128
+ result_set = self._db.cursor().fetchall()
1129
+
1130
+ return {
1131
+ ColumnReference(row[1], TableReference(row[0], schema=column.table.schema))
1132
+ for row in result_set
1133
+ }
1134
+
1135
+ def has_index(self, column: ColumnReference) -> bool:
1136
+ """Checks, whether there is any index structure available on a column
1137
+
1138
+ Parameters
1139
+ ----------
1140
+ column : ColumnReference
1141
+ The column to check
1142
+
1143
+ Returns
1144
+ -------
1145
+ bool
1146
+ Whether any kind of index (primary, or secondary) is available for the column. Only compound indexes will
1147
+ fail this test.
1148
+
1149
+ Raises
1150
+ ------
1151
+ postbound.qal.UnboundColumnError
1152
+ If the column is not associated with any table
1153
+ postbound.qal.VirtualTableError
1154
+ If the table associated with the column is a virtual table (e.g. subquery or CTE)
1155
+
1156
+ Notes
1157
+ -----
1158
+ **Hints for implementors:** the default implementation of this method (transitively) relies on the
1159
+ **information_schema.table_constraints** and **information_schema.constraint_column_usage** views. It assumes that
1160
+ primary keys, foreign keys and unique constraints are all associated with an index structure. If this is not the case,
1161
+ a custom implementation needs to be supplied.
1162
+ """
1163
+ return self.is_primary_key(column) or self.has_secondary_index(column)
1164
+
1165
+ def indexes_on(self, column: ColumnReference) -> set[str]:
1166
+ """Retrieves the names of all indexes of a specific column.
1167
+
1168
+ Parameters
1169
+ ----------
1170
+ column : ColumnReference
1171
+ The column to check.
1172
+
1173
+ Returns
1174
+ -------
1175
+ set[str]
1176
+ The indexes. If no indexes are available, the set will be empty.
1177
+
1178
+ Raises
1179
+ ------
1180
+ postbound.qal.UnboundColumnError
1181
+ If the column is not associated with any table
1182
+ postbound.qal.VirtualTableError
1183
+ If the table associated with the column is a virtual table (e.g. subquery or CTE)
1184
+
1185
+ Notes
1186
+ -----
1187
+ **Hints for implementors:** the default implementation of this method assumes that primary keys, foreign keys and
1188
+ unique constraints are all associated with an index structure. It provides the names of the corresponding constraints.
1189
+ The implementation relies on the *information_schema.table_constraints*, *information_schema.constraint_column_usage*
1190
+ and *information_schema.key_column_usage* views.
1191
+ """
1192
+ if not column.is_bound():
1193
+ raise UnboundColumnError(
1194
+ f"Cannot retrieve indexes for column {column}: Column is not bound to any table."
1195
+ )
1196
+
1197
+ schema_placeholder = (
1198
+ self._prep_placeholder if column.table.schema else "current_schema()"
1199
+ )
1200
+
1201
+ # The query template is much more complicated here, due to the different semantics of the constraint_column_usage
1202
+ # view. For UNIQUE constraints, the column is the column that is constrained. However, for foreign keys, the column
1203
+ # is the column that is being referenced.
1204
+ query_template = textwrap.dedent(f"""
1205
+ SELECT tc.constraint_name
1206
+ FROM information_schema.table_constraints tc
1207
+ JOIN information_schema.constraint_column_usage ccu
1208
+ ON tc.constraint_name = ccu.constraint_name
1209
+ AND tc.table_catalog = ccu.table_catalog
1210
+ AND tc.table_schema = ccu.table_schema
1211
+ AND tc.table_name = ccu.table_name
1212
+ AND tc.constraint_catalog = ccu.constraint_catalog
1213
+ WHERE tc.table_name = {self._prep_placeholder}
1214
+ AND ccu.column_name = {self._prep_placeholder}
1215
+ AND tc.constraint_type IN ('PRIMARY KEY', 'UNIQUE')
1216
+ AND tc.table_catalog = current_database()
1217
+ AND tc.table_schema = {schema_placeholder}
1218
+ UNION
1219
+ SELECT tc.constraint_name
1220
+ FROM information_schema.table_constraints tc
1221
+ JOIN information_schema.key_column_usage kcu
1222
+ ON tc.constraint_name = kcu.constraint_name
1223
+ AND tc.table_catalog = kcu.table_catalog
1224
+ AND tc.table_schema = kcu.table_schema
1225
+ AND tc.table_name = kcu.table_name
1226
+ AND tc.constraint_catalog = kcu.constraint_catalog
1227
+ WHERE tc.table_name = {self._prep_placeholder}
1228
+ AND kcu.column_name = {self._prep_placeholder}
1229
+ AND tc.constraint_type = 'FOREIGN KEY'
1230
+ AND tc.table_catalog = current_database()
1231
+ AND tc.table_schema = {schema_placeholder};
1232
+ """)
1233
+
1234
+ # Due to the UNION query, we need to repeat the placeholders. While the implementation is definitely not elegant,
1235
+ # this solution is arguably better than relying on named parameters which might or might not be supported by the
1236
+ # target database.
1237
+ params = [column.table.full_name, column.name]
1238
+ if column.table.schema:
1239
+ params.append(column.table.schema)
1240
+ params.extend([column.table.full_name, column.name])
1241
+ if column.table.schema:
1242
+ params.append(column.table.schema)
1243
+
1244
+ self._db.cursor().execute(query_template, params)
1245
+ result_set = self._db.cursor().fetchall()
1246
+
1247
+ return {row[0] for row in result_set}
1248
+
1249
+ def datatype(self, column: ColumnReference) -> str:
1250
+ """Retrieves the (physical) data type of a column.
1251
+
1252
+ The provided type can be a standardized SQL-type, but it can be a type specific to the concrete database
1253
+ system just as well. It is up to the user to figure this out and to react accordingly.
1254
+
1255
+ Parameters
1256
+ ----------
1257
+ column : ColumnReference
1258
+ The colum to check
1259
+
1260
+ Returns
1261
+ -------
1262
+ str
1263
+ The datatype. Will never be empty.
1264
+
1265
+ Raises
1266
+ ------
1267
+ postbound.qal.UnboundColumnError
1268
+ If the column is not associated with any table
1269
+ postbound.qal.VirtualTableError
1270
+ If the table associated with the column is a virtual table (e.g. subquery or CTE)
1271
+
1272
+ Notes
1273
+ -----
1274
+ **Hint for implementors:** the default implementation of this method relies on the *information_schema.columns* view.
1275
+ """
1276
+ if not column.is_bound():
1277
+ raise UnboundColumnError(
1278
+ f"Cannot check datatype for column {column}: Column is not bound to any table."
1279
+ )
1280
+
1281
+ schema_placeholder = (
1282
+ self._prep_placeholder if column.table.schema else "current_schema()"
1283
+ )
1284
+ query_template = textwrap.dedent(f"""
1285
+ SELECT data_type
1286
+ FROM information_schema.columns
1287
+ WHERE table_name = {self._prep_placeholder}
1288
+ AND column_name = {self._prep_placeholder}
1289
+ AND table_catalog = current_database()
1290
+ AND table_schema = {schema_placeholder};
1291
+ """)
1292
+
1293
+ params = [column.table.full_name, column.name]
1294
+ if column.table.schema:
1295
+ params.append(column.table.schema)
1296
+
1297
+ self._db.cursor().execute(query_template, params)
1298
+ result_set = self._db.cursor().fetchone()
1299
+ assert result_set
1300
+
1301
+ return result_set[0]
1302
+
1303
+ def is_nullable(self, column: ColumnReference) -> bool:
1304
+ """Checks, whether a specific column may contain NULL values.
1305
+
1306
+ Parameters
1307
+ ----------
1308
+ column : ColumnReference
1309
+ The column to check
1310
+
1311
+ Returns
1312
+ -------
1313
+ bool
1314
+ Whether the column may contain NULL values
1315
+
1316
+ Raises
1317
+ ------
1318
+ postbound.qal.UnboundColumnError
1319
+ If the column is not associated with any table
1320
+ postbound.qal.VirtualTableError
1321
+ If the table associated with the column is a virtual table (e.g. subquery or CTE)
1322
+
1323
+ Notes
1324
+ -----
1325
+ **Hint for implementors:** the default implementation of this method relies on the *information_schema.columns* view.
1326
+ """
1327
+ if not column.is_bound():
1328
+ raise UnboundColumnError(
1329
+ f"Cannot check nullability for column {column}: Column is not bound to any table."
1330
+ )
1331
+
1332
+ schema_placeholder = (
1333
+ self._prep_placeholder if column.table.schema else "current_schema()"
1334
+ )
1335
+ query_template = textwrap.dedent(f"""
1336
+ SELECT is_nullable
1337
+ FROM information_schema.columns
1338
+ WHERE table_name = {self._prep_placeholder}
1339
+ AND column_name = {self._prep_placeholder}
1340
+ AND table_catalog = current_database()
1341
+ AND table_schema = {schema_placeholder};
1342
+ """)
1343
+
1344
+ params = [column.table.full_name, column.name]
1345
+ if column.table.schema:
1346
+ params.append(column.table.schema)
1347
+
1348
+ self._db.cursor().execute(query_template, params)
1349
+ result_set = self._db.cursor().fetchone()
1350
+ assert result_set
1351
+
1352
+ return result_set[0] == "YES"
1353
+
1354
+ def as_graph(self) -> nx.DiGraph:
1355
+ """Constructs a compact representation of the database schema.
1356
+
1357
+ The schema is expressed as a directed graph. Each table is represented as a node. Nodes contain the following
1358
+ attributes:
1359
+ - `columns`: a list of all columns in the table
1360
+ - `data_type`: a dictionary mapping each column to its data type
1361
+ - `primary_key`: the primary key of the table (if it exists, otherwise *None*)
1362
+
1363
+ In addition, edges are used to model foreign key constraints. Each edge points from the table that contains the foreign
1364
+ key (column *y* in the example in `foreign_keys_on`) to the table that is referenced by the foreign key (*x* in the
1365
+ example in `foreign_keys_on`). Edges contain an attribute `foreign_keys` with a list of the foreign key
1366
+ relationships. Each such constraint is described by a `ForeignKeyRef`.
1367
+ """
1368
+ g = nx.DiGraph()
1369
+ all_columns: set[ColumnReference] = set()
1370
+
1371
+ for table in self.tables():
1372
+ if self.is_view(table):
1373
+ continue
1374
+
1375
+ cols = self.columns(table)
1376
+ dtypes = {col: self.datatype(col) for col in cols}
1377
+ pkey = self.primary_key_column(table)
1378
+ g.add_node(table, columns=cols, data_type=dtypes, primary_key=pkey)
1379
+
1380
+ all_columns |= set(cols)
1381
+
1382
+ for col in all_columns:
1383
+ foreign_keys = self.foreign_keys_on(col)
1384
+ for fk_target in foreign_keys:
1385
+ fk_constraint = ForeignKeyRef(fk_target, col)
1386
+ current_edge = g.edges.get([col.table, fk_target.table])
1387
+
1388
+ if current_edge:
1389
+ current_edge["foreign_keys"].append(fk_constraint)
1390
+ else:
1391
+ g.add_edge(col.table, fk_target.table, foreign_keys=[fk_constraint])
1392
+
1393
+ return g
1394
+
1395
+ def join_equivalence_keys(self) -> dict[ColumnReference, set[ColumnReference]]:
1396
+ """Calculates the equivalence classes of joinable columns in the database schema.
1397
+
1398
+ Two columns are considered joinable, if they are linked by a foreign key constraint.
1399
+ For example, consider a schema with three tables R, S and T with foreign keys R.a -> S.b and S.b -> T.c.
1400
+ Then, the columns R.a, S.b and T.c are all joinable and form an equivalence class.
1401
+ Likewise, the constraints R.a -> T.c and S.b -> T.c would establish the same equivalence class.
1402
+ On the other hand, the constraints R.a -> S.b and S.c -> T.d create two different equivalence classes.
1403
+
1404
+ Returns
1405
+ -------
1406
+ dict[ColumnReference, set[ColumnReference]]
1407
+ A mapping from each column to its equivalence class, i.e. the set of all columns that are joinable with it
1408
+ (including itself).
1409
+ """
1410
+ columns = util.flatten(self.columns(table) for table in self.tables())
1411
+ g = nx.Graph()
1412
+ for col in columns:
1413
+ edges = [(col, fk_target) for fk_target in self.foreign_keys_on(col)]
1414
+ g.add_edges_from(edges)
1415
+
1416
+ eq_keys: dict[ColumnReference, set[ColumnReference]] = {}
1417
+ for component in nx.connected_components(g):
1418
+ for key in component:
1419
+ eq_keys[key] = component
1420
+
1421
+ return eq_keys
1422
+
1423
+ def join_equivalence_classes(self) -> Iterable[set[ColumnReference]]:
1424
+ """Calculates the quivalence classes of joinable columns in the database schema.
1425
+
1426
+ This method is similar to `join_equivalence_keys`, but returns the different equivalence classes instead of a
1427
+ mapping. See its documentation for more details.
1428
+
1429
+ See Also
1430
+ --------
1431
+ join_equivalence_keys
1432
+ """
1433
+ columns = util.flatten(self.columns(table) for table in self.tables())
1434
+ g = nx.Graph()
1435
+ for col in columns:
1436
+ edges = [(col, fk_target) for fk_target in self.foreign_keys_on(col)]
1437
+ g.add_edges_from(edges)
1438
+ return list(nx.connected_components(g))
1439
+
1440
+ def __hash__(self) -> int:
1441
+ return hash(self._db)
1442
+
1443
+ def __eq__(self, other: object) -> bool:
1444
+ return isinstance(other, type(self)) and self._db == other._db
1445
+
1446
    def __repr__(self) -> str:
        # Delegate to __str__: the human-readable form is also the debug form here.
        return str(self)
1448
+
1449
+ def __str__(self) -> str:
1450
+ return f"Database schema of {self._db}"
1451
+
1452
+
1453
+ class DatabaseStatistics(abc.ABC):
1454
+ """The statistics interface provides unified access to table-level and column-level statistics.
1455
+
1456
+ There are two main challenges when implementing a generalized statistics interface for different database systems.
1457
+ The first one is the non-deterministic creation and maintenance of statistics by most database systems. This means
1458
+ that creating two identical databases on the same database system on the same machine might still yield different
1459
+ statistical values. This is because database systems oftentimes create statistics from random samples of column
1460
+ values to speed up computation. However, such variability hurts our efforts to enable reproducible experiments
1461
+ since different performances metrics might not be due to differences in the optimization algorithms but due to bad
1462
+ luck when creating the statistics (whether it is a good sign if an algorithm is that fragile to deviations in
1463
+ statistics is another question). The second main challenge is that different database systems maintain different
1464
+ statistics. Even though many statistics are considered quite "basic" by the research community, not all systems
1465
+ developers deemed all statistics necessary for their optimizer. Once again, this can severely hinder the application
1466
+ of an optimization algorithm if it relies on a basic statistic that just happens to not be available on the desired
1467
+ target database system.
1468
+
1469
+ To address both of these issues, the statistics interface operates in two different modes: in *native* mode it
1470
+ simply delegates all requests to statistical information to the corresponding catalogs of the database systems.
1471
+ Alternatively, the statistics interface can create the illusion of a normalized and standardized statistics
1472
+ catalogue. This so-called *emulated* mode does not rely on the statistics catalogs and issues equivalent SQL
1473
+ queries instead. For example, if a statistic on the number of distinct values of a column is requested, this
1474
+ emulated by running a *SELECT COUNT(DISTINCT column) FROM table* query.
1475
+
1476
+ The current mode can be customized using the boolean `emulated` property. If the statistics interface operates in
1477
+ native mode (i.e. based on the actual statistics catalog) and the user requests a statistic that is not available
1478
+ in the selected database system, the behavior depends on another attribute: `enable_emulation_fallback`. If this
1479
+ boolean attribute is *True*, an emulated statistic will be calculated instead. Otherwise, an
1480
+ `UnsupportedDatabaseFeatureError` is raised.
1481
+
1482
+ Since the live computation of emulated statistics can be costly, the statistics interface has its own
1483
+ `cache_enabled` attribute. It can be set to `None` to use the default caching behavior of the database system.
1484
+ However, if this attribute is set to `True` or `False` directly, caching will be used accordingly for all
1485
+ compute-intensive statistics operations (and only such operations). Once again, this only works because PostBOUND
1486
+ assumes the database to be immutable.
1487
+
1488
+ Parameters
1489
+ ----------
1490
+ db : Database
1491
+ The database for which the schema information should be read. This is required to hook into the database cache
1492
+ and to obtain the cursors to actuall execute queries.
1493
+ emulated : bool, optional
1494
+ Whether the statistics interface should operate in emulation mode. To enable reproducibility, this is *True*
1495
+ by default
1496
+ enable_emulation_fallback : bool, optional
1497
+ Whether emulation should be used for unsupported statistics when running in native mode, by default True
1498
+ cache_enabled : Optional[bool], optional
1499
+ Whether emulated statistics queries should be subject to caching, by default True. Set to *None* to use the
1500
+ caching behavior of the `db`
1501
+
1502
+ See Also
1503
+ --------
1504
+ postbound.postbound.OptimizationPipeline : The basic optimization process applied by PostBOUND
1505
+ """
1506
+
1507
+ def __init__(
1508
+ self,
1509
+ db: Database,
1510
+ *,
1511
+ emulated: bool = True,
1512
+ enable_emulation_fallback: bool = True,
1513
+ cache_enabled: Optional[bool] = True,
1514
+ ) -> None:
1515
+ self.emulated = emulated
1516
+ self.enable_emulation_fallback = enable_emulation_fallback
1517
+ self.cache_enabled = cache_enabled
1518
+ self._db = db
1519
+
1520
+ def total_rows(
1521
+ self,
1522
+ table: TableReference,
1523
+ *,
1524
+ emulated: Optional[bool] = None,
1525
+ cache_enabled: Optional[bool] = None,
1526
+ ) -> Optional[int]:
1527
+ """Provides (an estimate of) the total number of rows in a table.
1528
+
1529
+ Parameters
1530
+ ----------
1531
+ table : TableReference
1532
+ The table to check
1533
+ emulated : Optional[bool], optional
1534
+ Whether to force emulation mode for this single call. Defaults to *None* which indicates that the
1535
+ emulation setting of the statistics interface should be used.
1536
+ cache_enabled : Optional[bool], optional
1537
+ Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
1538
+ setting of the statistics interface should be used.
1539
+
1540
+ Returns
1541
+ -------
1542
+ Optional[int]
1543
+ The total number of rows in the table. If no such statistic exists, but the database system in principle
1544
+ maintains the statistic, *None* is returned. For example, this situation can occur if the database system
1545
+ only maintains a row count if the table has at least a certain size and the table in question did not reach
1546
+ that size yet.
1547
+
1548
+ Raises
1549
+ ------
1550
+ VirtualTableError
1551
+ If the given table is virtual (e.g. subquery or CTE)
1552
+ """
1553
+ if table.virtual:
1554
+ raise VirtualTableError(table)
1555
+ if emulated or (emulated is None and self.emulated):
1556
+ return self._calculate_total_rows(
1557
+ table, cache_enabled=self._determine_caching_behavior(cache_enabled)
1558
+ )
1559
+ else:
1560
+ return self._retrieve_total_rows_from_stats(table)
1561
+
1562
+ def distinct_values(
1563
+ self,
1564
+ column: ColumnReference,
1565
+ *,
1566
+ emulated: Optional[bool] = None,
1567
+ cache_enabled: Optional[bool] = None,
1568
+ ) -> Optional[int]:
1569
+ """Provides (an estimate of) the total number of different column values of a specific column.
1570
+
1571
+ Parameters
1572
+ ----------
1573
+ column : ColumnReference
1574
+ The column to check
1575
+ emulated : Optional[bool], optional
1576
+ Whether to force emulation mode for this single call. Defaults to *None* which indicates that the
1577
+ emulation setting of the statistics interface should be used.
1578
+ cache_enabled : Optional[bool], optional
1579
+ Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
1580
+ setting of the statistics interface should be used.
1581
+
1582
+ Returns
1583
+ -------
1584
+ Optional[int]
1585
+ The number of distinct values in the column. If no such statistic exists, but the database system in
1586
+ principle maintains the statistic, *None* is returned. For example, this situation can occur if the
1587
+ database system only maintains a distinct value count if the column values are distributed in a
1588
+ sufficiently diverse way.
1589
+
1590
+ Raises
1591
+ ------
1592
+ postbound.qal.UnboundColumnError
1593
+ If the column is not associated with any table
1594
+ postbound.qal.VirtualTableError
1595
+ If the table associated with the column is a virtual table (e.g. subquery or CTE)
1596
+ """
1597
+ if not column.table:
1598
+ raise UnboundColumnError(column)
1599
+ elif column.table.virtual:
1600
+ raise VirtualTableError(column.table)
1601
+ if emulated or (emulated is None and self.emulated):
1602
+ return self._calculate_distinct_values(
1603
+ column, cache_enabled=self._determine_caching_behavior(cache_enabled)
1604
+ )
1605
+ else:
1606
+ return self._retrieve_distinct_values_from_stats(column)
1607
+
1608
+ def min_max(
1609
+ self,
1610
+ column: ColumnReference,
1611
+ *,
1612
+ emulated: Optional[bool] = None,
1613
+ cache_enabled: Optional[bool] = None,
1614
+ ) -> Optional[tuple[Any, Any]]:
1615
+ """Provides (an estimate of) the minimum and maximum values in a column.
1616
+
1617
+ Parameters
1618
+ ----------
1619
+ column : ColumnReference
1620
+ The column to check
1621
+ emulated : Optional[bool], optional
1622
+ Whether to force emulation mode for this single call. Defaults to *None* which indicates that the
1623
+ emulation setting of the statistics interface should be used.
1624
+ cache_enabled : Optional[bool], optional
1625
+ Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
1626
+ setting of the statistics interface should be used.
1627
+
1628
+ Returns
1629
+ -------
1630
+ Optional[tuple[Any, Any]]
1631
+ A tuple of minimum and maximum value. If no such statistic exists, but the database system in principle
1632
+ maintains the statistic, *None* is returned. For example, this situation can occur if thec database
1633
+ system only maintains the min/max value if they are sufficiently far apart.
1634
+
1635
+ Raises
1636
+ ------
1637
+ postbound.qal.UnboundColumnError
1638
+ If the column is not associated with any table
1639
+ postbound.qal.VirtualTableError
1640
+ If the table associated with the column is a virtual table (e.g. subquery or CTE)
1641
+ """
1642
+ if not column.table:
1643
+ raise UnboundColumnError(column)
1644
+ elif column.table.virtual:
1645
+ raise VirtualTableError(column.table)
1646
+ if emulated or (emulated is None and self.emulated):
1647
+ return self._calculate_min_max_values(
1648
+ column, cache_enabled=self._determine_caching_behavior(cache_enabled)
1649
+ )
1650
+ else:
1651
+ return self._retrieve_min_max_values_from_stats(column)
1652
+
1653
+ def most_common_values(
1654
+ self,
1655
+ column: ColumnReference,
1656
+ *,
1657
+ k: int = 10,
1658
+ emulated: Optional[bool] = None,
1659
+ cache_enabled: Optional[bool] = None,
1660
+ ) -> Sequence[tuple[Any, int]]:
1661
+ """Provides (an estimate of) the total number of occurrences of the `k` most frequent values of a column.
1662
+
1663
+ Parameters
1664
+ ----------
1665
+ column : ColumnReference
1666
+ The column to check
1667
+ k : int, optional
1668
+ The maximum number of most common values to return. Defaults to 10. If there are less values available, all
1669
+ of the available values will be returned.
1670
+ emulated : Optional[bool], optional
1671
+ Whether to force emulation mode for this single call. Defaults to *None* which indicates that the
1672
+ emulation setting of the statistics interface should be used.
1673
+ cache_enabled : Optional[bool], optional
1674
+ Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
1675
+ setting of the statistics interface should be used.
1676
+
1677
+ Returns
1678
+ -------
1679
+ Sequence[tuple[Any, int]]
1680
+ The most common values in pairs of (value, frequency), starting with the highest frequency. Notice that
1681
+ this sequence can be empty if no values are available. This can happen if the database system in principle
1682
+ maintains this statistic but does considers the value distribution to uniform to make the maintenance
1683
+ worthwhile. Likewise, if less common values exist than the requested `k` value, only the available values
1684
+ will be returned (and the sequence will be shorter than `k` in that case).
1685
+
1686
+ Raises
1687
+ ------
1688
+ postbound.qal.UnboundColumnError
1689
+ If the column is not associated with any table
1690
+ postbound.qal.VirtualTableError
1691
+ If the table associated with the column is a virtual table (e.g. subquery or CTE)
1692
+ """
1693
+ if not column.table:
1694
+ raise UnboundColumnError(column)
1695
+ elif column.table.virtual:
1696
+ raise VirtualTableError(column.table)
1697
+ if emulated or (emulated is None and self.emulated):
1698
+ return self._calculate_most_common_values(
1699
+ column, k, cache_enabled=self._determine_caching_behavior(cache_enabled)
1700
+ )
1701
+ else:
1702
+ return self._retrieve_most_common_values_from_stats(column, k)
1703
+
1704
+ def _calculate_total_rows(
1705
+ self, table: TableReference, *, cache_enabled: Optional[bool] = None
1706
+ ) -> int:
1707
+ """Retrieves the total number of rows of a table by issuing a *COUNT(\\*)* query against the live database.
1708
+
1709
+ The table is assumed to be non-virtual.
1710
+
1711
+ Parameters
1712
+ ----------
1713
+ table : TableReference
1714
+ The table to check
1715
+ cache_enabled : Optional[bool], optional
1716
+ Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
1717
+ setting of the statistics interface should be used.
1718
+
1719
+ Returns
1720
+ -------
1721
+ int
1722
+ The total number of rows in the table.
1723
+ """
1724
+ query_template = "SELECT COUNT(*) FROM {tab}".format(tab=table.full_name)
1725
+ return self._db.execute_query(
1726
+ query_template,
1727
+ cache_enabled=self._determine_caching_behavior(cache_enabled),
1728
+ )
1729
+
1730
+ def _calculate_distinct_values(
1731
+ self, column: ColumnReference, *, cache_enabled: Optional[bool] = None
1732
+ ) -> int:
1733
+ """Retrieves the number of distinct column values by issuing a *COUNT(\\*)* / *GROUP BY* query over that
1734
+ column against the live database.
1735
+
1736
+ The column is assumed to be bound to a (non-virtual) table.
1737
+
1738
+ Parameters
1739
+ ----------
1740
+ column : ColumnReference
1741
+ The column to check
1742
+ cache_enabled : Optional[bool], optional
1743
+ Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
1744
+ setting of the statistics interface should be used.
1745
+
1746
+ Returns
1747
+ -------
1748
+ int
1749
+ The number of distinct values in the column
1750
+ """
1751
+ query_template = "SELECT COUNT(DISTINCT {col}) FROM {tab}".format(
1752
+ col=column.name, tab=column.table.full_name
1753
+ )
1754
+ return self._db.execute_query(
1755
+ query_template,
1756
+ cache_enabled=self._determine_caching_behavior(cache_enabled),
1757
+ )
1758
+
1759
+ def _calculate_min_max_values(
1760
+ self, column: ColumnReference, *, cache_enabled: Optional[bool] = None
1761
+ ) -> tuple[Any, Any]:
1762
+ """Retrieves the minimum/maximum values in a column by issuing an aggregation query for that column against the
1763
+ live database.
1764
+
1765
+ The column is assumed to be bound to a (non-virtual) table.
1766
+
1767
+ Parameters
1768
+ ----------
1769
+ column : ColumnReference
1770
+ The column to check
1771
+ cache_enabled : Optional[bool], optional
1772
+ Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
1773
+ setting of the statistics interface should be used.
1774
+
1775
+ Returns
1776
+ -------
1777
+ tuple[Any, Any]
1778
+ A tuple of *(min, max)*
1779
+ """
1780
+ query_template = "SELECT MIN({col}), MAX({col}) FROM {tab}".format(
1781
+ col=column.name, tab=column.table.full_name
1782
+ )
1783
+ return self._db.execute_query(
1784
+ query_template,
1785
+ cache_enabled=self._determine_caching_behavior(cache_enabled),
1786
+ )
1787
+
1788
+ def _calculate_most_common_values(
1789
+ self, column: ColumnReference, k: int, *, cache_enabled: Optional[bool] = None
1790
+ ) -> Sequence[tuple[Any, int]]:
1791
+ """Retrieves the `k` most frequent values of a column along with their frequencies by issuing a query over that
1792
+ column against the live database.
1793
+
1794
+ The actual query combines a *COUNT(\\*)* aggregation, with a grouping over the column values, followed by a
1795
+ count-based ordering and limit.
1796
+
1797
+ The column is assumed to be bound to a (non-virtual) table.
1798
+
1799
+ Parameters
1800
+ ----------
1801
+ column : ColumnReference
1802
+ The column to check
1803
+ k : int
1804
+ The number of most frequent values to retrieve. If less values are available (because there are not as much
1805
+ distinct values in the column), the frequencies of all values is returned.
1806
+ cache_enabled : Optional[bool], optional
1807
+ Whether to enable result caching in emulation mode. Defaults to *None* which indicates that the caching
1808
+ setting of the statistics interface should be used.
1809
+
1810
+ Returns
1811
+ -------
1812
+ Sequence[tuple[Any, int]]
1813
+ The most common values in *(value, frequency)* pairs, ordered by largest frequency first. Can be smaller
1814
+ than the requested `k` value if the column contains less distinct values.
1815
+ """
1816
+ query_template = textwrap.dedent(
1817
+ """
1818
+ SELECT {col}, COUNT(*) AS n
1819
+ FROM {tab}
1820
+ GROUP BY {col}
1821
+ ORDER BY n DESC, {col}
1822
+ LIMIT {k}""".format(col=column.name, tab=column.table.full_name, k=k)
1823
+ )
1824
+ return self._db.execute_query(
1825
+ query_template,
1826
+ cache_enabled=self._determine_caching_behavior(cache_enabled),
1827
+ )
1828
+
1829
+ @abc.abstractmethod
1830
+ def _retrieve_total_rows_from_stats(self, table: TableReference) -> Optional[int]:
1831
+ """Queries the DBMS-internal metadata for the number of rows in a table.
1832
+
1833
+ The table is assumed to be non-virtual.
1834
+
1835
+ Parameters
1836
+ ----------
1837
+ table : TableReference
1838
+ The table to check
1839
+
1840
+ Returns
1841
+ -------
1842
+ Optional[int]
1843
+ The total number of rows in the table. If no such statistic exists, but the database system in principle
1844
+ maintains the statistic, *None* is returned. For example, this situation can occur if the database system
1845
+ only maintains a row count if the table has at least a certain size and the table in question did not reach
1846
+ that size yet.
1847
+ """
1848
+ raise NotImplementedError
1849
+
1850
+ @abc.abstractmethod
1851
+ def _retrieve_distinct_values_from_stats(
1852
+ self, column: ColumnReference
1853
+ ) -> Optional[int]:
1854
+ """Queries the DBMS-internal metadata for the number of distinct values of the column.
1855
+
1856
+ The column is assumed to be bound to a (non-virtual) table.
1857
+
1858
+ Parameters
1859
+ ----------
1860
+ column : ColumnReference
1861
+ The column to check
1862
+
1863
+ Returns
1864
+ -------
1865
+ Optional[int]
1866
+ The number of distinct values in the column. If no such statistic exists, but the database system in
1867
+ principle maintains the statistic, *None* is returned. For example, this situation can occur if the
1868
+ database system only maintains a distinct value count if the column values are distributed in a
1869
+ sufficiently diverse way.
1870
+ """
1871
+ raise NotImplementedError
1872
+
1873
+ @abc.abstractmethod
1874
+ def _retrieve_min_max_values_from_stats(
1875
+ self, column: ColumnReference
1876
+ ) -> Optional[tuple[Any, Any]]:
1877
+ """Queries the DBMS-internal metadata for the minimum / maximum value in a column.
1878
+
1879
+ The column is assumed to be bound to a (non-virtual) table.
1880
+
1881
+ Parameters
1882
+ ----------
1883
+ column : ColumnReference
1884
+ The column to check
1885
+
1886
+ Returns
1887
+ -------
1888
+ Optional[tuple[Any, Any]]
1889
+ A tuple of minimum and maximum value. If no such statistic exists, but the database system in principle
1890
+ maintains the statistic, *None* is returned. For example, this situation can occur if thec database
1891
+ system only maintains the min/max value if they are sufficiently far apart.
1892
+ """
1893
+ raise NotImplementedError
1894
+
1895
+ @abc.abstractmethod
1896
+ def _retrieve_most_common_values_from_stats(
1897
+ self, column: ColumnReference, k: int
1898
+ ) -> Sequence[tuple[Any, int]]:
1899
+ """Queries the DBMS-internal metadata for the `k` most common values of the `column`.
1900
+
1901
+ The column is assumed to be bound to a (non-virtual) table.
1902
+
1903
+ Parameters
1904
+ ----------
1905
+ column : ColumnReference
1906
+ The column to check
1907
+ k : int, optional
1908
+ The maximum number of most common values to return. Defaults to 10. If there are less values available, all
1909
+ of the available values will be returned.
1910
+
1911
+ Returns
1912
+ -------
1913
+ Sequence[tuple[Any, int]]
1914
+ The most common values in pairs of (value, frequency), starting with the highest frequency. Notice that
1915
+ this sequence can be empty if no values are available. This can happen if the database system in principle
1916
+ maintains this statistic but does considers the value distribution to uniform to make the maintenance
1917
+ worthwhile. Likewise, if less common values exist than the requested `k` value, only the available values
1918
+ will be returned (and the sequence will be shorter than `k` in that case).
1919
+ """
1920
+ raise NotImplementedError
1921
+
1922
+ def _determine_caching_behavior(
1923
+ self, local_cache_enabled: Optional[bool]
1924
+ ) -> Optional[bool]:
1925
+ """Utility to quickly figure out which caching behavior to use.
1926
+
1927
+ This method is intended to be called by the top-level methods that provide statistics and enable a selective
1928
+ caching which overwrites the caching behavior of the statistics interface.
1929
+
1930
+ Parameters
1931
+ ----------
1932
+ local_cache_enabled : Optional[bool]
1933
+ The caching setting selected by the callee / user.
1934
+
1935
+ Returns
1936
+ -------
1937
+ Optional[bool]
1938
+ Whether caching should be enabled or the determined by the actual database interface.
1939
+ """
1940
+ return (
1941
+ self.cache_enabled if local_cache_enabled is None else local_cache_enabled
1942
+ )
1943
+
1944
+ def __repr__(self) -> str:
1945
+ return str(self)
1946
+
1947
+ def __str__(self) -> str:
1948
+ return f"Database statistics of {self._db}"
1949
+
1950
+
1951
class HintWarning(UserWarning):
    """Custom warning category for hinting-related problems.

    The dedicated subclass exists so that clients can filter hinting-related warnings
    specifically (e.g. via the *warnings* module filters). Construction works exactly like
    for any other `UserWarning`: pass the warning message as the first argument.
    """

    # NOTE(review): the former explicit __init__ only delegated to super().__init__ and was
    # therefore removed; UserWarning already accepts the message argument.
1956
+
1957
+
1958
class HintService(abc.ABC):
    """Provides the necessary tools to generate system-specific query instances based on optimizer decisions.

    Hints are PostBOUND's way to enforce that decisions made in the optimization pipeline are respected by the native
    query optimizer once the query is executed in an actual database system. The general documentation provides much
    more information about why this is necessary and how PostBOUND approaches query optimization and query generation.

    Each database system has to implement this interface to be usable as part of an optimization pipeline.

    See Also
    --------
    OptimizationPipeline.optimize_query : For a general introduction into the query optimization process
    """

    @abc.abstractmethod
    def generate_hints(
        self,
        query: SqlQuery,
        plan: Optional[QueryPlan] = None,
        *,
        join_order: Optional[JoinTree] = None,
        physical_operators: Optional[PhysicalOperatorAssignment] = None,
        plan_parameters: Optional[PlanParameterization] = None,
    ) -> SqlQuery:
        """Transforms the input query such that the given optimization decisions are respected during query execution.

        In the most common case this involves building a `Hint` clause that encodes the optimization decisions in a
        system-specific way. However, depending on the concrete database system, this might also involve a
        restructuring of certain parts of the query, e.g. the usage of specific join statements, the introduction of
        non-standard SQL statements, or a reordering of the *FROM* clause.

        Notice that all optimization information is optional. If individual parameters are set to *None*, nothing
        has been enforced by PostBOUND's optimization process and the native optimizer of the database system should
        "fill the gaps".

        Implementations of this method are required to adhere to the requested operators for joins and scans as much
        as possible. However, there is no requirement to represent auxiliary nodes (e.g. sorts) if this is not
        possible or meaningful for the plan. As a rule of thumb, implementations should rate the integrity of the
        plan in the database higher than a perfect representation of the input data.

        Parameters
        ----------
        query : SqlQuery
            The query that should be transformed
        plan : Optional[QueryPlan], optional
            The query execution plan. If this is given, all other parameters should be *None*. This essentially
            enforces the given query plan.
        join_order : Optional[JoinTree], optional
            The sequence in which individual joins should be executed.
        physical_operators : Optional[PhysicalOperatorAssignment], optional
            The physical operators that should be used for the query execution. In addition to selecting specific
            operators for specific joins or scans, this can also include disabling certain operators for the entire
            query.
        plan_parameters : Optional[PlanParameterization], optional
            Additional parameters and metadata for the native optimizer of the database system. Probably the most
            important use-case of these parameters is the supply of cardinality estimates for different joins and
            scans. For example, these can be combined with a join order to influence the physical operators that the
            native optimizer chooses. Another scenario is to only supply such cardinality estimates and leave the
            `join_order` and `physical_operators` completely empty, which essentially simulates a different cardinality
            estimation algorithm for the query. Notice however, that in this scenario cardinality estimates for all
            possible intermediate results of the query have to be supplied. Otherwise, the native optimizer once
            again "fills the gaps" and uses its own estimates for the remaining intermediate results that it explores
            during plan enumeration. This would probably effectively break the estimation algorithm.

        Returns
        -------
        SqlQuery
            The transformed query. It contains all necessary information to enforce the optimization decisions as best
            as possible. Notice that whether the native optimizer of the database system is obliged to respect the
            optimization decisions depends on the specific system. For example, for MySQL hints are really just hints
            and the optimizer is only encouraged to use specific operators but not forced to do so.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def format_query(self, query: SqlQuery) -> str:
        """Transforms the query into a database-specific string, mostly to incorporate deviations from standard SQL.

        This method is necessary because the query abstraction layer is focused on modelling and unifying different
        parts of an SQL query. However, some database systems (cough .. MySQL .. cough) deviate from standard SQL
        syntax and express different parts of a query differently. The most prominent example are older versions of
        MySQL that used double quotes for string values rather than the SQL standard single quotes. Therefore, the
        `format_query` method takes an abstract representation of an SQL query as input and turns it into a string
        representation that accounts for all such deviations.

        Parameters
        ----------
        query : SqlQuery
            The query that should be adapted for the database system

        Returns
        -------
        str
            An equivalent notation of the query that incorporates system-specific deviations from standard SQL.
            Notice that this query possibly can no longer be parsed by the query abstraction layer. It is a one-way
            process.

        See Also
        --------
        postbound.qal : the query abstraction layer provided by PostBOUND
        """
        raise NotImplementedError

    @abc.abstractmethod
    def supports_hint(self, hint: PhysicalOperator | HintType) -> bool:
        """Checks whether the database system is capable of using the specified hint or operator.

        Parameters
        ----------
        hint : PhysicalOperator | HintType
            The hint/feature to check

        Returns
        -------
        bool
            Indicates whether the feature is supported by the specific database system.
        """
        raise NotImplementedError
2076
+
2077
+
2078
class OptimizerInterface(abc.ABC):
    """Provides high-level access to internal optimizer-related data for the database system.

    Each functionality is available through a dedicated method. Notice that not all database systems necessarily
    support all of these functions.
    """

    @abc.abstractmethod
    def query_plan(self, query: SqlQuery | str) -> QueryPlan:
        """Obtains the query execution plan for a specific query.

        This respects all hints that potentially influence the optimization process.

        Parameters
        ----------
        query : SqlQuery | str
            The input query

        Returns
        -------
        QueryPlan
            The corresponding execution plan. This will never be an *ANALYZE* plan, but contain as much meaningful
            information as can be derived for the specific database system (e.g. regarding cardinality and cost
            estimates)
        """
        raise NotImplementedError

    @abc.abstractmethod
    def analyze_plan(self, query: SqlQuery) -> QueryPlan:
        """Executes a specific query and provides the query execution plan supplemented with runtime information.

        This respects all hints that potentially influence the optimization process.

        Parameters
        ----------
        query : SqlQuery
            The input query

        Returns
        -------
        QueryPlan
            The corresponding execution plan. This plan will be an *ANALYZE* plan and contain all information that
            can be derived for the specific database system (e.g. cardinality estimates as well as true cardinality
            counts)
        """
        raise NotImplementedError

    @abc.abstractmethod
    def cardinality_estimate(self, query: SqlQuery | str) -> Cardinality:
        """Queries the DBMS query optimizer for its cardinality estimate, instead of executing the query.

        The cardinality estimate will correspond to the estimate for the final node. Therefore, running this method
        with aggregate queries is not particularly meaningful.

        Parameters
        ----------
        query : SqlQuery | str
            The input query

        Returns
        -------
        Cardinality
            The cardinality estimate of the native optimizer for the database system.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def cost_estimate(self, query: SqlQuery | str) -> Cost:
        """Queries the DBMS query optimizer for the estimated cost of executing the query.

        The cost estimate will correspond to the estimate for the final node. Typically, this cost includes the cost
        of all sub-operators as well.

        Parameters
        ----------
        query : SqlQuery | str
            The input query

        Returns
        -------
        Cost
            The cost estimate of the native optimizer for the database system.
        """
        raise NotImplementedError
2162
+
2163
+
2164
# Singleton storage backing `DatabasePool.get_instance()`; lazily populated on first access.
_DB_POOL: DatabasePool | None = None
"""Private variable that captures the current singleton instance of the `DatabasePool`."""
2166
+
2167
+
2168
class DatabasePool:
    """Global registry that lets different parts of the code base share database connections.

    The pool maps unique string keys to `Database` instances. New connections are registered
    under a key and retrieved via the same key later on. While only a single database is
    registered, it can also be fetched without a key through `current_database`.

    The pool is a singleton: obtain the shared instance via the static `get_instance` method
    and use all further functionality on that instance.

    References
    ----------

    .. Singleton pattern: https://en.wikipedia.org/wiki/Singleton_pattern
    """

    @staticmethod
    def get_instance() -> DatabasePool:
        """Returns the shared pool, lazily creating it on first access.

        Returns
        -------
        DatabasePool
            The current pool instance
        """
        global _DB_POOL
        if _DB_POOL is None:
            _DB_POOL = DatabasePool()
        return _DB_POOL

    def __init__(self):
        # Key -> database mapping; the pool's entire state lives here.
        self._pool: dict[str, Database] = {}

    def current_database(self) -> Database:
        """Returns the single database stored in the pool.

        Returns
        -------
        Database
            The only database in the pool

        Raises
        ------
        ValueError
            If the pool does not contain exactly one database
        """
        return util.dicts.value(self._pool)

    def register_database(self, key: str, db: Database) -> None:
        """Adds a database to the pool under the given key.

        Typically invoked by the connect methods of the individual database system
        implementations.

        Parameters
        ----------
        key : str
            A unique identifier under which the database can be retrieved
        db : Database
            The database to store
        """
        self._pool[key] = db

    def retrieve_database(self, key: str) -> Database:
        """Looks up the database registered under `key`.

        Parameters
        ----------
        key : str
            The key that was previously used to register the database

        Returns
        -------
        Database
            The corresponding database

        Raises
        ------
        KeyError
            If no database was registered under the given key.
        """
        return self._pool[key]

    def empty(self) -> bool:
        """Reports whether no databases are currently registered.

        Returns
        -------
        bool
            *True* if the pool is empty.
        """
        return not self._pool

    def clear(self) -> None:
        """Drops every registered database from the pool."""
        self._pool.clear()

    def __contains__(self, key: str) -> bool:
        return key in self._pool

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"DatabasePool {self._pool}"
2272
+
2273
+
2274
def current_database() -> Database:
    """Shortcut to fetch the single database registered in the global `DatabasePool`.

    Returns
    -------
    Database
        The current database instance. If there is not exactly one database in the pool, a `ValueError` is raised.

    See Also
    --------
    DatabasePool.current_database
    """
    pool = DatabasePool.get_instance()
    return pool.current_database()
2287
+
2288
+
2289
class UnsupportedDatabaseFeatureError(RuntimeError):
    """Raised when a requested feature is not available on the target database system.

    For example, PostgreSQL (at least up to version 15) does not capture minimum or maximum column values in its
    system statistics. Forcing the DBS to retrieve such information from its metadata can therefore result in this
    error.

    Parameters
    ----------
    database : Database
        The database that was requested to provide the problematic feature
    feature : str
        A textual description for the requested feature
    """

    def __init__(self, database: Database, feature: str) -> None:
        message = f"Database {database.system_name} does not support feature {feature}"
        super().__init__(message)
        # Retain both pieces of context so callers can inspect the failure programmatically.
        self.database = database
        self.feature = feature
2310
+
2311
+
2312
class DatabaseServerError(RuntimeError):
    """Raised when a database operation fails because of a server-side problem.

    The error was **not** due to a mistake in the user input (such as an SQL syntax error or access privilege
    violation), but an implementation issue instead (such as out of memory during query execution).

    Parameters
    ----------
    message : str, optional
        A textual description of the error, e.g. *out of memory*. Can be left empty by default.
    context : Optional[object], optional
        Additional context information for when the error occurred, e.g. the query that caused the error. Mainly
        intended for debugging purposes.
    """

    def __init__(self, message: str = "", context: Optional[object] = None) -> None:
        super().__init__(message)
        # Extra debugging payload (e.g. the offending query); not part of the error message itself.
        self.ctx = context
2330
+
2331
+
2332
class DatabaseUserError(RuntimeError):
    """Raised when a database operation fails because of a problem on the user's end.

    The error could be due to an SQL syntax error, access privilege violation, etc.

    Parameters
    ----------
    message : str, optional
        A textual description of the error, e.g. *no such table*. Can be left empty by default.
    context : Optional[object], optional
        Additional context information for when the error occurred, e.g. the query that caused the error. Mainly
        intended for debugging purposes.
    """

    def __init__(self, message: str = "", context: Optional[object] = None) -> None:
        super().__init__(message)
        # Extra debugging payload (e.g. the offending query); not part of the error message itself.
        self.ctx = context