PostBOUND 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- postbound/__init__.py +211 -0
- postbound/_base.py +6 -0
- postbound/_bench.py +1012 -0
- postbound/_core.py +1153 -0
- postbound/_hints.py +1373 -0
- postbound/_jointree.py +1079 -0
- postbound/_pipelines.py +1121 -0
- postbound/_qep.py +1986 -0
- postbound/_stages.py +876 -0
- postbound/_validation.py +734 -0
- postbound/db/__init__.py +72 -0
- postbound/db/_db.py +2348 -0
- postbound/db/_duckdb.py +785 -0
- postbound/db/mysql.py +1195 -0
- postbound/db/postgres.py +4216 -0
- postbound/experiments/__init__.py +12 -0
- postbound/experiments/analysis.py +674 -0
- postbound/experiments/benchmarking.py +54 -0
- postbound/experiments/ceb.py +877 -0
- postbound/experiments/interactive.py +105 -0
- postbound/experiments/querygen.py +334 -0
- postbound/experiments/workloads.py +980 -0
- postbound/optimizer/__init__.py +92 -0
- postbound/optimizer/__init__.pyi +73 -0
- postbound/optimizer/_cardinalities.py +369 -0
- postbound/optimizer/_joingraph.py +1150 -0
- postbound/optimizer/dynprog.py +1825 -0
- postbound/optimizer/enumeration.py +432 -0
- postbound/optimizer/native.py +539 -0
- postbound/optimizer/noopt.py +54 -0
- postbound/optimizer/presets.py +147 -0
- postbound/optimizer/randomized.py +650 -0
- postbound/optimizer/tonic.py +1479 -0
- postbound/optimizer/ues.py +1607 -0
- postbound/qal/__init__.py +343 -0
- postbound/qal/_qal.py +9678 -0
- postbound/qal/formatter.py +1089 -0
- postbound/qal/parser.py +2344 -0
- postbound/qal/relalg.py +4257 -0
- postbound/qal/transform.py +2184 -0
- postbound/shortcuts.py +70 -0
- postbound/util/__init__.py +46 -0
- postbound/util/_errors.py +33 -0
- postbound/util/collections.py +490 -0
- postbound/util/dataframe.py +71 -0
- postbound/util/dicts.py +330 -0
- postbound/util/jsonize.py +68 -0
- postbound/util/logging.py +106 -0
- postbound/util/misc.py +168 -0
- postbound/util/networkx.py +401 -0
- postbound/util/numbers.py +438 -0
- postbound/util/proc.py +107 -0
- postbound/util/stats.py +37 -0
- postbound/util/system.py +48 -0
- postbound/util/typing.py +35 -0
- postbound/vis/__init__.py +5 -0
- postbound/vis/fdl.py +69 -0
- postbound/vis/graphs.py +48 -0
- postbound/vis/optimizer.py +538 -0
- postbound/vis/plots.py +84 -0
- postbound/vis/tonic.py +70 -0
- postbound/vis/trees.py +105 -0
- postbound-0.19.0.dist-info/METADATA +355 -0
- postbound-0.19.0.dist-info/RECORD +67 -0
- postbound-0.19.0.dist-info/WHEEL +5 -0
- postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
- postbound-0.19.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1150 @@
|
|
|
1
|
+
"""Provides an implementation of a dynamic join graph, as well as some related objects."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import collections
|
|
6
|
+
import copy
|
|
7
|
+
from collections.abc import Callable, Collection, Iterable, Iterator, Mapping
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Literal, Optional
|
|
10
|
+
|
|
11
|
+
import networkx as nx
|
|
12
|
+
|
|
13
|
+
from .. import util
|
|
14
|
+
from .._core import ColumnReference, DBCatalog, TableReference
|
|
15
|
+
from ..qal import transform
|
|
16
|
+
from ..qal._qal import (
|
|
17
|
+
AbstractPredicate,
|
|
18
|
+
CompoundPredicate,
|
|
19
|
+
ImplicitSqlQuery,
|
|
20
|
+
determine_join_equivalence_classes,
|
|
21
|
+
generate_predicates_for_equivalence_classes,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True)
class JoinPath:
    """Describes a single join between two tables of a query.

    By convention, one of the tables already belongs to an intermediate result whereas the other table is still free and
    should be joined next. This convention is not enforced by the class.

    Attributes
    ----------
    start_table : TableReference
        The first join partner. By convention, this is the table that is already part of an intermediate result.
    target_table : TableReference
        The second join partner. By convention, this is the table that is not yet part of any intermediate result and that
        should be joined next.
    join_condition : Optional[AbstractPredicate], optional
        The predicate that joins `target_table` with the current intermediate result. Usually the predicate only covers
        the join between `start_table` and `target_table`, but it may also include join predicates over other tables of
        the intermediate result. Defaults to ``None``.
    """

    start_table: TableReference
    target_table: TableReference
    join_condition: Optional[AbstractPredicate] = None

    def tables(self) -> tuple[TableReference, TableReference]:
        """Provides the two join partners.

        Returns
        -------
        tuple[TableReference, TableReference]
            The pair ``(start_table, target_table)``

        Warnings
        --------
        Only `start_table` and `target_table` are reported. Tables that are merely referenced by the `join_condition` are
        not included.
        """
        return (self.start_table, self.target_table)

    def spans_table(self, table: TableReference) -> bool:
        """Checks, whether a table is the start table or the target table of this path.

        Parameters
        ----------
        table : TableReference
            The table to check

        Returns
        -------
        bool
            Whether the table is one of the two join partners. Tables that only appear in the `join_condition` are not
            considered.
        """
        if table == self.start_table:
            return True
        return table == self.target_table

    def flip_direction(self) -> JoinPath:
        """Creates a new join path in which start table and target table are swapped.

        Returns
        -------
        JoinPath
            The reversed path. The join condition is carried over unchanged.
        """
        return JoinPath(
            start_table=self.target_table,
            target_table=self.start_table,
            join_condition=self.join_condition,
        )

    def __repr__(self) -> str:
        # the debug representation is the same as the human-readable one
        return self.__str__()

    def __str__(self) -> str:
        first, second = self.start_table, self.target_table
        return f"{first} ⋈ {second}"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class IndexInfo:
    """Captures whether a column is indexed and whether that index can still be used.

    Index information is mutable: calling `invalidate` marks the index as no longer usable, e.g. because the column has
    become part of an intermediate result.

    Only single-column indexes are modelled. The concrete data structure of the index is not recorded.

    Parameters
    ----------
    column : ColumnReference
        The column for which the index information is kept
    index_type : Literal["primary", "secondary", "none"]
        The kind of index defined on the column. ``"none"`` means that no index exists at all, which is different from an
        index that exists but cannot be used (see `invalid`).
    invalid : bool, optional
        Whether the index can no longer be used. Defaults to ``False``, i.e. the index is usable.
    """

    def __init__(
        self,
        column: ColumnReference,
        index_type: Literal["primary", "secondary", "none"],
        invalid: bool = False,
    ) -> None:
        self._column = column
        self._index_type = index_type
        self._is_invalid = invalid

    @staticmethod
    def primary_index(column: ColumnReference) -> IndexInfo:
        """Creates index information for a primary key index.

        Parameters
        ----------
        column : ColumnReference
            The indexed column

        Returns
        -------
        IndexInfo
            The index information, initialized as a valid index.
        """
        return IndexInfo(column, index_type="primary")

    @staticmethod
    def secondary_index(column: ColumnReference) -> IndexInfo:
        """Creates index information for a secondary index (e.g. an index on a foreign key column).

        Parameters
        ----------
        column : ColumnReference
            The indexed column

        Returns
        -------
        IndexInfo
            The index information, initialized as a valid index.
        """
        return IndexInfo(column, index_type="secondary")

    @staticmethod
    def no_index(column: ColumnReference) -> IndexInfo:
        """Creates index information that indicates the absence of an index.

        Parameters
        ----------
        column : ColumnReference
            A column that does not have any index

        Returns
        -------
        IndexInfo
            The index information
        """
        return IndexInfo(column, index_type="none")

    @staticmethod
    def generate_for(column: ColumnReference, db_schema: DBCatalog) -> IndexInfo:
        """Determines the index information of a column based on the database schema.

        Parameters
        ----------
        column : ColumnReference
            The column to look up
        db_schema : DBCatalog
            The schema that is asked whether the column is a primary key or has a secondary index

        Returns
        -------
        IndexInfo
            Primary index information if the schema reports the column as primary key, secondary index information if
            the schema reports a secondary index, and "no index" information otherwise.

        Notes
        -----
        Errors raised by the schema lookups are not handled here and propagate to the caller.
        """
        # the primary key check takes precedence over the secondary index check
        if db_schema.is_primary_key(column):
            return IndexInfo.primary_index(column)
        if db_schema.has_secondary_index(column):
            return IndexInfo.secondary_index(column)
        return IndexInfo.no_index(column)

    @property
    def column(self) -> ColumnReference:
        """Get the column to which the index information belongs.

        Returns
        -------
        ColumnReference
            The column
        """
        return self._column

    @property
    def index_type(self) -> Literal["primary", "secondary", "none"]:
        """Get the kind of index that is defined on the column.

        The index type says nothing about whether the index is still usable, see `is_invalid`.

        Returns
        -------
        str
            The index type. Can be *primary*, *secondary* or *none*.
        """
        return self._index_type

    @property
    def is_invalid(self) -> bool:
        """Get whether the index has been marked as unusable.

        This flag has to be combined with `index_type` to decide whether an index can be used right now. `is_primary`,
        `is_secondary` and `is_indexed` perform that combined check.

        Returns
        -------
        bool
            ``True`` if the index has been invalidated. If there is no index on the column, the flag carries no
            meaningful information.
        """
        return self._is_invalid

    def is_primary(self) -> bool:
        """Checks, whether this is a primary key index that has not been invalidated.

        Returns
        -------
        bool
            Whether the index type is *primary* and the index is still valid.
        """
        return self._index_type == "primary" and not self._is_invalid

    def is_secondary(self) -> bool:
        """Checks, whether this is a secondary index that has not been invalidated.

        Returns
        -------
        bool
            Whether the index type is *secondary* and the index is still valid.
        """
        return self._index_type == "secondary" and not self._is_invalid

    def is_indexed(self) -> bool:
        """Checks, whether there is any valid index on the column, be it primary or secondary.

        Returns
        -------
        bool
            Whether `is_primary` or `is_secondary` holds.
        """
        if self.is_primary():
            return True
        return self.is_secondary()

    def can_pk_fk_join(self, other: IndexInfo) -> bool:
        """Checks, whether two columns can be joined via a primary key/foreign key join.

        The direction of the join is not restricted: either column may act as the primary key. No datatype checks are
        performed.

        Parameters
        ----------
        other : IndexInfo
            The index information of the other column that should participate in the join

        Returns
        -------
        bool
            ``True`` if both columns have a valid index and the two indexes are not both secondary indexes.
        """
        both_indexed = self.is_indexed() and other.is_indexed()
        if not both_indexed:
            return False

        # Both sides are validly indexed here, so as long as they are not both secondary indexes, at least one
        # primary key index is involved.
        both_secondary = self.is_secondary() and other.is_secondary()
        return not both_secondary

    def invalidate(self) -> None:
        """Marks the index as invalid.

        This models the lifecycle of an index during the execution of a single query: once the table of the column is
        part of an intermediate result, the index can typically no longer be used to look up tuples.
        """
        self._is_invalid = True

    def __repr__(self) -> str:
        return self.__str__()

    def __str__(self) -> str:
        # the invalid marker is only shown for indexes that actually exist
        if self._index_type == "none":
            return f"NO INDEX({self._column})"

        invalid_state = " INVALID" if self._is_invalid else ""
        if self._index_type == "primary":
            return f"PRIMARY INDEX({self._column}){invalid_state}"
        return f"SECONDARY INDEX({self._column}){invalid_state}"
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
@dataclass(frozen=True)
class TableInfo:
    """Immutable snapshot of the state of a single table in the join graph.

    Attributes
    ----------
    free : bool
        Whether the table is still *free*, i.e. it is not part of any intermediate join result.
    index_info : Collection[IndexInfo]
        Index information for the columns of the table. A column that is missing from this collection either has no
        index, or is not relevant for the current join graph (e.g. because it does not appear in any join predicate).
    """

    free: bool
    index_info: Collection[IndexInfo]
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
# Maps an (unordered) set of tables to the list of predicates collected for exactly these tables.
_PredicateMap = collections.defaultdict[frozenset[TableReference], list[AbstractPredicate]]
"""Type alias for an (internally used) predicate map"""
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
class JoinGraph(Mapping[TableReference, TableInfo]):
|
|
361
|
+
"""The join graph models the connection between different tables in a query.
|
|
362
|
+
|
|
363
|
+
All tables that are referenced in the query are represented as the nodes in the graph. If two tables are joined via a join
|
|
364
|
+
predicate in the SQL query, they will be linked with an edge in the join graph. This graph is further annotated by the join
|
|
365
|
+
predicate. Additionally, the join graph stores index information for each of the relevant columns in the query.
|
|
366
|
+
|
|
367
|
+
In contrast to many other types in PostBOUND, a join graph is a mutable structure. It also models the current state of the
|
|
368
|
+
optimizer once specific tables have been included in an intermediate join result.
|
|
369
|
+
|
|
370
|
+
The wording of the different join graph methods distinguishes three states of joins (and correspondingly tables):
|
|
371
|
+
|
|
372
|
+
- a join might be *free*, if at least one of the corresponding tables have not been marked as joined, yet
|
|
373
|
+
- a join might be *available*, if it is *free* and one of the tables is already included in some intermediate join
|
|
374
|
+
- a join is *consumed*, if it is no longer *free*. This occurs once the partner tables have both been marked as joined.
|
|
375
|
+
|
|
376
|
+
A further distinction is made between n:m joins and primary key/foreign key joins. Information about the available joins of
|
|
377
|
+
each type can be queried easily and many methods are available in two variants: one that includes all possible joins and
|
|
378
|
+
one that is only focused on primary key/foreign key joins. To determine the precise join types, the join graph needs to
|
|
379
|
+
access the database schema. A n:m join is one where the column values of both join partners can appear an arbitrary number
|
|
380
|
+
of times, corresponding to an n:m relation between the two tables.
|
|
381
|
+
|
|
382
|
+
By calling the `mark_joined` method, the state of individual joins and their corresponding tables might change. This also
|
|
383
|
+
means that former primary key/foreign key joins might become n:m joins (which is the case exactly when the primary key
|
|
384
|
+
table is inserted into an intermediate join result).
|
|
385
|
+
|
|
386
|
+
Parameters
|
|
387
|
+
----------
|
|
388
|
+
query : ImplicitSqlQuery
|
|
389
|
+
The query for which the join graph should be generated
|
|
390
|
+
db_schema : Optional[DBCatalog], optional
|
|
391
|
+
The schema of the database on which the query should be executed. If this is ``None``, the database schema is inferred
|
|
392
|
+
based on the `DatabasePool`.
|
|
393
|
+
include_predicate_equivalence_classes : bool, optional
|
|
394
|
+
Whether predicates from the same equivalence class should be added to the join graph as well, even if they do not exist
|
|
395
|
+
in the original query. Consider two join conditions *a = b* and *b = c*. From these conditions, it follows that *a = c*
|
|
396
|
+
by the transitive property. If `include_predicate_equivalence_classes` is true, that last predicate will also be added
|
|
397
|
+
to the join graph.
|
|
398
|
+
|
|
399
|
+
Warnings
|
|
400
|
+
--------
|
|
401
|
+
If predicate equivalence classes are used, the optimization algorithm can potentially generate queries that contain
|
|
402
|
+
additional predicates that were not present in the original query.
|
|
403
|
+
"""
|
|
404
|
+
|
|
405
|
+
def __init__(
    self,
    query: ImplicitSqlQuery,
    db_schema: Optional[DBCatalog] = None,
    *,
    include_predicate_equivalence_classes: bool = False,
) -> None:
    """Builds the join graph for a query.

    Parameters
    ----------
    query : ImplicitSqlQuery
        The query whose tables become the nodes and whose binary join predicates become the edges of the graph
    db_schema : Optional[DBCatalog], optional
        The schema used to look up index information. If omitted, the schema of the current database of the
        `DatabasePool` is used.
    include_predicate_equivalence_classes : bool, optional
        Whether predicates derived from the join equivalence classes should be added to the graph in addition to the
        predicates of the query.
    """
    if db_schema is None:
        from .. import db  # local import to avoid circular dependencies

        db_schema = db.DatabasePool.get_instance().current_database().schema()

    self.query = query
    self._db_schema = db_schema
    self._index_structures: dict[ColumnReference, IndexInfo] = {}

    graph = nx.Graph()
    graph.add_nodes_from(query.tables(), free=True)

    join_predicates = query.predicates().joins()
    if include_predicate_equivalence_classes:
        join_equivalence_classes = determine_join_equivalence_classes(
            join_predicates
        )
        equivalence_class_predicates = generate_predicates_for_equivalence_classes(
            join_equivalence_classes
        )
        join_predicates = set(join_predicates) | equivalence_class_predicates

    # group all binary join predicates by the (unordered) pair of tables they connect
    predicate_map: _PredicateMap = collections.defaultdict(list)
    for join_predicate in join_predicates:
        columns = join_predicate.columns()
        if len(columns) != 2:
            # only predicates over exactly two columns are represented as edges
            continue
        first_col, second_col = columns
        if first_col.table == second_col.table:
            # Both columns belong to the same table. Such a predicate cannot form an edge between two tables and
            # would break the two-table unpacking below, so it is skipped like other non-binary predicates.
            continue
        predicate_map[frozenset([first_col.table, second_col.table])].append(
            join_predicate
        )

    edges = []
    for tables, joins in predicate_map.items():
        first_tab, second_tab = tables
        join_predicate = CompoundPredicate.create_and(joins)
        edges.append((first_tab, second_tab, {"predicate": join_predicate}))
        for column in join_predicate.columns():
            # a column can take part in several edges, but its index information only needs to be looked up once
            if column not in self._index_structures:
                self._index_structures[column] = IndexInfo.generate_for(
                    column, db_schema
                )

    graph.add_edges_from(edges)
    self._graph = graph
|
|
453
|
+
|
|
454
|
+
def initial(self) -> bool:
    """Checks, whether the join graph is still unmodified.

    Returns
    -------
    bool
        ``True`` if every node of the graph still carries a truthy ``free`` attribute, i.e. no table has been marked as
        joined, yet.
    """
    for __, still_free in self._graph.nodes.data("free"):
        if not still_free:
            return False
    return True
|
|
463
|
+
|
|
464
|
+
def contains_cross_products(self) -> bool:
    """Checks, whether there are any cross products in the input query.

    A cross product is a join between tables without a restricting join predicate. This is only the case if those
    tables are also not linked via a sequence of join predicates with other tables, i.e. if the graph is not connected.

    Returns
    -------
    bool
        Whether the join graph contains at least one cross product. A graph without any tables contains none.
    """
    if len(self._graph) == 0:
        # networkx refuses to decide connectivity for a graph without nodes (NetworkXPointlessConcept).
        # Without tables there is nothing that could form a cross product.
        return False
    return not nx.is_connected(self._graph)
|
|
476
|
+
|
|
477
|
+
def contains_free_tables(self) -> bool:
    """Checks, whether at least one free table remains in the graph.

    Returns
    -------
    bool
        ``True`` if any node of the graph still carries a truthy ``free`` attribute
    """
    for __, still_free in self._graph.nodes.data("free"):
        if still_free:
            return True
    return False
|
|
486
|
+
|
|
487
|
+
def contains_free_n_m_joins(self) -> bool:
    """Checks, whether at least one more n:m join remains in the graph.

    An edge is inspected if it is an available join, or if the graph is still in its initial state. An inspected edge
    counts as n:m join as soon as one of the column pairs of its predicate cannot be joined via a primary key/foreign
    key join according to the stored index information.

    Returns
    -------
    bool
        Whether such an edge exists
    """
    graph_untouched = self.initial()
    indexes = self._index_structures
    for left_tab, right_tab, condition in self._graph.edges.data("predicate"):
        if not (self.is_available_join(left_tab, right_tab) or graph_untouched):
            continue

        if any(
            not indexes[left_col].can_pk_fk_join(indexes[right_col])
            for left_col, right_col in condition.join_partners()
        ):
            return True
    return False
|
|
505
|
+
|
|
506
|
+
def count_consumed_tables(self) -> int:
    """Determines the number of tables that have been joined already.

    The result is 0 for a join graph in its initial state and 1 if only a single table has been marked as joined so far.

    Returns
    -------
    int
        The number of nodes whose ``free`` attribute is falsy
    """
    return sum(
        1 for __, still_free in self._graph.nodes.data("free") if not still_free
    )
|
|
520
|
+
|
|
521
|
+
def join_components(self) -> Iterable[JoinGraph]:
    """Provides all connected components of the join graph.

    Tables from different components are not linked by any edge, so the components correspond to the parts of the query
    that have to be combined via a cross product.

    Returns
    -------
    Iterable[JoinGraph]
        One new join graph per connected component. Each graph is built from the fragment of the query that covers the
        tables of the component and uses the same database schema as this graph.
    """
    return [
        JoinGraph(
            transform.extract_query_fragment(self.query, component_tables),
            self._db_schema,
        )
        for component_tables in nx.connected_components(self._graph)
    ]
|
|
538
|
+
|
|
539
|
+
def joined_tables(self) -> frozenset[TableReference]:
    """Provides all non-free tables in the join graph.

    Returns
    -------
    frozenset[TableReference]
        The tables of this graph for which `is_free_table` does not hold, i.e. the joined / consumed tables.
    """
    consumed = set()
    for table in self:
        if self.is_free_table(table):
            continue
        consumed.add(table)
    return frozenset(consumed)
|
|
548
|
+
|
|
549
|
+
def free_tables(self) -> frozenset[TableReference]:
    """Provides all tables that have not been joined, yet.

    Returns
    -------
    frozenset[TableReference]
        The tables of this graph for which `is_free_table` holds.
    """
    return frozenset(filter(self.is_free_table, self))
|
|
558
|
+
|
|
559
|
+
def all_joins(self) -> Iterable[tuple[TableReference, TableReference]]:
    """Provides all edges in the join graph, no matter whether they are available or not.

    Returns
    -------
    Iterable[tuple[TableReference, TableReference]]
        The edges of the graph as a new list. Which table ends up as first or second component of a tuple is arbitrary.
    """
    return [join for join in self._graph.edges]
|
|
568
|
+
|
|
569
|
+
def available_join_paths(
    self, *, both_directions_on_initial: bool = False
) -> Iterable[JoinPath]:
    """Provides all joins that can be executed in the current join graph.

    For a graph in its initial state (no table joined yet), every edge is reported. Otherwise, only edges with exactly
    one joined and one free table are reported. In that case the joined table becomes the start table and the free
    table becomes the target table of the path.

    Parameters
    ----------
    both_directions_on_initial : bool, optional
        Whether to report *S* -> *R* in addition to *R* -> *S* for each edge of an initial join graph. Has no effect once
        the graph has been modified.

    Returns
    -------
    Iterable[JoinPath]
        All possible joins in the current graph.
    """
    annotated_edges = self._graph.edges.data("predicate")
    join_paths = []

    if self.initial():
        for first_tab, second_tab, condition in annotated_edges:
            forward_path = JoinPath(first_tab, second_tab, condition)
            join_paths.append(forward_path)
            if both_directions_on_initial:
                join_paths.append(forward_path.flip_direction())
        return join_paths

    for first_tab, second_tab, condition in annotated_edges:
        first_is_free = bool(self.is_free_table(first_tab))
        second_is_free = bool(self.is_free_table(second_tab))
        if first_is_free == second_is_free:
            # either both tables are still free, or both are already joined -> no path
            continue

        # the joined table always acts as the start of the path
        if first_is_free:
            start_tab, target_tab = second_tab, first_tab
        else:
            start_tab, target_tab = first_tab, second_tab
        join_paths.append(JoinPath(start_tab, target_tab, condition))

    return join_paths
|
|
617
|
+
|
|
618
|
+
def available_n_m_join_paths(
    self, *, both_directions_on_initial: bool = False
) -> Iterable[JoinPath]:
    """Provides exactly those join paths from `available_join_paths` that correspond to n:m joins.

    A path is treated as n:m join if `is_pk_fk_join` holds in neither direction for its two tables.

    Parameters
    ----------
    both_directions_on_initial : bool, optional
        Whether to report *S* -> *R* in addition to *R* -> *S* for each n:m join while the join graph is still in its
        initial state.

    Returns
    -------
    Iterable[JoinPath]
        The available n:m joins
    """
    n_m_paths = []
    for candidate in self.available_join_paths():
        first_tab, second_tab = candidate.start_table, candidate.target_table
        if self.is_pk_fk_join(first_tab, second_tab) or self.is_pk_fk_join(
            second_tab, first_tab
        ):
            # a primary key/foreign key join in either direction rules the path out
            continue

        n_m_paths.append(candidate)
        if both_directions_on_initial and self.initial():
            n_m_paths.append(candidate.flip_direction())
    return n_m_paths
|
|
647
|
+
|
|
648
|
+
def available_join_paths_for(self, table: TableReference) -> Iterable[JoinPath]:
    """Lists the joins that can currently be performed with a specific table.

    Every neighbor of the table in the join graph is tested with `is_available_join`. For an initial join graph this
    only requires a join predicate between the two tables. Otherwise, exactly one of the two tables has to be free
    while the other one has to be consumed already.

    Parameters
    ----------
    table : TableReference
        The table that should be joined

    Returns
    -------
    Iterable[JoinPath]
        One join path per available neighbor, no matter whether the join is an n:m join or a primary key/foreign key
        join. The given table is always the start table, the neighbor is the target table.

    Raises
    ------
    ValueError
        If the table is not part of the join graph
    """
    self._assert_contains_table(table)

    paths = []
    for neighbor, edge_data in self._graph.adj[table].items():
        if not self.is_available_join(table, neighbor):
            continue
        paths.append(JoinPath(table, neighbor, edge_data["predicate"]))
    return paths
|
|
672
|
+
|
|
673
|
+
def nx_graph(self) -> nx.Graph:
    """Hands out a copy of the graph that backs this join graph.

    Returns
    -------
    nx.Graph
        A deep copy of the raw join graph. Changes to the copy are not reflected in this join graph.
    """
    graph_copy = copy.deepcopy(self._graph)
    return graph_copy
|
|
682
|
+
|
|
683
|
+
def is_free_table(self, table: TableReference) -> bool:
    """Looks up whether a specific table has not been joined yet.

    The lookup fails with an error if the table is not a node of the join graph.

    Parameters
    ----------
    table : TableReference
        The table to check

    Returns
    -------
    bool
        The value of the ``"free"`` marker that is stored for the table
    """
    node_data = self._graph.nodes[table]
    return node_data["free"]
|
|
699
|
+
|
|
700
|
+
def joins_tables(
    self, first_table: TableReference, second_table: TableReference
) -> bool:
    """Tests for the existence of a join edge between two tables.

    Whether the join can currently be executed is irrelevant for this check, see `is_available_join` for that.

    Parameters
    ----------
    first_table : TableReference
        The first join partner
    second_table : TableReference
        The second join partner

    Returns
    -------
    bool
        Whether the join graph contains an edge for the two tables. Availability of the join is not considered.
    """
    edge = (first_table, second_table)
    return edge in self._graph.edges
|
|
721
|
+
|
|
722
|
+
def is_available_join(
    self, first_table: TableReference, second_table: TableReference
) -> bool:
    """Tests whether two tables can be joined in the current state of the join graph.

    The tables need to be connected by a join edge. In addition, either exactly one of the tables has to be free
    (and the other one consumed), or the join graph has to be in its initial state.

    Parameters
    ----------
    first_table : TableReference
        The first join partner
    second_table : TableReference
        The second join partner

    Returns
    -------
    bool
        Whether a join edge exists between the tables and the join is still available. The order of the two tables
        does not influence the availability check.
    """
    nodes = self._graph.nodes
    first_free = nodes[first_table]["free"]
    second_free = nodes[second_table]["free"]

    has_edge = self.joins_tables(first_table, second_table)

    # the initial state is only consulted if the tables share the same status
    exactly_one_free = bool(first_free) != bool(second_free)
    available = exactly_one_free or self.initial()
    return has_edge and available
|
|
754
|
+
|
|
755
|
+
def is_pk_fk_join(self, fk_table: TableReference, pk_table: TableReference) -> bool:
    """Tests whether two tables are connected by a primary key/foreign key join in the given direction.

    The join does not have to be available for this check. Each base predicate of the join edge is inspected: the
    join counts as a pk/fk join as soon as one base predicate uses an indexed column of the `fk_table` together with
    a primary key column of the `pk_table`.

    Parameters
    ----------
    fk_table : TableReference
        The table that should act as the foreign key side
    pk_table : TableReference
        The table that should act as the primary key side

    Returns
    -------
    bool
        Whether the tables are joined and the join is a primary key/foreign key join with the requested direction

    Warnings
    --------
    Only (conjunctions of) binary join predicates are supported, i.e. each base predicate must reference exactly one
    column per table. Other predicates presumably make `util.simplify` fail.
    """
    if not self.joins_tables(fk_table, pk_table):
        return False

    edge_predicate: AbstractPredicate = self._graph.edges[fk_table, pk_table][
        "predicate"
    ]
    for component in edge_predicate.base_predicates():
        referencing_col = util.simplify(component.columns_of(fk_table))
        referenced_col = util.simplify(component.columns_of(pk_table))

        if not self._index_structures[referencing_col].is_indexed():
            continue
        if self._index_structures[referenced_col].is_primary():
            return True
    return False
|
|
793
|
+
|
|
794
|
+
def is_n_m_join(
    self, first_table: TableReference, second_table: TableReference
) -> bool:
    """Tests whether two tables are connected by an n:m join.

    A join is considered n:m if a join edge exists between the tables, but `is_pk_fk_join` rejects both directions
    of the join. The join does not have to be available for this check.

    Parameters
    ----------
    first_table : TableReference
        The first join partner
    second_table : TableReference
        The second join partner

    Returns
    -------
    bool
        Whether the join between the given tables is an n:m join

    Warnings
    --------
    The restrictions of `is_pk_fk_join` regarding the supported join predicates apply to this check as well.
    """
    if not self.joins_tables(first_table, second_table):
        return False
    if self.is_pk_fk_join(first_table, second_table):
        return False
    return not self.is_pk_fk_join(second_table, first_table)
|
|
823
|
+
|
|
824
|
+
def available_pk_fk_joins_for(self, fk_table: TableReference) -> Iterable[JoinPath]:
    """Lists the primary key/foreign key joins in which a specific table acts as the foreign key.

    The joins are not required to be available in the sense of `is_available_join`. It suffices that at least one
    of the two join partners is still free.

    Parameters
    ----------
    fk_table : TableReference
        The foreign key table. This will be the start table in all join paths.

    Returns
    -------
    Iterable[JoinPath]
        All matching join paths. Each path starts at the foreign key table and targets the primary key table.

    Raises
    ------
    ValueError
        If the table is not part of the join graph
    """
    self._assert_contains_table(fk_table)

    paths = []
    for pk_candidate, edge_data in self._graph.adj[fk_table].items():
        if not self.is_pk_fk_join(fk_table, pk_candidate):
            continue
        if self.is_free_table(fk_table) or self.is_free_table(pk_candidate):
            paths.append(JoinPath(fk_table, pk_candidate, edge_data["predicate"]))
    return paths
|
|
847
|
+
|
|
848
|
+
def available_deep_pk_join_paths_for(
    self,
    fk_table: TableReference,
    ordering: Callable[[TableReference, dict], int] | None = None,
) -> Iterable[JoinPath]:
    """Lists the pk/fk joins of a table along with all transitive follow-up pk/fk joins.

    While `available_pk_fk_joins_for` only considers direct join partners, this method continues the search: if the
    foreign key table is pk/fk joined with a primary key table *t*, all joins of *t* with further tables *t'* are
    included as well, as long as *t* ⋈ *t'* is again a pk/fk join in which *t* is the foreign key and *t'* is the
    primary key. This is repeated for each *t'* until no more pk/fk joins can be found.

    In other words, a breadth-first search over the (directed) pk/fk joins is performed, starting at the foreign key
    table. The traversal is delegated to `util.nx.nx_bfs_tree`, with `_check_pk_fk_join` deciding which edges are
    followed. The `ordering` callable influences the sequence of joins on the same level. It receives the current
    primary key table along with the edge data and produces a numerical position weight (smaller values meaning
    earlier placement). The edge data provides the join predicate under the ``"predicate"`` key, which can be used to
    determine the join partner (i.e. the foreign key table).

    Parameters
    ----------
    fk_table : TableReference
        The foreign key table at which the search should be anchored.
    ordering : Callable[[TableReference, dict], int] | None, optional
        How to sort different primary key join partners on the same level. Lower values mean earlier positioning.
        Defaults to ``None``, in which case an arbitrary ordering is used.

    Returns
    -------
    Iterable[JoinPath]
        All deep primary key/foreign key join paths, starting at the `fk_table`

    Raises
    ------
    ValueError
        If the table is not part of the join graph
    """
    self._assert_contains_table(fk_table)

    bfs_entries = util.nx.nx_bfs_tree(
        self._graph, fk_table, self._check_pk_fk_join, node_order=ordering
    )

    paths = []
    for entry in bfs_entries:
        pk_side: TableReference = entry[0]
        predicate: AbstractPredicate = entry[1]["predicate"]

        # the foreign key table is whatever table the predicate joins with the primary key table
        partner_tables = {
            column.table for column in predicate.join_partners_of(pk_side)
        }
        fk_side = util.simplify(partner_tables)

        paths.append(JoinPath(fk_side, pk_side, predicate))
    return paths
|
|
900
|
+
|
|
901
|
+
def join_partners_from(
    self, table: TableReference, candidate_tables: Iterable[TableReference]
) -> set[TableReference]:
    """Filters a number of candidate tables for those that share a join edge with a specific table.

    The joins do not have to be available, an edge in the join graph is all that is required.

    Parameters
    ----------
    table : TableReference
        The table that should be joined with the candidates
    candidate_tables : Iterable[TableReference]
        Possible join partners for the `table`. Join type and direction do not matter

    Returns
    -------
    set[TableReference]
        The neighbors of `table` that are also contained in the `candidate_tables`
    """
    candidates = set(candidate_tables)
    return {
        neighbor for neighbor in self._graph.adj[table].keys() if neighbor in candidates
    }
|
|
926
|
+
|
|
927
|
+
def join_predicates_between(
    self,
    first_tables: TableReference | Iterable[TableReference],
    second_tables: Optional[TableReference | Iterable[TableReference]] = None,
) -> Collection[AbstractPredicate]:
    """Collects the join predicates that connect sets of tables.

    Two modes are supported: if only `first_tables` is given, the predicates between all pairs of tables from that
    set are collected. If `second_tables` is given as well, each table from the first set is paired with each table
    from the second set, i.e. predicates between tables of the same set are not collected.

    Neither the status of the tables (free or joined), nor the join type matter for this method.

    Parameters
    ----------
    first_tables : TableReference | Iterable[TableReference]
        The first set of candidate tables. A single table can be passed as well, in which case only this table is
        paired with the partner set.
    second_tables : Optional[TableReference | Iterable[TableReference]], optional
        The second set of candidate tables. A single table can be passed as well. If this is ``None`` (the default)
        or otherwise falsy, the tables from `first_tables` are used as the partner set.

    Returns
    -------
    Collection[AbstractPredicate]
        All matching join predicates, without duplicates
    """
    first_tables = util.enlist(first_tables)
    if second_tables:
        second_tables = util.enlist(second_tables)
    else:
        second_tables = first_tables

    found = set()
    for left_table in first_tables:
        for right_table in second_tables:
            predicate = self._fetch_join_predicate(left_table, right_table)
            if not predicate:
                continue
            found.add(predicate)
    return found
|
|
966
|
+
|
|
967
|
+
def mark_joined(
    self, table: TableReference, join_edge: Optional[AbstractPredicate] = None
) -> None:
    """Updates the join graph to include a specific table in the intermediate result.

    This procedure also changes the available index structures according to the kind of join that was executed.
    This is determined based on the current state of the join graph, the index structures, as well as the supplied join
    predicate. If no join predicate is supplied, it is inferred from the query predicates.

    Parameters
    ----------
    table : TableReference
        The table that becomes part of an intermediate result
    join_edge : Optional[AbstractPredicate], optional
        The condition that is used to carry out the join. Defaults to ``None``, in which case the predicate is inferred
        from the predicates that have been supplied by the initial query.
    """

    # TODO: check, if we actually need to handle transient index updates here as well
    # TODO: do we still need the join_edge parameter if we infer it from the predicates anyway?

    self._graph.nodes[table]["free"] = False
    if len(self.joined_tables()) == 1:
        # this is the very first table, there is no join that could affect any index
        return

    join_edge = (
        join_edge
        if join_edge
        else self.query.predicates().joins_between(table, self.joined_tables())
    )
    if not join_edge:
        # We still need this check even though we already know that there are at least two tables joined, since
        # these two tables might have nothing to do with each other (e.g. different components in the join graph)
        return

    partner_tables = {col.table for col in join_edge.join_partners_of(table)}
    for partner_table in partner_tables:
        pk_fk_join = self.is_pk_fk_join(table, partner_table)
        fk_pk_join = self.is_pk_fk_join(partner_table, table)

        if pk_fk_join and fk_pk_join:  # PK/PK join
            continue

        for col1, col2 in join_edge.join_partners():
            joined_col, partner_col = (
                (col1, col2) if col1.table == table else (col2, col1)
            )
            if pk_fk_join:
                self._index_structures[partner_col].invalidate()
            elif fk_pk_join:
                self._index_structures[joined_col].invalidate()
            else:
                self._index_structures[partner_col].invalidate()
                self._index_structures[joined_col].invalidate()

        if pk_fk_join:
            continue

        # The loop variable must not be called `table`: rebinding the parameter would make all subsequent
        # iterations over the partner tables operate on an arbitrary node of the join graph.
        for joined_table, is_free in self._graph.nodes.data("free"):
            if is_free or joined_table == partner_table:
                continue
            self._invalidate_indexes_on(joined_table)
|
|
1029
|
+
|
|
1030
|
+
def clone(self) -> JoinGraph:
    """Creates an independent duplicate of this join graph.

    The duplicate is constructed for the same query and database schema. It receives deep copies of the underlying
    graph and of the index structures.

    Returns
    -------
    JoinGraph
        The copy. It can be safely modified without affecting the original join graph.
    """
    duplicate = JoinGraph(self.query, self._db_schema)
    duplicate._index_structures = copy.deepcopy(self._index_structures)
    duplicate._graph = self.nx_graph()
    return duplicate
|
|
1042
|
+
|
|
1043
|
+
def _assert_contains_table(self, table: TableReference) -> None:
    """Ensures that a specific table is part of the join graph.

    Parameters
    ----------
    table : TableReference
        The table to check

    Raises
    ------
    ValueError
        If the table is not contained in the join graph
    """
    if table in self:
        return
    raise ValueError(f"Join graph does not contain table {table}")
|
|
1058
|
+
|
|
1059
|
+
def _check_pk_fk_join(self, pk_table: TableReference, edge_data: dict) -> bool:
    """Tests whether a table is the primary key side of the join that is described by a join graph edge.

    For each base predicate of the join, the join partner of the `pk_table` is determined and the resulting pair of
    tables is passed to `is_pk_fk_join`.

    Parameters
    ----------
    pk_table : TableReference
        The table to check
    edge_data : dict
        The data of the join edge. The join predicate has to be stored under the ``"predicate"`` key.

    Returns
    -------
    bool
        Whether the `pk_table` acts as the primary key for at least one base predicate of the join edge.
    """
    edge_predicate: AbstractPredicate = edge_data["predicate"]
    for component in edge_predicate.base_predicates():
        partner_tables = {
            column.table for column in component.join_partners_of(pk_table)
        }
        fk_candidate = util.simplify(partner_tables)
        if self.is_pk_fk_join(fk_candidate, pk_table):
            return True
    return False
|
|
1082
|
+
|
|
1083
|
+
def _invalidate_indexes_on(self, table: TableReference) -> None:
    """Invalidates the index of every known column of a specific table.

    Parameters
    ----------
    table : TableReference
        The table whose indexes should be invalidated
    """
    affected_indexes = [
        index
        for column, index in self._index_structures.items()
        if column.table == table
    ]
    for index in affected_indexes:
        index.invalidate()
|
|
1094
|
+
|
|
1095
|
+
def _fetch_join_predicate(
    self, first_table: TableReference, second_table: TableReference
) -> Optional[AbstractPredicate]:
    """Looks up the join predicate that connects two tables.

    Parameters
    ----------
    first_table : TableReference
        The first join partner
    second_table : TableReference
        The second join partner

    Returns
    -------
    Optional[AbstractPredicate]
        The predicate that is stored on the join edge between the tables, or ``None`` if there is no such edge.
        Whether the tables are free or already joined does not matter.
    """
    edges = self._graph.edges
    if (first_table, second_table) in edges:
        return edges[first_table, second_table]["predicate"]
    return None
|
|
1115
|
+
|
|
1116
|
+
def _index_info_for(self, table: TableReference) -> Collection[IndexInfo]:
    """Gathers the index info of all columns that belong to a specific table.

    Parameters
    ----------
    table : TableReference
        The table to retrieve the index info for

    Returns
    -------
    Collection[IndexInfo]
        The index info of each known column of the table. Only columns that are tracked in the index structures of
        the join graph can be contained in the result.
    """
    matching = []
    for info in self._index_structures.values():
        if info._column.belongs_to(table):
            matching.append(info)
    return matching
|
|
1135
|
+
|
|
1136
|
+
def __len__(self) -> int:
    """Provides the size of the underlying graph, i.e. its number of tables."""
    graph_size = len(self._graph)
    return graph_size
|
|
1138
|
+
|
|
1139
|
+
def __iter__(self) -> Iterator[TableReference]:
    """Iterates over the nodes of the underlying graph, i.e. the tables of the join graph."""
    tables = self._graph.nodes
    return iter(tables)
|
|
1141
|
+
|
|
1142
|
+
def __contains__(self, x: object) -> bool:
    """Checks whether the given object is a node of the underlying graph."""
    tables = self._graph.nodes
    return x in tables
|
|
1144
|
+
|
|
1145
|
+
def __getitem__(self, key: TableReference) -> TableInfo:
    """Provides the current status of a specific table.

    Parameters
    ----------
    key : TableReference
        The table to look up

    Returns
    -------
    TableInfo
        Whether the table is still free, along with the index info of its columns

    Raises
    ------
    KeyError
        If the table is not part of the join graph
    """
    if key in self:
        return TableInfo(self.is_free_table(key), self._index_info_for(key))
    raise KeyError(f"Table {key} is not part of the join graph")
|