PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. postbound/__init__.py +211 -0
  2. postbound/_base.py +6 -0
  3. postbound/_bench.py +1012 -0
  4. postbound/_core.py +1153 -0
  5. postbound/_hints.py +1373 -0
  6. postbound/_jointree.py +1079 -0
  7. postbound/_pipelines.py +1121 -0
  8. postbound/_qep.py +1986 -0
  9. postbound/_stages.py +876 -0
  10. postbound/_validation.py +734 -0
  11. postbound/db/__init__.py +72 -0
  12. postbound/db/_db.py +2348 -0
  13. postbound/db/_duckdb.py +785 -0
  14. postbound/db/mysql.py +1195 -0
  15. postbound/db/postgres.py +4216 -0
  16. postbound/experiments/__init__.py +12 -0
  17. postbound/experiments/analysis.py +674 -0
  18. postbound/experiments/benchmarking.py +54 -0
  19. postbound/experiments/ceb.py +877 -0
  20. postbound/experiments/interactive.py +105 -0
  21. postbound/experiments/querygen.py +334 -0
  22. postbound/experiments/workloads.py +980 -0
  23. postbound/optimizer/__init__.py +92 -0
  24. postbound/optimizer/__init__.pyi +73 -0
  25. postbound/optimizer/_cardinalities.py +369 -0
  26. postbound/optimizer/_joingraph.py +1150 -0
  27. postbound/optimizer/dynprog.py +1825 -0
  28. postbound/optimizer/enumeration.py +432 -0
  29. postbound/optimizer/native.py +539 -0
  30. postbound/optimizer/noopt.py +54 -0
  31. postbound/optimizer/presets.py +147 -0
  32. postbound/optimizer/randomized.py +650 -0
  33. postbound/optimizer/tonic.py +1479 -0
  34. postbound/optimizer/ues.py +1607 -0
  35. postbound/qal/__init__.py +343 -0
  36. postbound/qal/_qal.py +9678 -0
  37. postbound/qal/formatter.py +1089 -0
  38. postbound/qal/parser.py +2344 -0
  39. postbound/qal/relalg.py +4257 -0
  40. postbound/qal/transform.py +2184 -0
  41. postbound/shortcuts.py +70 -0
  42. postbound/util/__init__.py +46 -0
  43. postbound/util/_errors.py +33 -0
  44. postbound/util/collections.py +490 -0
  45. postbound/util/dataframe.py +71 -0
  46. postbound/util/dicts.py +330 -0
  47. postbound/util/jsonize.py +68 -0
  48. postbound/util/logging.py +106 -0
  49. postbound/util/misc.py +168 -0
  50. postbound/util/networkx.py +401 -0
  51. postbound/util/numbers.py +438 -0
  52. postbound/util/proc.py +107 -0
  53. postbound/util/stats.py +37 -0
  54. postbound/util/system.py +48 -0
  55. postbound/util/typing.py +35 -0
  56. postbound/vis/__init__.py +5 -0
  57. postbound/vis/fdl.py +69 -0
  58. postbound/vis/graphs.py +48 -0
  59. postbound/vis/optimizer.py +538 -0
  60. postbound/vis/plots.py +84 -0
  61. postbound/vis/tonic.py +70 -0
  62. postbound/vis/trees.py +105 -0
  63. postbound-0.19.0.dist-info/METADATA +355 -0
  64. postbound-0.19.0.dist-info/RECORD +67 -0
  65. postbound-0.19.0.dist-info/WHEEL +5 -0
  66. postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
  67. postbound-0.19.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1150 @@
1
+ """Provides an implementation of a dynamic join graph, as well as some related objects."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import collections
6
+ import copy
7
+ from collections.abc import Callable, Collection, Iterable, Iterator, Mapping
8
+ from dataclasses import dataclass
9
+ from typing import Literal, Optional
10
+
11
+ import networkx as nx
12
+
13
+ from .. import util
14
+ from .._core import ColumnReference, DBCatalog, TableReference
15
+ from ..qal import transform
16
+ from ..qal._qal import (
17
+ AbstractPredicate,
18
+ CompoundPredicate,
19
+ ImplicitSqlQuery,
20
+ determine_join_equivalence_classes,
21
+ generate_predicates_for_equivalence_classes,
22
+ )
23
+
24
+
25
@dataclass(frozen=True)
class JoinPath:
    """Describes a single join between two tables.

    In the typical case the `start_table` already belongs to an intermediate result, whereas the `target_table` is not yet
    part of any intermediate result and should be joined next. This is a convention only, the class does not enforce it.

    Attributes
    ----------
    start_table : TableReference
        The first join partner. By convention, this is the table that is already part of the intermediate result of the
        query
    target_table : TableReference
        The second join partner. By convention, this is the table that is not yet part of any intermediate result, i.e. the
        table that should be joined next
    join_condition : Optional[AbstractPredicate], optional
        The predicate that is used to join the `target_table` with the current intermediate result. Usually the predicate is
        restricted to the join between `start_table` and `target_table`, but it may also include additional join predicates
        over other tables of the intermediate result. Defaults to ``None``.
    """

    start_table: TableReference
    target_table: TableReference
    join_condition: Optional[AbstractPredicate] = None

    def tables(self) -> tuple[TableReference, TableReference]:
        """Provides the two join partners of this path.

        Returns
        -------
        tuple[TableReference, TableReference]
            The `start_table` followed by the `target_table`

        Warnings
        --------
        In contrast to the ``tables`` methods of the query abstraction layer, only `start_table` and `target_table` are
        reported. Tables that are merely referenced in the `join_condition` are ignored.
        """
        joined_pair = (self.start_table, self.target_table)
        return joined_pair

    def spans_table(self, table: TableReference) -> bool:
        """Checks, whether a table acts as the start table or as the target table of this path.

        Parameters
        ----------
        table : TableReference
            The table to check

        Returns
        -------
        bool
            Whether the table is one of the two join partners. Tables that only appear in the `join_condition` are not
            considered.
        """
        if table == self.start_table:
            return True
        return table == self.target_table

    def flip_direction(self) -> JoinPath:
        """Creates a copy of this path in which start table and target table switched roles.

        Returns
        -------
        JoinPath
            The reversed join path. The join condition is carried over unchanged.
        """
        return JoinPath(
            start_table=self.target_table,
            target_table=self.start_table,
            join_condition=self.join_condition,
        )

    def __repr__(self) -> str:
        # the debug representation is intentionally the same as the user-facing one
        return self.__str__()

    def __str__(self) -> str:
        return f"{self.start_table} ⋈ {self.target_table}"
99
+
100
+
101
class IndexInfo:
    """Describes which kind of index exists on a single column and whether that index can still be used.

    The lifecycle of an index is managed via the `invalidate` method. Invalidating an index indicates that it can no longer
    be used for a specific join, for example because its column has become part of an intermediate result already. In
    contrast to many other types in PostBOUND, index information is mutable and is changed in-place.

    Only single-column indexes are modelled, multi-column indexes are not supported. Furthermore, the specific type (i.e.
    data structure) of the index is not captured. If this information is important, it has to be maintained by the user.

    Parameters
    ----------
    column : ColumnReference
        The column for which the index is created
    index_type : Literal["primary", "secondary", "none"]
        The kind of index that is maintained. ``"none"`` indicates that there is no index on the column. This is a different
        concept from an index that exists, but cannot be used. The latter case is indicated via the `invalid` parameter
    invalid : bool, optional
        Whether the index has become unusable for query execution. Defaults to ``False``, i.e. the index is usable.
        Typically, an index is valid as long as its relation has not been included in any intermediate result and invalid
        afterwards.
    """

    @staticmethod
    def primary_index(column: ColumnReference) -> IndexInfo:
        """Creates index information for a primary key index.

        Parameters
        ----------
        column : ColumnReference
            The indexed column

        Returns
        -------
        IndexInfo
            The index information. The index starts out as a valid index.
        """
        return IndexInfo(column, index_type="primary")

    @staticmethod
    def secondary_index(column: ColumnReference) -> IndexInfo:
        """Creates index information for a secondary index.

        Foreign key indexes are often defined this way.

        Parameters
        ----------
        column : ColumnReference
            The indexed column

        Returns
        -------
        IndexInfo
            The index information. The index starts out as a valid index.
        """
        return IndexInfo(column, index_type="secondary")

    @staticmethod
    def no_index(column: ColumnReference) -> IndexInfo:
        """Creates index information that indicates the absence of an index.

        Parameters
        ----------
        column : ColumnReference
            A column that does not have any index

        Returns
        -------
        IndexInfo
            The index information
        """
        return IndexInfo(column, index_type="none")

    @staticmethod
    def generate_for(column: ColumnReference, db_schema: DBCatalog) -> IndexInfo:
        """Determines the available index for a specific column based on the database schema.

        The schema is first asked whether the column is a primary key. Only if it is not, the schema is asked for a
        secondary index.

        Parameters
        ----------
        column : ColumnReference
            The column. It has to be connected to a valid, non-virtual table reference
        db_schema : DBCatalog
            The schema of the database to which the column belongs.

        Returns
        -------
        IndexInfo
            Valid primary index information if the column is a primary key, valid secondary index information if the column
            has a secondary index, and "no index" information otherwise.

        Notes
        -----
        Errors raised by the schema lookups are propagated. Presumably this includes an unbound-column error if the column
        is not associated with any table (this depends on the `DBCatalog` implementation, which is not defined here).
        """
        if db_schema.is_primary_key(column):
            return IndexInfo.primary_index(column)
        if db_schema.has_secondary_index(column):
            return IndexInfo.secondary_index(column)
        return IndexInfo.no_index(column)

    def __init__(
        self,
        column: ColumnReference,
        index_type: Literal["primary", "secondary", "none"],
        invalid: bool = False,
    ) -> None:
        self._column = column
        self._index_type = index_type
        self._is_invalid = invalid

    @property
    def column(self) -> ColumnReference:
        """Get the column to which the index information belongs.

        Returns
        -------
        ColumnReference
            The column
        """
        return self._column

    @property
    def index_type(self) -> Literal["primary", "secondary", "none"]:
        """Get the kind of index that is in principle available on the column.

        The index type does not tell whether the index is actually usable for a specific join. It merely states whether
        an index has been defined.

        Returns
        -------
        str
            The index type. Can be *primary*, *secondary* or *none*.
        """
        return self._index_type

    @property
    def is_invalid(self) -> bool:
        """Get whether the index has been marked as unusable.

        To determine whether an index can be used right now, this property has to be combined with the `index_type` value:
        if there never was an index on the column, the invalid flag carries no meaning. The utility methods `is_primary`,
        `is_secondary` and `is_indexed` combine both pieces of information.

        Returns
        -------
        bool
            ``True`` if the index can no longer be used, ``False`` if it is still usable (provided that it exists).
        """
        return self._is_invalid

    def is_primary(self) -> bool:
        """Checks, whether this is a valid primary index.

        Returns
        -------
        bool
            Whether this is a primary key index that has not been invalidated.
        """
        return self._index_type == "primary" and not self._is_invalid

    def is_secondary(self) -> bool:
        """Checks, whether this is a valid secondary index.

        Returns
        -------
        bool
            Whether this is a secondary index that has not been invalidated.
        """
        return self._index_type == "secondary" and not self._is_invalid

    def is_indexed(self) -> bool:
        """Checks, whether there is any valid index defined for the column.

        This check does not differentiate between primary key indexes and secondary indexes.

        Returns
        -------
        bool
            Whether this is a primary key index or a secondary index that has not been invalidated.
        """
        if self.is_primary():
            return True
        return self.is_secondary()

    def can_pk_fk_join(self, other: IndexInfo) -> bool:
        """Checks, whether two columns can be joined via a primary key/foreign key join.

        The direction of the join is not restricted, i.e. each column could act as the primary key or as the foreign key.
        Likewise, no datatype checks are performed and it is assumed that a database system would be able to actually join
        the two columns involved.

        If the index on any of the columns does not exist or is no longer valid, this check fails.

        Parameters
        ----------
        other : IndexInfo
            The index information of the other column that should participate in the join

        Returns
        -------
        bool
            Whether a primary key/foreign key join could be executed between the columns.
        """
        both_indexed = self.is_indexed() and other.is_indexed()
        if not both_indexed:
            return False

        # with both columns indexed, only the secondary/secondary combination lacks a primary key index
        only_secondary = self.is_secondary() and other.is_secondary()
        return not only_secondary

    def invalidate(self) -> None:
        """Marks the index as invalid.

        Once a table is included in an intermediate join result, the index structures of its columns most likely become
        invalid, and it is no longer possible to use the index to query for specific tuples (because the occurrences of
        the individual tuples are multiplied when executing the join). This method can be used to model the lifecycle of
        index structures within the course of the execution of a single query.
        """
        self._is_invalid = True

    def __repr__(self) -> str:
        # the debug representation is intentionally the same as the user-facing one
        return self.__str__()

    def __str__(self) -> str:
        if self._index_type == "none":
            # absent indexes never show the invalid marker
            return f"NO INDEX({self._column})"

        invalid_state = " INVALID" if self._is_invalid else ""
        if self._index_type == "primary":
            return f"PRIMARY INDEX({self._column}){invalid_state}"
        return f"SECONDARY INDEX({self._column}){invalid_state}"
334
+
335
+
336
@dataclass(frozen=True)
class TableInfo:
    """Describes the current state of a single table in the join graph.

    Attributes
    ----------
    free : bool
        ``True`` as long as the table is *free*, i.e. it has not become part of any intermediate join result.
    index_info : Collection[IndexInfo]
        The index information of the columns that belong to the table. Columns that are missing from this collection
        either do not have any index, or they are not relevant for the current join graph (e.g. because they do not
        appear in any join predicate).
    """

    free: bool
    index_info: Collection[IndexInfo]
352
+
353
+
354
# Internal alias: a defaultdict that maps an (unordered, hence frozenset) group of tables to the list of predicates that
# were collected for exactly these tables.
_PredicateMap = collections.defaultdict[
    frozenset[TableReference], list[AbstractPredicate]
]
"""Type alias for an (internally used) predicate map"""
358
+
359
+
360
+ class JoinGraph(Mapping[TableReference, TableInfo]):
361
+ """The join graph models the connection between different tables in a query.
362
+
363
+ All tables that are referenced in the query are represented as the nodes in the graph. If two tables are joined via a join
364
+ predicate in the SQL query, they will be linked with an edge in the join graph. This graph is further annotated by the join
365
+ predicate. Additionally, the join graph stores index information for each of the relevant columns in the query.
366
+
367
+ In contrast to many other types in PostBOUND, a join graph is a mutable structure. It also models the current state of the
368
+ optimizer once specific tables have been included in an intermediate join result.
369
+
370
+ The wording of the different join graph methods distinguishes three states of joins (and correspondingly tables):
371
+
372
+ - a join might be *free*, if at least one of the corresponding tables have not been marked as joined, yet
373
+ - a join might be *available*, if it is *free* and one of the tables is already included in some intermediate join
374
+ - a join is *consumed*, if it is no longer *free*. This occurs once the partner tables have both been marked as joined.
375
+
376
+ A further distinction is made between n:m joins and primary key/foreign key joins. Information about the available joins of
377
+ each type can be queried easily and many methods are available in two variants: one that includes all possible joins and
378
+ one that is only focused on primary key/foreign key joins. To determine the precise join types, the join graph needs to
379
+ access the database schema. An n:m join is one where the column values of both join partners can appear an arbitrary number
380
+ of times, corresponding to an n:m relation between the two tables.
381
+
382
+ By calling the `mark_joined` method, the state of individual joins and their corresponding tables might change. This also
383
+ means that former primary key/foreign key joins might become n:m joins (which is the case exactly when the primary key
384
+ table is inserted into an intermediate join result).
385
+
386
+ Parameters
387
+ ----------
388
+ query : ImplicitSqlQuery
389
+ The query for which the join graph should be generated
390
+ db_schema : Optional[DBCatalog], optional
391
+ The schema of the database on which the query should be executed. If this is ``None``, the database schema is inferred
392
+ based on the `DatabasePool`.
393
+ include_predicate_equivalence_classes : bool, optional
394
+ Whether predicates from the same equivalence class should be added to the join graph as well, even if they do not exist
395
+ in the original query. Consider two join conditions *a = b* and *b = c*. From these conditions, it follows that *a = c*
396
+ by the transitive property. If `include_predicate_equivalence_classes` is true, that last predicate will also be added
397
+ to the join graph.
398
+
399
+ Warnings
400
+ --------
401
+ If predicate equivalence classes are used, the optimization algorithm can potentially generate queries that contain
402
+ additional predicates that were not present in the original query.
403
+ """
404
+
405
def __init__(
    self,
    query: ImplicitSqlQuery,
    db_schema: Optional[DBCatalog] = None,
    *,
    include_predicate_equivalence_classes: bool = False,
) -> None:
    """Builds the join graph for a query.

    Each table of the query becomes a node that is initially marked as free. All join predicates that reference exactly
    two columns are grouped by the tables of these columns. Each group becomes one edge, annotated with the conjunction of
    its predicates. Index information is generated for every column that appears in an edge predicate.

    Parameters
    ----------
    query : ImplicitSqlQuery
        The query for which the join graph should be generated
    db_schema : Optional[DBCatalog], optional
        The schema used to look up index information. If omitted, the schema of the current database of the
        `DatabasePool` is used.
    include_predicate_equivalence_classes : bool, optional
        Whether predicates that are derived from the join equivalence classes should be added to the graph in addition to
        the predicates of the query. Defaults to ``False``.
    """
    if db_schema is None:
        from .. import db  # local import to avoid circular dependencies

        db_schema = db.DatabasePool.get_instance().current_database().schema()

    self.query = query
    self._db_schema = db_schema
    self._index_structures: dict[ColumnReference, IndexInfo] = {}

    join_graph = nx.Graph()
    join_graph.add_nodes_from(query.tables(), free=True)

    candidate_joins = query.predicates().joins()
    if include_predicate_equivalence_classes:
        derived_joins = generate_predicates_for_equivalence_classes(
            determine_join_equivalence_classes(candidate_joins)
        )
        candidate_joins = set(candidate_joins) | derived_joins

    # group the binary join predicates by the (unordered) pair of tables they connect
    joins_by_tables: _PredicateMap = collections.defaultdict(list)
    for candidate in candidate_joins:
        candidate_columns = candidate.columns()
        if len(candidate_columns) != 2:
            # predicates over more or fewer than two columns are not represented in the graph
            continue
        left_col, right_col = candidate_columns
        table_pair = frozenset([left_col.table, right_col.table])
        joins_by_tables[table_pair].append(candidate)

    for table_pair, table_joins in joins_by_tables.items():
        left_table, right_table = table_pair
        edge_predicate = CompoundPredicate.create_and(table_joins)
        join_graph.add_edge(left_table, right_table, predicate=edge_predicate)
        for column in edge_predicate.columns():
            self._index_structures[column] = IndexInfo.generate_for(column, db_schema)

    self._graph = join_graph
453
+
454
def initial(self) -> bool:
    """Checks, whether the join graph is still unmodified.

    Returns
    -------
    bool
        ``True`` if every table of the graph is still free, i.e. no table has been marked as joined, yet. ``False`` as soon
        as at least one table is no longer free.
    """
    for __, still_free in self._graph.nodes.data("free"):
        if not still_free:
            return False
    return True
463
+
464
def contains_cross_products(self) -> bool:
    """Checks, whether there are any cross products in the input query.

    A cross product is a join between tables without a restricting join predicate. Note that this is only the case
    if those tables are also not linked via a sequence of join predicates with other tables. In terms of the graph, the
    query contains a cross product if the graph is not connected.

    Returns
    -------
    bool
        Whether the join graph contains at least one cross product. A join graph without any tables never contains a
        cross product.
    """
    if len(self._graph) == 0:
        # networkx refuses to decide connectivity for a graph without nodes (NetworkXPointlessConcept). Without tables
        # there is nothing that could form a cross product.
        return False
    return not nx.is_connected(self._graph)
476
+
477
def contains_free_tables(self) -> bool:
    """Checks, whether at least one free table remains in the graph.

    Returns
    -------
    bool
        ``True`` as soon as one table of the join graph is still free, ``False`` if all tables have been joined (or the
        graph does not contain any tables).
    """
    for __, still_free in self._graph.nodes.data("free"):
        if still_free:
            return True
    return False
486
+
487
def contains_free_n_m_joins(self) -> bool:
    """Checks, whether at least one n:m join can still be executed in the graph.

    All edges are considered while the graph is in its initial state. Afterwards, only edges that form an available join
    are considered. An edge counts as an n:m join as soon as one of the join partners of its predicate cannot be joined via
    a primary key/foreign key join according to the current index information.

    Returns
    -------
    bool
        Whether there are still n:m joins with at least one free table.
    """
    graph_is_initial = self.initial()
    index_lookup = self._index_structures

    for left_table, right_table, edge_predicate in self._graph.edges.data("predicate"):
        join_is_candidate = self.is_available_join(left_table, right_table) or graph_is_initial
        if not join_is_candidate:
            continue

        found_n_m_partner = any(
            not index_lookup[left_col].can_pk_fk_join(index_lookup[right_col])
            for left_col, right_col in edge_predicate.join_partners()
        )
        if found_n_m_partner:
            return True

    return False
505
+
506
def count_consumed_tables(self) -> int:
    """Counts the tables that are no longer free.

    The count is 0 while the join graph is still in its initial state and 1 if only the initial table has been selected
    so far.

    Returns
    -------
    int
        The number of joined tables
    """
    return sum(1 for __, still_free in self._graph.nodes.data("free") if not still_free)
520
+
521
def join_components(self) -> Iterable[JoinGraph]:
    """Provides all connected components of the join graph.

    A component is a subgraph of the original join graph, such that the subgraph is connected but there is no edge to
    nodes of any other component. The components correspond to the parts of the query that have to be combined via a
    cross product.

    Each component is constructed as a new join graph from the matching fragment of the query, using the same database
    schema. The new graphs are created with default settings, hence all of their tables start out as free.

    Returns
    -------
    Iterable[JoinGraph]
        The components of the join graph, each as its own full join graph object
    """
    return [
        JoinGraph(
            transform.extract_query_fragment(self.query, component_tables),
            self._db_schema,
        )
        for component_tables in nx.connected_components(self._graph)
    ]
538
+
539
def joined_tables(self) -> frozenset[TableReference]:
    """Provides all tables of the join graph that are no longer free.

    Returns
    -------
    frozenset[TableReference]
        The tables that have already been joined / consumed.
    """
    consumed = set()
    for candidate in self:
        if self.is_free_table(candidate):
            continue
        consumed.add(candidate)
    return frozenset(consumed)
548
+
549
def free_tables(self) -> frozenset[TableReference]:
    """Provides all tables of the join graph that have not been joined, yet.

    Returns
    -------
    frozenset[TableReference]
        The tables that are not consumed
    """
    still_free = set()
    for candidate in self:
        if self.is_free_table(candidate):
            still_free.add(candidate)
    return frozenset(still_free)
558
+
559
def all_joins(self) -> Iterable[tuple[TableReference, TableReference]]:
    """Provides all edges of the join graph, regardless of whether the joins are still available.

    Returns
    -------
    Iterable[tuple[TableReference, TableReference]]
        The possible joins in the graph as a new list. Which table ends up as the first or second component of a tuple
        is arbitrary
    """
    return [*self._graph.edges]
568
+
569
def available_join_paths(
    self, *, both_directions_on_initial: bool = False
) -> Iterable[JoinPath]:
    """Provides all joins that can be executed in the current join graph.

    The result depends on the state of the join graph: as long as the graph is in its initial state (i.e. none of the
    tables is joined yet), every edge is reported. Afterwards, only those edges are reported where exactly one table is
    already joined while its partner is still free. In that case the joined table becomes the start table of the join path
    and the free table becomes the target table.

    Parameters
    ----------
    both_directions_on_initial : bool, optional
        Whether to include the join path *R* -> *S* as well as *S* -> *R* for initial join graphs, assuming there is a join
        between *R* and *S* in the graph. Has no effect once the graph is no longer in its initial state.

    Returns
    -------
    Iterable[JoinPath]
        All possible joins in the current graph.
    """
    annotated_edges = self._graph.edges.data("predicate")
    available_paths = []

    if self.initial():
        for first_table, second_table, edge_predicate in annotated_edges:
            forward_path = JoinPath(first_table, second_table, edge_predicate)
            available_paths.append(forward_path)
            if both_directions_on_initial:
                available_paths.append(forward_path.flip_direction())
        return available_paths

    for first_table, second_table, edge_predicate in annotated_edges:
        first_is_free = self.is_free_table(first_table)
        second_is_free = self.is_free_table(second_table)
        both_free = first_is_free and second_is_free
        both_joined = not first_is_free and not second_is_free
        if both_free or both_joined:
            # neither two free tables nor two joined tables form an available path
            continue

        if first_is_free:
            # the joined table has to be the start of the path
            first_table, second_table = second_table, first_table
        available_paths.append(JoinPath(first_table, second_table, edge_predicate))

    return available_paths
617
+
618
def available_n_m_join_paths(
    self, *, both_directions_on_initial: bool = False
) -> Iterable[JoinPath]:
    """Provides exactly those join paths from `available_join_paths` that correspond to n:m joins.

    A path is considered an n:m join if it is not a primary key/foreign key join in either direction. The handling of
    initial and "dirty" join graphs is inherited from `available_join_paths`.

    Parameters
    ----------
    both_directions_on_initial : bool, optional
        Whether to include the join path *R* -> *S* as well as *S* -> *R* for initial join graphs if *R* ⨝ *S* is an n:m
        join. The reversed path directly follows the original path in the result.

    Returns
    -------
    Iterable[JoinPath]
        The available n:m joins
    """
    collected_paths = []
    for candidate in self.available_join_paths():
        start, target = candidate.start_table, candidate.target_table
        if self.is_pk_fk_join(start, target) or self.is_pk_fk_join(target, start):
            # primary key/foreign key joins (in any direction) are not n:m joins
            continue

        collected_paths.append(candidate)
        if both_directions_on_initial and self.initial():
            collected_paths.append(candidate.flip_direction())
    return collected_paths
647
+
648
def available_join_paths_for(self, table: TableReference) -> Iterable[JoinPath]:
    """Provides all joins that can currently be executed for a specific table.

    What constitutes a possible join depends on the state of the join graph: for an initial join graph, the only
    requirement is a join predicate between the tables. In all other cases, exactly one of the tables has to be free
    and the other table has to be consumed.

    Parameters
    ----------
    table : TableReference
        The table that should be joined. The table is checked via ``_assert_contains_table`` before any join path is
        generated.

    Returns
    -------
    Iterable[JoinPath]
        All possible join paths for the table. This includes n:m joins as well as primary key/foreign key joins. In
        each join path the specified table will be the start table and the join partner will be the target table.
    """
    self._assert_contains_table(table)

    table_paths = []
    for neighbor, edge_data in self._graph.adj[table].items():
        if not self.is_available_join(table, neighbor):
            continue
        table_paths.append(JoinPath(table, neighbor, edge_data["predicate"]))
    return table_paths
672
+
673
def nx_graph(self) -> nx.Graph:
    """Provides a copy of the graph object that backs this join graph.

    Returns
    -------
    nx.Graph
        A deep copy of the raw join graph. Modifications of the copy do not affect the join graph.
    """
    graph_copy = copy.deepcopy(self._graph)
    return graph_copy
682
+
683
def is_free_table(self, table: TableReference) -> bool:
    """Checks, whether a specific table is still free in this join graph.

    If the table is not part of the join graph, the node lookup fails with an error.

    Parameters
    ----------
    table : TableReference
        The table to check

    Returns
    -------
    bool
        Whether the given table is still free
    """
    node_attributes = self._graph.nodes[table]
    return node_attributes["free"]
699
+
700
def joins_tables(
    self, first_table: TableReference, second_table: TableReference
) -> bool:
    """Checks, whether the join graph contains an edge between two specific tables.

    The join in question does not need to be available for this check to pass (this is what `is_available_join` is for).

    Parameters
    ----------
    first_table : TableReference
        The first join partner
    second_table : TableReference
        The second join partner

    Returns
    -------
    bool
        Whether there is any join predicate between the given tables. Neither the direction nor the availability of the
        join matter for this check
    """
    requested_edge = (first_table, second_table)
    return requested_edge in self._graph.edges
721
+
722
def is_available_join(
    self, first_table: TableReference, second_table: TableReference
) -> bool:
    """Checks, whether the join between two tables is still available.

    For initial join graphs, this check passes as long as there is an edge between the two given tables. In all other
    cases, exactly one of the join partners has to be free, whereas the other partner has to be consumed.

    Parameters
    ----------
    first_table : TableReference
        The first join partner
    second_table : TableReference
        The second join partner

    Returns
    -------
    bool
        Whether there is a join between the given tables and whether this join is still available. The join direction
        and join type do not matter.
    """
    node_data = self._graph.nodes
    first_is_free = node_data[first_table]["free"]
    second_is_free = node_data[second_table]["free"]

    tables_are_joined = self.joins_tables(first_table, second_table)

    # exactly one free partner makes the join available, an initial graph makes every join available
    exactly_one_free = bool(first_is_free) != bool(second_is_free)
    join_is_open = exactly_one_free or self.initial()
    return tables_are_joined and join_is_open
754
+
755
def is_pk_fk_join(self, fk_table: TableReference, pk_table: TableReference) -> bool:
    """Tests whether two tables are connected by a primary key/foreign key join in the given direction.

    The join does not need to be available for this check.

    Parameters
    ----------
    fk_table : TableReference
        The table that is supposed to contribute the foreign key
    pk_table : TableReference
        The table that is supposed to contribute the primary key

    Returns
    -------
    bool
        ``True`` if at least one base predicate of the join edge connects an indexed column of `fk_table` with a
        primary key column of `pk_table`. ``False`` if there is no such predicate, or no edge between the tables at all.

    Warnings
    --------
    In the current implementation, this check only works for (conjunctions of) binary join predicates. An error is raised
    for joins between multiple columns
    """
    if not self.joins_tables(fk_table, pk_table):
        return False

    edge_data = self._graph.edges[fk_table, pk_table]
    predicate: AbstractPredicate = edge_data["predicate"]
    for base_predicate in predicate.base_predicates():
        # Both columns are resolved up front, before any of the index checks takes place.
        fk_col = util.simplify(base_predicate.columns_of(fk_table))
        pk_col = util.simplify(base_predicate.columns_of(pk_table))
        if not self._index_structures[fk_col].is_indexed():
            continue
        if self._index_structures[pk_col].is_primary():
            return True
    return False
793
+
794
def is_n_m_join(
    self, first_table: TableReference, second_table: TableReference
) -> bool:
    """Tests whether two tables are connected by an n:m join.

    A join counts as n:m if the join graph contains an edge between the tables, but `is_pk_fk_join` fails for both
    directions. The join does not need to be available for this check.

    Parameters
    ----------
    first_table : TableReference
        One of the join partners
    second_table : TableReference
        The other join partner

    Returns
    -------
    bool
        Whether the tables are joined and neither of them acts as the primary key partner of the other

    Warnings
    --------
    In the current implementation, this check only works for (conjunctions of) binary join predicates. An error is raised
    for joins between multiple columns
    """
    if not self.joins_tables(first_table, second_table):
        return False

    directed_pk_fk = self.is_pk_fk_join(
        first_table, second_table
    ) or self.is_pk_fk_join(second_table, first_table)
    return not directed_pk_fk
823
+
824
def available_pk_fk_joins_for(self, fk_table: TableReference) -> Iterable[JoinPath]:
    """Collects the primary key/foreign key joins in which a specific table acts as the foreign key partner.

    A join is included if `is_pk_fk_join` succeeds for the neighboring table and at least one of the two join partners
    is still free. This is a weaker condition than the one checked by `is_available_join`.

    Parameters
    ----------
    fk_table : TableReference
        The foreign key table. It is used as the start table of all produced join paths.

    Returns
    -------
    Iterable[JoinPath]
        One join path per matching neighbor. The path starts at the foreign key table and targets the primary key table.

    Raises
    ------
    ValueError
        If `fk_table` is not part of the join graph
    """
    self._assert_contains_table(fk_table)

    matching_paths = []
    for pk_table, join_edge in self._graph.adj[fk_table].items():
        if not self.is_pk_fk_join(fk_table, pk_table):
            continue
        if self.is_free_table(fk_table) or self.is_free_table(pk_table):
            matching_paths.append(
                JoinPath(fk_table, pk_table, join_edge["predicate"])
            )
    return matching_paths
847
+
848
def available_deep_pk_join_paths_for(
    self,
    fk_table: TableReference,
    ordering: Callable[[TableReference, dict], int] | None = None,
) -> Iterable[JoinPath]:
    """Collects the pk/fk joins of a table along with all transitive follow-up pk/fk joins.

    Whereas `available_pk_fk_joins_for` only considers direct neighbors of the foreign key table, this method continues
    the search: if the foreign key table is pk/fk joined with a primary key table *t*, all joins of *t* with further
    tables *t'* are included as well, provided that *t* ⋈ *t'* is again a primary key/foreign key join in which *t*
    acts as the foreign key and *t'* as the primary key. The same expansion is applied to each *t'* until no further
    primary key/foreign key joins are found.

    The traversal is delegated to ``util.nx.nx_bfs_tree``, anchored at the foreign key table and restricted to edges
    that pass `_check_pk_fk_join`. The `ordering` callable is handed to the traversal as its node order. It receives
    the current primary key table and the edge data (with the join predicate stored under the ``"predicate"`` key) and
    produces a numerical position weight, with smaller values meaning earlier placement.

    Parameters
    ----------
    fk_table : TableReference
        The foreign key table at which the search is anchored.
    ordering : Callable[[TableReference, dict], int] | None, optional
        How to sort different primary key join partners on the same level. Lower values mean earlier positioning.
        Defaults to ``None``, which is passed through to the traversal unchanged.

    Returns
    -------
    Iterable[JoinPath]
        One join path per traversal step. Each path leads from the foreign key partner to the primary key table.

    Raises
    ------
    ValueError
        If `fk_table` is not part of the join graph
    """
    self._assert_contains_table(fk_table)

    def _to_join_path(step) -> JoinPath:
        # Each traversal step is a pair of the reached pk table and the data of the edge that led there.
        pk_table: TableReference = step[0]
        predicate: AbstractPredicate = step[1]["predicate"]
        partner_tables = {
            column.table for column in predicate.join_partners_of(pk_table)
        }
        return JoinPath(util.simplify(partner_tables), pk_table, predicate)

    traversal = util.nx.nx_bfs_tree(
        self._graph, fk_table, self._check_pk_fk_join, node_order=ordering
    )
    return [_to_join_path(step) for step in traversal]
900
+
901
def join_partners_from(
    self, table: TableReference, candidate_tables: Iterable[TableReference]
) -> set[TableReference]:
    """Filters a collection of candidate tables down to those that share a join edge with a specific table.

    Only the existence of the join edge matters. The joins do not need to be available.

    Parameters
    ----------
    table : TableReference
        The table whose neighbors should be matched against the candidates
    candidate_tables : Iterable[TableReference]
        The potential join partners of the `table`

    Returns
    -------
    set[TableReference]
        All candidates that are adjacent to the `table` in the join graph
    """
    neighbors = self._graph.adj[table]
    return {candidate for candidate in set(candidate_tables) if candidate in neighbors}
926
+
927
def join_predicates_between(
    self,
    first_tables: TableReference | Iterable[TableReference],
    second_tables: Optional[TableReference | Iterable[TableReference]] = None,
) -> Collection[AbstractPredicate]:
    """Collects the join predicates that connect tables from one or two sets of tables.

    If just one set of tables is supplied, the predicates between all pairs of tables from that set are gathered. If
    two sets are supplied, only pairs consisting of one table from each set are considered.

    Neither the status of the tables, nor the join type influence the result.

    Parameters
    ----------
    first_tables : TableReference | Iterable[TableReference]
        The first set of candidate tables. A single table is accepted as well.
    second_tables : Optional[TableReference | Iterable[TableReference]], optional
        The second set of candidate tables. A single table is accepted as well. If this is omitted (or empty), the
        tables from `first_tables` are paired with each other.

    Returns
    -------
    Collection[AbstractPredicate]
        The predicates of all join edges between the considered pairs of tables
    """
    first_tables = util.enlist(first_tables)
    if second_tables:
        second_tables = util.enlist(second_tables)
    else:
        second_tables = first_tables

    edge_predicates = (
        self._fetch_join_predicate(first_table, second_table)
        for first_table in first_tables
        for second_table in second_tables
    )
    # Pairs without a join edge produce no predicate and are dropped.
    return {predicate for predicate in edge_predicates if predicate}
966
+
967
def mark_joined(
    self, table: TableReference, join_edge: Optional[AbstractPredicate] = None
) -> None:
    """Updates the join graph to include a specific table in the intermediate result.

    The table is marked as no longer free. Afterwards, index structures are invalidated according to the kind of join
    that was executed. The kind of join is determined for each join partner of the table via `is_pk_fk_join`:

    - if both directions qualify (PK/PK join), nothing is invalidated for that partner
    - if the table is the foreign key partner, the indexes of the partner-side join columns are invalidated
    - if the table is the primary key partner, the indexes of the table-side join columns are invalidated
    - otherwise, the indexes on both sides are invalidated

    Unless the table is the foreign key partner, all indexes of all other consumed tables (except for the current
    partner) are invalidated as well.

    Parameters
    ----------
    table : TableReference
        The table that becomes part of an intermediate result
    join_edge : Optional[AbstractPredicate], optional
        The condition that is used to carry out the join. Defaults to ``None``, in which case the predicate is inferred
        from the predicates that have been supplied by the initial query.
    """

    # TODO: check, if we actually need to handle transient index updates here as well
    # TODO: do we still need the join_edge parameter if we infer it from the predicates anyway?

    self._graph.nodes[table]["free"] = False
    if len(self.joined_tables()) == 1:
        # The very first table of an intermediate result has no join partner that could influence any index.
        return

    join_edge = (
        join_edge
        if join_edge
        else self.query.predicates().joins_between(table, self.joined_tables())
    )
    if not join_edge:
        # We still need this check even though we already know that there are at least two tables joined, since
        # these two tables might have nothing to do with each other (e.g. different components in the join graph)
        return

    partner_tables = {col.table for col in join_edge.join_partners_of(table)}
    for partner_table in partner_tables:
        pk_fk_join = self.is_pk_fk_join(table, partner_table)
        fk_pk_join = self.is_pk_fk_join(partner_table, table)

        if pk_fk_join and fk_pk_join:  # PK/PK join
            continue

        for col1, col2 in join_edge.join_partners():
            joined_col, partner_col = (
                (col1, col2) if col1.table == table else (col2, col1)
            )
            if pk_fk_join:
                self._index_structures[partner_col].invalidate()
            elif fk_pk_join:
                self._index_structures[joined_col].invalidate()
            else:
                self._index_structures[partner_col].invalidate()
                self._index_structures[joined_col].invalidate()

        if pk_fk_join:
            continue

        # The loop variable must not be called `table`: rebinding the parameter would make all remaining partner
        # iterations operate on an arbitrary graph node instead of the table that is being joined.
        for consumed_table, is_free in self._graph.nodes.data("free"):
            if is_free or consumed_table == partner_table:
                continue
            self._invalidate_indexes_on(consumed_table)
1029
+
1030
def clone(self) -> JoinGraph:
    """Creates an independent copy of this join graph.

    The copy is created for the same query and schema. It receives the result of `nx_graph` as its graph and a deep
    copy of the index structures.

    Returns
    -------
    JoinGraph
        The copy. It can be safely modified without affecting the original join graph.
    """
    duplicate = JoinGraph(self.query, self._db_schema)
    duplicate._index_structures = copy.deepcopy(self._index_structures)
    duplicate._graph = self.nx_graph()
    return duplicate
1042
+
1043
def _assert_contains_table(self, table: TableReference) -> None:
    """Ensures that a specific table is part of the join graph.

    Parameters
    ----------
    table : TableReference
        The table that has to be contained in the join graph

    Raises
    ------
    ValueError
        If the join graph does not contain the table
    """
    if table in self:
        return
    raise ValueError(f"Join graph does not contain table {table}")
1058
+
1059
def _check_pk_fk_join(self, pk_table: TableReference, edge_data: dict) -> bool:
    """Tests whether a table is the primary key partner in the join that is described by a join graph edge.

    For each base predicate of the edge's join predicate, the join partner of the `pk_table` is determined and checked
    via `is_pk_fk_join`. The first successful check terminates the search.

    Parameters
    ----------
    pk_table : TableReference
        The table that is supposed to act as the primary key partner
    edge_data : dict
        The data of the join graph edge. The join predicate has to be stored under the ``"predicate"`` key.

    Returns
    -------
    bool
        Whether `is_pk_fk_join` succeeds for at least one base predicate, with `pk_table` as the primary key table.
    """
    join_predicate: AbstractPredicate = edge_data["predicate"]
    return any(
        self.is_pk_fk_join(
            util.simplify(
                {column.table for column in base_predicate.join_partners_of(pk_table)}
            ),
            pk_table,
        )
        for base_predicate in join_predicate.base_predicates()
    )
1082
+
1083
def _invalidate_indexes_on(self, table: TableReference) -> None:
    """Invalidates the index structures of all columns of a specific table.

    Parameters
    ----------
    table : TableReference
        The table whose column indexes should be invalidated
    """
    indexes_of_table = (
        index
        for column, index in self._index_structures.items()
        if column.table == table
    )
    for index in indexes_of_table:
        index.invalidate()
1094
+
1095
def _fetch_join_predicate(
    self, first_table: TableReference, second_table: TableReference
) -> Optional[AbstractPredicate]:
    """Looks up the join predicate that is stored on the edge between two tables.

    Parameters
    ----------
    first_table : TableReference
        One end of the edge
    second_table : TableReference
        The other end of the edge

    Returns
    -------
    Optional[AbstractPredicate]
        The predicate of the edge, or ``None`` if the join graph has no edge between the tables. The status of the
        join partners does not matter.
    """
    edges = self._graph.edges
    if (first_table, second_table) in edges:
        return edges[first_table, second_table]["predicate"]
    return None
1115
+
1116
def _index_info_for(self, table: TableReference) -> Collection[IndexInfo]:
    """Gathers the index info of all columns that belong to a specific table.

    Parameters
    ----------
    table : TableReference
        The table whose index info should be retrieved

    Returns
    -------
    Collection[IndexInfo]
        The index info for each column of the table that is tracked by the join graph. Columns that are not tracked in
        the index structures do not appear in the result.
    """
    matching_info = []
    for info in self._index_structures.values():
        if info._column.belongs_to(table):
            matching_info.append(info)
    return matching_info
1135
+
1136
def __len__(self) -> int:
    """Provides the length of the underlying graph."""
    graph = self._graph
    return len(graph)
1138
+
1139
def __iter__(self) -> Iterator[TableReference]:
    """Iterates over the nodes (i.e. the tables) of the underlying graph."""
    tables = self._graph.nodes
    return iter(tables)
1141
+
1142
def __contains__(self, x: object) -> bool:
    """Tests whether an object is one of the nodes of the underlying graph."""
    tables = self._graph.nodes
    return x in tables
1144
+
1145
def __getitem__(self, key: TableReference) -> TableInfo:
    """Provides the current state of a specific table.

    The resulting info combines whether the table is still free with the index info of its columns.

    Raises
    ------
    KeyError
        If the table is not part of the join graph
    """
    if key in self:
        return TableInfo(self.is_free_table(key), self._index_info_for(key))
    raise KeyError(f"Table {key} is not part of the join graph")