PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. postbound/__init__.py +211 -0
  2. postbound/_base.py +6 -0
  3. postbound/_bench.py +1012 -0
  4. postbound/_core.py +1153 -0
  5. postbound/_hints.py +1373 -0
  6. postbound/_jointree.py +1079 -0
  7. postbound/_pipelines.py +1121 -0
  8. postbound/_qep.py +1986 -0
  9. postbound/_stages.py +876 -0
  10. postbound/_validation.py +734 -0
  11. postbound/db/__init__.py +72 -0
  12. postbound/db/_db.py +2348 -0
  13. postbound/db/_duckdb.py +785 -0
  14. postbound/db/mysql.py +1195 -0
  15. postbound/db/postgres.py +4216 -0
  16. postbound/experiments/__init__.py +12 -0
  17. postbound/experiments/analysis.py +674 -0
  18. postbound/experiments/benchmarking.py +54 -0
  19. postbound/experiments/ceb.py +877 -0
  20. postbound/experiments/interactive.py +105 -0
  21. postbound/experiments/querygen.py +334 -0
  22. postbound/experiments/workloads.py +980 -0
  23. postbound/optimizer/__init__.py +92 -0
  24. postbound/optimizer/__init__.pyi +73 -0
  25. postbound/optimizer/_cardinalities.py +369 -0
  26. postbound/optimizer/_joingraph.py +1150 -0
  27. postbound/optimizer/dynprog.py +1825 -0
  28. postbound/optimizer/enumeration.py +432 -0
  29. postbound/optimizer/native.py +539 -0
  30. postbound/optimizer/noopt.py +54 -0
  31. postbound/optimizer/presets.py +147 -0
  32. postbound/optimizer/randomized.py +650 -0
  33. postbound/optimizer/tonic.py +1479 -0
  34. postbound/optimizer/ues.py +1607 -0
  35. postbound/qal/__init__.py +343 -0
  36. postbound/qal/_qal.py +9678 -0
  37. postbound/qal/formatter.py +1089 -0
  38. postbound/qal/parser.py +2344 -0
  39. postbound/qal/relalg.py +4257 -0
  40. postbound/qal/transform.py +2184 -0
  41. postbound/shortcuts.py +70 -0
  42. postbound/util/__init__.py +46 -0
  43. postbound/util/_errors.py +33 -0
  44. postbound/util/collections.py +490 -0
  45. postbound/util/dataframe.py +71 -0
  46. postbound/util/dicts.py +330 -0
  47. postbound/util/jsonize.py +68 -0
  48. postbound/util/logging.py +106 -0
  49. postbound/util/misc.py +168 -0
  50. postbound/util/networkx.py +401 -0
  51. postbound/util/numbers.py +438 -0
  52. postbound/util/proc.py +107 -0
  53. postbound/util/stats.py +37 -0
  54. postbound/util/system.py +48 -0
  55. postbound/util/typing.py +35 -0
  56. postbound/vis/__init__.py +5 -0
  57. postbound/vis/fdl.py +69 -0
  58. postbound/vis/graphs.py +48 -0
  59. postbound/vis/optimizer.py +538 -0
  60. postbound/vis/plots.py +84 -0
  61. postbound/vis/tonic.py +70 -0
  62. postbound/vis/trees.py +105 -0
  63. postbound-0.19.0.dist-info/METADATA +355 -0
  64. postbound-0.19.0.dist-info/RECORD +67 -0
  65. postbound-0.19.0.dist-info/WHEEL +5 -0
  66. postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
  67. postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/_hints.py ADDED
@@ -0,0 +1,1373 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import math
5
+ from collections.abc import Collection, Iterable
6
+ from enum import Enum
7
+ from typing import Any, Literal, Optional
8
+
9
+ from . import util
10
+ from ._base import T
11
+ from ._core import (
12
+ Cardinality,
13
+ IntermediateOperator,
14
+ JoinOperator,
15
+ PhysicalOperator,
16
+ ScanOperator,
17
+ TableReference,
18
+ )
19
+ from ._qep import PlanEstimates, PlanParams, QueryPlan
20
+ from .qal import parser
21
+ from .util import jsondict
22
+
23
+
24
class ScanOperatorAssignment:
    """Models the selection of a scan operator for a specific base table.

    Attributes
    -------
    operator : ScanOperators
        The selected operator
    table : TableReference
        The table that is scanned using the operator
    parallel_workers : float | int
        The number of parallel processes that should be used to execute the scan. Can be set to 1 to indicate
        sequential operation. Defaults to NaN to indicate that no choice has been made.
    """

    __match_args__ = ("operator", "table", "parallel_workers")

    def __init__(
        self,
        operator: ScanOperator,
        table: TableReference,
        parallel_workers: float | int = math.nan,
    ) -> None:
        self._operator = operator
        self._table = table
        self._parallel_workers = parallel_workers
        # assignments are immutable, so the hash can be pre-computed once
        self._hash_val = hash((self._operator, self._table, self._parallel_workers))

    @property
    def operator(self) -> ScanOperator:
        """Get the assigned operator.

        Returns
        -------
        ScanOperators
            The operator
        """
        return self._operator

    @property
    def table(self) -> TableReference:
        """Get the table being scanned.

        Returns
        -------
        TableReference
            The table
        """
        return self._table

    @property
    def parallel_workers(self) -> int | float:
        """Get the number of parallel workers used for the scan.

        This number designates the total number of parallel processes. It can be 1 to indicate sequential operation,
        or even *NaN* if it is unknown.

        Returns
        -------
        int | float
            The number of workers
        """
        return self._parallel_workers

    def inspect(self) -> str:
        """Provides the scan as a natural string.

        Returns
        -------
        str
            A string representation of the assignment
        """
        if not self.operator:
            return ""
        return f"USING {self.operator}"

    def __json__(self) -> jsondict:
        return {
            "operator": self.operator.value,
            "table": self.table,
            "parallel_workers": self.parallel_workers,
        }

    def __hash__(self) -> int:
        return self._hash_val

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, type(self)):
            return False
        return (
            self.operator == other.operator
            and self.table == other.table
            and self.parallel_workers == other.parallel_workers
        )

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"{self.operator.value}({self.table})"
120
+
121
+
122
class JoinOperatorAssignment:
    """Models the selection of a join operator for a specific join of tables.

    Each join is identified by all base tables that are involved in the join. The assignment to intermediate results
    does not matter here. For example, a join between R ⨝ S and T is expressed as R, S, T even though the actual join
    combined an intermediate result with as base table.

    A more verbose model is provided by the `DirectionalJoinOperatorAssignment`. In addition to the joined tables,
    that model also distinguishes between inner and outer relation of the join.

    Parameters
    ----------
    operator : JoinOperators
        The selected operator
    join : Collection[TableReference]
        The base tables that are joined using the operator
    parallel_workers : float | int, optional
        The number of parallel processes that should be used to execute the join. Can be set to 1 to indicate
        sequential operation. Defaults to NaN to indicate that no choice has been made.

    Raises
    ------
    ValueError
        If `join` contains less than 2 tables
    """

    __match_args__ = ("operator", "join", "parallel_workers")

    def __init__(
        self,
        operator: JoinOperator,
        join: Collection[TableReference],
        *,
        parallel_workers: float | int = math.nan,
    ) -> None:
        if len(join) < 2:
            raise ValueError("At least 2 join tables must be given")
        self._operator = operator
        self._join = frozenset(join)
        self._parallel_workers = parallel_workers
        # assignments are immutable, so the hash can be pre-computed once
        self._hash_val = hash((self._operator, self._join, self._parallel_workers))

    @property
    def operator(self) -> JoinOperator:
        """Get the operator that was selected for the join.

        Returns
        -------
        JoinOperators
            The operator
        """
        return self._operator

    @property
    def join(self) -> frozenset[TableReference]:
        """Get the tables that are joined together.

        For joins of more than 2 base tables this usually means that the join combines an intermediate result with a
        base table or another intermediate result. These two cases are not distinguished by the assignment and have
        to be detected through other information, e.g. the join tree.

        The more verbose model of a `DirectionalJoinOperatorAssignment` also distinguishes between inner and outer
        relations.

        Returns
        -------
        frozenset[TableReference]
            The tables that are joined together
        """
        return self._join

    @property
    def intermediate(self) -> frozenset[TableReference]:
        """Alias for `join`"""
        return self._join

    @property
    def parallel_workers(self) -> float | int:
        """Get the number of parallel processes that should be used in the join.

        "Processes" does not necessarily mean "system processes". The database system can also choose to use threads
        or other means of parallelization. This is not restricted by the join assignment.

        Returns
        -------
        float | int
            The number processes to use. Can be 1 to indicate sequential processing or NaN to indicate that no choice
            has been made.
        """
        return self._parallel_workers

    def inspect(self) -> str:
        """Provides this assignment as a natural string.

        Returns
        -------
        str
            A string representation of the assignment.
        """
        if not self.operator:
            return ""
        return f"USING {self.operator}"

    def is_directional(self) -> bool:
        """Checks, whether this assignment contains directional information, i.e. regarding inner and outer relation.

        Returns
        -------
        bool
            Whether the assignment explicitly denotes which relation should be the inner relationship and which
            relation should be the outer relationship
        """
        return False

    def __json__(self) -> jsondict:
        return {
            "directional": self.is_directional(),
            "operator": self.operator.value,
            "join": self.join,
            "parallel_workers": self.parallel_workers,
        }

    def __hash__(self) -> int:
        return self._hash_val

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, type(self)):
            return False
        return (
            self._operator == other._operator
            and self._join == other._join
            and self._parallel_workers == other._parallel_workers
        )

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        joined_tables = ", ".join(str(tab) for tab in self._join)
        return f"{self.operator.value}({joined_tables})"
259
+
260
+
261
class DirectionalJoinOperatorAssignment(JoinOperatorAssignment):
    """A more verbose model of join operators.

    The directional assignment does not only represent the relations that should be joined together, but also denotes
    which role they should play for the join. More specifically, the directional assignment provides the *inner* and
    *outer* relation of the join. The precise semantics of this distinction depends on the specific join operator and
    is also used inconsistently between different database systems. In PostBOUND we use the following definitions:

    - for nested-loop joins the outer relation corresponds to the outer loop and the inner relation is the inner loop.
      As a special case for index nested-loop joins the inner relation is the one that is probed via an index
    - for hash joins the outer relation is the one that is aggregated in a hash table and the inner relation is the
      one that is probed against that table
    - for sort-merge joins the assignment does not matter

    Parameters
    ----------
    operator : JoinOperators
        The selected operator
    inner : Collection[TableReference]
        The tables that form the inner relation of the join
    outer : Collection[TableReference]
        The tables that form the outer relation of the join
    parallel_workers : float | int, optional
        The number of parallel processes that should be used to execute the join. Can be set to 1 to indicate
        sequential operation. Defaults to NaN to indicate that no choice has been made.

    Raises
    ------
    ValueError
        If either `inner` or `outer` is empty.
    """

    def __init__(
        self,
        operator: JoinOperator,
        inner: Collection[TableReference],
        outer: Collection[TableReference],
        *,
        parallel_workers: float | int = math.nan,
    ) -> None:
        if not inner or not outer:
            raise ValueError("Both inner and outer relations must be given")
        self._inner = frozenset(inner)
        self._outer = frozenset(outer)
        super().__init__(
            operator, self._inner | self._outer, parallel_workers=parallel_workers
        )

    __match_args__ = ("operator", "outer", "inner", "parallel_workers")

    @property
    def inner(self) -> frozenset[TableReference]:
        """Get the inner relation of the join.

        Returns
        -------
        frozenset[TableReference]
            The tables of the inner relation
        """
        return self._inner

    @property
    def outer(self) -> frozenset[TableReference]:
        """Get the outer relation of the join.

        Returns
        -------
        frozenset[TableReference]
            The tables of the outer relation
        """
        return self._outer

    def is_directional(self) -> bool:
        return True

    def __json__(self) -> jsondict:
        return {
            "directional": True,
            # serialize the enum's value instead of the member itself -- this keeps the payload JSON-serializable
            # and consistent with ScanOperatorAssignment.__json__ / JoinOperatorAssignment.__json__, which
            # read_operator_json relies on when reconstructing the operator via JoinOperator(json_data["operator"])
            "operator": self.operator.value,
            "inner": self.inner,
            "outer": self.outer,
            "parallel_workers": self.parallel_workers,
        }

    __hash__ = JoinOperatorAssignment.__hash__

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, type(self))
            and self._inner == other._inner
            and self._outer == other._outer
            and super().__eq__(other)
        )
354
+
355
+
356
def read_operator_json(
    json_data: dict | str,
) -> Optional[PhysicalOperator | ScanOperatorAssignment | JoinOperatorAssignment]:
    """Reads a physical operator assignment from a JSON dictionary.

    Parameters
    ----------
    json_data : dict | str
        Either the JSON dictionary, or a string encoding of the dictionary (which will be parsed by *json.loads*).
        A plain string that matches an operator value is resolved to the operator itself.

    Returns
    -------
    Optional[ScanOperators | JoinOperators | ScanOperatorAssignment | JoinOperatorAssignment]
        The parsed assignment. Whether it is a scan or join assignment is inferred from the JSON dictionary. If the input is
        empty or *None*, *None* is returned.

    Raises
    ------
    ValueError
        If the dictionary describes neither a scan (*table* key) nor a join (*join* key, or *inner* and *outer* keys).
    """
    if not json_data:
        return None

    if isinstance(json_data, str):
        # a plain string can directly encode an operator instead of a whole assignment dictionary
        if json_data in {op.value for op in ScanOperator}:
            return ScanOperator(json_data)
        elif json_data in {op.value for op in JoinOperator}:
            return JoinOperator(json_data)
        elif json_data in {op.value for op in IntermediateOperator}:
            return IntermediateOperator(json_data)
        else:
            json_data = json.loads(json_data)

    parallel_workers = json_data.get("parallel_workers", math.nan)

    if "table" in json_data:
        parsed_table = parser.load_table_json(json_data["table"])
        scan_operator = ScanOperator(json_data["operator"])
        return ScanOperatorAssignment(scan_operator, parsed_table, parallel_workers)
    elif "join" not in json_data and not (
        "inner" in json_data and "outer" in json_data
    ):
        raise ValueError(
            f"Malformed operator JSON: either 'table' or 'join' must be given: '{json_data}'"
        )

    # The "directional" flag may be absent (the validation above explicitly accepts dicts that only provide
    # "inner"/"outer"). In that case, infer directionality from the keys that are present instead of raising KeyError.
    directional = json_data.get(
        "directional", "inner" in json_data and "outer" in json_data
    )
    join_operator = JoinOperator(json_data["operator"])
    if directional:
        inner = [parser.load_table_json(tab) for tab in json_data["inner"]]
        outer = [parser.load_table_json(tab) for tab in json_data["outer"]]
        return DirectionalJoinOperatorAssignment(
            join_operator, inner, outer, parallel_workers=parallel_workers
        )

    joined_tables = [parser.load_table_json(tab) for tab in json_data["join"]]
    return JoinOperatorAssignment(
        join_operator, joined_tables, parallel_workers=parallel_workers
    )
411
+
412
+
413
+ class PhysicalOperatorAssignment:
414
+ """The physical operator assignment stores the operators that should be used for specific joins or scans.
415
+
416
+ The assignment can happen at different levels:
417
+
418
+ - `global_settings` enable or disable specific operators for the entire query
419
+ - `join_operators` and `scan_operators` are concerned with specific (joins of) base tables. These assignments overwrite the
420
+ global settings, i.e. it is possible to assign a nested loop join to a specific set of tables, but disable NLJ globally.
421
+ In this case, only the specified join will be executed as an NLJ and other algorithms are used for all other joins
422
+ - `intermediate_operators` are used to pre-process the input for joins, e.g. by caching input tuples in a memo.
423
+
424
+ The basic assumption here is that for all joins and scans that have no assignment, the database system should determine the
425
+ best operators by itself. Likewise, the database system is free to insert intermediate operators wherever it sees fit.
426
+
427
+ Although it is allowed to modify the different dictionaries directly, the high-level methods (e.g. `add` or
428
+ `set_join_operator`) should be used instead. This ensures that all potential (future) invariants are maintained.
429
+
430
+ The assignment enables ``__getitem__`` access and tries to determine the requested setting in an intelligent way, i.e.
431
+ supplying a single base table will provide the associated scan operator, supplying an iterable of base tables the join
432
+ operator and supplying an operator will return the global setting. If no item is found, *None* will be returned.
433
+ ``__iter__`` and ``__contains__`` wrap scan and join operators and ``__bool__`` checks for any assignment
434
+ (global or specific). Notice that intermediate operators are not considered in the container-like methods.
435
+
436
+ Attributes
437
+ ----------
438
+ global_settings : dict[ScanOperators | JoinOperators | IntermediateOperator, bool]
439
+ Contains the global settings. Each operator is mapped to whether it is enable for the entire query or not. If an
440
+ operator is not present in the dictionary, the default setting of the database system is used.
441
+ join_operators : dict[frozenset[TableReference], JoinOperatorAssignment]
442
+ Contains the join operators that should be used for individual joins. All joins are identified by the base tables that
443
+ they combine. If a join does not appear in this dictionary, the database system has to choose an appropriate operator
444
+ (perhaps while considering the `global_settings`).
445
+ scan_operators : dict[TableReference, ScanOperatorAssignment]
446
+ Contains the scan operators that should be used for individual base table scans. Each scan is identified by the table
447
+ that should be scanned. If a table does not appear in this dictionary, the database system has to choose an appropriate
448
+ operator (perhaps while considering the `global_settings`).
449
+ intermediate_operators : dict[frozenset[TableReference], IntermediateOperator]
450
+ Contains the intermediate operators that are used to pre-process the input for joins. Keys are the intermediate tables
451
+ that are processed by the operator, i.e. an entry ``intermediate_operators[{R, S}] = Materialize`` means that the
452
+ result of the join between *R* and *S* should be materialized and *not* that the input to the join between *R* and *S*
453
+ should be materialized. Notice that intermediate operators are not enforced in conjunction with the join operators. For
454
+ example, a merge join assignment between *R* and *S* does not require the presence of sort operators for *R* and *S*.
455
+ Such interactions must be handled by the database hinting backend.
456
+ """
457
+
458
    def __init__(self) -> None:
        # Maps each operator to whether it is enabled query-wide; operators absent from the dict fall back to the
        # database system's default behavior.
        self.global_settings: dict[
            ScanOperator | JoinOperator | IntermediateOperator, bool
        ] = {}
        # Maps the set of joined base tables to the operator assignment for that join.
        self.join_operators: dict[
            frozenset[TableReference], JoinOperatorAssignment
        ] = {}
        # Maps an intermediate (set of base tables) to the operator that processes its result, e.g. Materialize.
        self.intermediate_operators: dict[
            frozenset[TableReference], IntermediateOperator
        ] = {}
        # Maps each base table to the operator assignment for its scan.
        self.scan_operators: dict[TableReference, ScanOperatorAssignment] = {}
469
+
470
+ def get_globally_enabled_operators(
471
+ self, include_by_default: bool = True
472
+ ) -> frozenset[PhysicalOperator]:
473
+ """Provides all operators that are enabled globally.
474
+
475
+ This differs from just calling ``assignment.global_settings`` directly, since all operators are checked, not just the
476
+ operators that appear in the global settings dictionary.
477
+
478
+ Parameters
479
+ ----------
480
+ include_by_default : bool, optional
481
+ The behaviour for operators that do not have a global setting set. If enabled, such operators are assumed to be
482
+ enabled and are hence included in the set.
483
+
484
+ Returns
485
+ -------
486
+ frozenset[PhysicalOperator]
487
+ The enabled scan and join operators. If no global setting is available for an operator `include_by_default`
488
+ determines the appropriate action.
489
+ """
490
+ enabled_scan_ops = [
491
+ scan_op
492
+ for scan_op in ScanOperator
493
+ if self.global_settings.get(scan_op, include_by_default)
494
+ ]
495
+ enabled_join_ops = [
496
+ join_op
497
+ for join_op in JoinOperator
498
+ if self.global_settings.get(join_op, include_by_default)
499
+ ]
500
+ enabled_intermediate_ops = [
501
+ intermediate_op
502
+ for intermediate_op in IntermediateOperator
503
+ if self.global_settings.get(intermediate_op, include_by_default)
504
+ ]
505
+ return frozenset(enabled_scan_ops + enabled_join_ops + enabled_intermediate_ops)
506
+
507
+ def set_operator_enabled_globally(
508
+ self,
509
+ operator: PhysicalOperator,
510
+ enabled: bool,
511
+ *,
512
+ overwrite_fine_grained_selection: bool = False,
513
+ ) -> None:
514
+ """Enables or disables an operator for all parts of a query.
515
+
516
+ Parameters
517
+ ----------
518
+ operator : PhysicalOperator
519
+ The operator to configure
520
+ enabled : bool
521
+ Whether the database system is allowed to choose the operator
522
+ overwrite_fine_grained_selection : bool, optional
523
+ How to deal with assignments of the same operator to individual nodes. If *True* all assignments that contradict
524
+ the setting are removed. For example, consider a situation where nested-loop joins should be disabled globally, but
525
+ a specific join has already been assigned to be executed with an NLJ. In this case, setting
526
+ `overwrite_fine_grained_selection` removes the assignment for the specific join. This is off by default, to enable
527
+ the per-node selection to overwrite global settings.
528
+ """
529
+ self.global_settings[operator] = enabled
530
+
531
+ if not overwrite_fine_grained_selection or enabled:
532
+ return
533
+
534
+ # at this point we know that we should disable a scan or join operator that was potentially set for
535
+ # individual joins or tables
536
+ match operator:
537
+ case ScanOperator():
538
+ self.scan_operators = {
539
+ table: current_setting
540
+ for table, current_setting in self.scan_operators.items()
541
+ if current_setting != operator
542
+ }
543
+ case JoinOperator():
544
+ self.join_operators = {
545
+ join: current_setting
546
+ for join, current_setting in self.join_operators.items()
547
+ if current_setting != operator
548
+ }
549
+ case IntermediateOperator():
550
+ self.intermediate_operators = {
551
+ join: current_setting
552
+ for join, current_setting in self.intermediate_operators.items()
553
+ if current_setting != operator
554
+ }
555
+ case _:
556
+ raise ValueError(f"Unknown operator type: {operator}")
557
+
558
+ def set_join_operator(
559
+ self,
560
+ operator: JoinOperatorAssignment | JoinOperator,
561
+ tables: Iterable[TableReference] | None = None,
562
+ ) -> None:
563
+ """Enforces a specific join operator for the join that consists of the contained tables.
564
+
565
+ This overwrites all previous assignments for the same join. Global settings are left unmodified since per-join settings
566
+ overwrite them anyway.
567
+
568
+ Parameters
569
+ ----------
570
+ join_operator : JoinOperatorAssignment | JoinOperator
571
+ The join operator. Can be an entire assignment, or just a plain operator. If a plain operator is supplied, the
572
+ actual tables to join must be provided in the `tables` parameter.
573
+ tables : Iterable[TableReference], optional
574
+ The tables to join. This parameter is only used if only a join operator without a proper assignment is supplied in
575
+ the `join_operator` parameter. Otherwise it is ignored.
576
+
577
+ Notes
578
+ -----
579
+
580
+ You can also pass a `DirectionalJoinOperatorAssignment` to this method. In contrast to the normal assignment, this
581
+ one also distinguishes between inner and outer relations of the join.
582
+ """
583
+ if isinstance(operator, JoinOperator):
584
+ operator = JoinOperatorAssignment(operator, tables)
585
+
586
+ self.join_operators[operator.join] = operator
587
+
588
    def set_scan_operator(
        self,
        operator: ScanOperatorAssignment | ScanOperator,
        table: TableReference | Iterable[TableReference] | None = None,
    ) -> None:
        """Enforces a specific scan operator for the contained base table.

        This overwrites all previous assignments for the same table. Global settings are left unmodified since per-table
        settings overwrite them anyway.

        Parameters
        ----------
        operator : ScanOperatorAssignment | ScanOperator
            The scan operator. Can be an entire assignment, or just a plain operator. If a plain operator is supplied, the
            actual table to scan must be provided in the `table` parameter.
        table : TableReference | Iterable[TableReference], optional
            The table to scan. This parameter is only used if only a scan operator without a proper assignment is supplied in
            the `operator` parameter. Otherwise it is ignored.
        """
        if isinstance(operator, ScanOperator):
            # unwrap single-element iterables to a plain table reference before building the assignment
            table = util.simplify(table)
            operator = ScanOperatorAssignment(operator, table)

        self.scan_operators[operator.table] = operator
612
+
613
    def set_intermediate_operator(
        self, operator: IntermediateOperator, tables: Iterable[TableReference]
    ) -> None:
        """Enforces an intermediate operator to process specific tables.

        This overwrites all previous assignments for the same intermediate. Global settings are left unmodified since
        per-intermediate settings overwrite them anyway.

        Parameters
        ----------
        operator : IntermediateOperator
            The intermediate operator
        tables : Iterable[TableReference]
            The tables to process. Notice that these tables are not the tables that are joined, but the input to the join.
            For example, consider a nested-loop join between *R* and *S* where the tuples from *S* should be materialized
            (perhaps because they stem from an expensive index access). In this case, the assignment should contain a
            nested-loop assignment for the intermediate *{R, S}* and an assignment for the materialize operator for *S*.

        """
        self.intermediate_operators[frozenset(tables)] = operator
633
+
634
+ def add(
635
+ self,
636
+ operator: ScanOperatorAssignment | JoinOperatorAssignment | PhysicalOperator,
637
+ tables: Iterable[TableReference] | None = None,
638
+ ) -> None:
639
+ """Adds an arbitrary operator assignment to the current settings.
640
+
641
+ In contrast to the `set_scan_operator` and `set_join_operator` methods, this method figures out the correct assignment
642
+ type based on the input.
643
+
644
+ Parameters
645
+ ----------
646
+ operator : ScanOperatorAssignment | JoinOperatorAssignment | PhysicalOperator
647
+ The operator to use. If this is a complete assignment, it is used as such. Otherwise, the `tables` parameter must
648
+ contain the tables that are affected by the operator.
649
+ tables : Iterable[TableReference] | None, optional
650
+ The tables to join. This parameter is only used if a plain operator is supplied in the `operator` parameter.
651
+ Otherwise it is ignored.
652
+ """
653
+ match operator:
654
+ case ScanOperator():
655
+ self.set_scan_operator(operator, tables)
656
+ case JoinOperator():
657
+ self.set_join_operator(operator, tables)
658
+ case ScanOperatorAssignment():
659
+ self.set_scan_operator(operator)
660
+ case JoinOperatorAssignment():
661
+ self.set_join_operator(operator)
662
+ case IntermediateOperator():
663
+ self.set_intermediate_operator(operator, tables)
664
+ case _:
665
+ raise ValueError(f"Unknown operator assignment: {operator}")
666
+
667
+ def merge_with(
668
+ self, other_assignment: PhysicalOperatorAssignment
669
+ ) -> PhysicalOperatorAssignment:
670
+ """Combines the current assignment with additional operators.
671
+
672
+ In case of assignments to the same operators, the settings from the other assignment take precedence. None of the input
673
+ assignments are modified.
674
+
675
+ Parameters
676
+ ----------
677
+ other_assignment : PhysicalOperatorAssignment
678
+ The assignment to combine with the current assignment
679
+
680
+ Returns
681
+ -------
682
+ PhysicalOperatorAssignment
683
+ The combined assignment
684
+ """
685
+ merged_assignment = PhysicalOperatorAssignment()
686
+ merged_assignment.global_settings = (
687
+ self.global_settings | other_assignment.global_settings
688
+ )
689
+ merged_assignment.join_operators = (
690
+ self.join_operators | other_assignment.join_operators
691
+ )
692
+ merged_assignment.scan_operators = (
693
+ self.scan_operators | other_assignment.scan_operators
694
+ )
695
+ merged_assignment.intermediate_operators = (
696
+ self.intermediate_operators | other_assignment.intermediate_operators
697
+ )
698
+ return merged_assignment
699
+
700
    def integrate_workers_from(
        self, params: PlanParameterization, *, fail_on_missing: bool = False
    ) -> PhysicalOperatorAssignment:
        """Adds parallel workers from plan parameters to all matching operators.

        Parameters
        ----------
        params : PlanParameterization
            Parameters that provide the number of workers for specific intermediates
        fail_on_missing : bool, optional
            Whether to raise an error if the plan parameters contain worker hints for an intermediate that does not have
            an operator assigned. The default is to just ignore such hints.

        Returns
        -------
        PhysicalOperatorAssignment
            The updated assignment. The original assignment is not modified.

        Raises
        ------
        ValueError
            If `fail_on_missing` is set and a worker hint refers to an intermediate without an operator assignment.
        """
        # work on a copy so the original assignment stays untouched
        assignment = self.clone()

        for intermediate, workers in params.parallel_workers.items():
            operator = assignment.get(intermediate)
            if not operator and fail_on_missing:
                raise ValueError(
                    f"Cannot integrate workers - no operator set for {list(intermediate)}"
                )
            elif not operator:
                # no operator assigned for this intermediate -> silently ignore the worker hint
                continue

            # Re-create the assignment with the requested worker count. The directional case must be matched before
            # the plain join case because it is a subclass of JoinOperatorAssignment.
            # NOTE: DirectionalJoinOperatorAssignment.__match_args__ is (operator, outer, inner, ...), while its
            # constructor takes (operator, inner, outer, ...) -- hence the swapped argument order below is correct.
            match operator:
                case ScanOperatorAssignment(op, tab):
                    updated_assignment = ScanOperatorAssignment(op, tab, workers)
                case DirectionalJoinOperatorAssignment(op, outer, inner):
                    updated_assignment = DirectionalJoinOperatorAssignment(
                        op, inner, outer, parallel_workers=workers
                    )
                case JoinOperatorAssignment(op, join):
                    updated_assignment = JoinOperatorAssignment(
                        op, join, parallel_workers=workers
                    )
                case _:
                    raise RuntimeError(f"Unexpected operator type: {operator}")

            # add() overwrites the old assignment for the same intermediate
            assignment.add(updated_assignment)

        return assignment
746
+
747
+ def global_settings_only(self) -> PhysicalOperatorAssignment:
748
+ """Provides an assignment that only contains the global settings.
749
+
750
+ Changes to the global settings of the derived assignment are not reflected in this assignment and vice-versa.
751
+
752
+ Returns
753
+ -------
754
+ PhysicalOperatorAssignment
755
+ An assignment of the global settings
756
+ """
757
+ global_assignment = PhysicalOperatorAssignment()
758
+ global_assignment.global_settings = dict(self.global_settings)
759
+ return global_assignment
760
+
761
+ def clone(self) -> PhysicalOperatorAssignment:
762
+ """Provides a copy of the current settings.
763
+
764
+ Changes to the copy are not reflected back on this assignment and vice-versa.
765
+
766
+ Returns
767
+ -------
768
+ PhysicalOperatorAssignment
769
+ The copy
770
+ """
771
+ cloned_assignment = PhysicalOperatorAssignment()
772
+ cloned_assignment.global_settings = dict(self.global_settings)
773
+ cloned_assignment.join_operators = dict(self.join_operators)
774
+ cloned_assignment.scan_operators = dict(self.scan_operators)
775
+ cloned_assignment.intermediate_operators = dict(self.intermediate_operators)
776
+ return cloned_assignment
777
+
778
+ def get(
779
+ self,
780
+ intermediate: TableReference | Iterable[TableReference],
781
+ default: Optional[T] = None,
782
+ ) -> Optional[ScanOperatorAssignment | JoinOperatorAssignment | T]:
783
+ """Retrieves the operator assignment for a specific scan or join.
784
+
785
+ This is similar to the *dict.get* method. An important distinction is that we never raise an error if there is no
786
+ intermediate assigned to the operator. Instead, we return the default value, which is *None* by default.
787
+
788
+ Notice that this method never provides intermediate operators!
789
+
790
+ Parameters
791
+ ----------
792
+ intermediate : TableReference | Iterable[TableReference]
793
+ The intermediate to retrieve the operator assignment for. For scans, either the scanned table can be given
794
+ directly, or the table can be wrapped in a singleton iterable.
795
+ default : Optional[T], optional
796
+ The default value to return if no assignment is found. Defaults to *None*.
797
+
798
+ Returns
799
+ -------
800
+ Optional[ScanOperatorAssignment | JoinOperatorAssignment | T]
801
+ The assignment if it was found or the default value otherwise.
802
+ """
803
+ if isinstance(intermediate, TableReference):
804
+ return self.scan_operators.get(intermediate, default)
805
+
806
+ intermediate_set = frozenset(intermediate)
807
+ return (
808
+ self.scan_operators.get(intermediate)
809
+ if len(intermediate_set) == 1
810
+ else self.join_operators.get(intermediate_set, default)
811
+ )
812
+
813
+ def __json__(self) -> jsondict:
814
+ jsonized = {
815
+ "global_settings": [],
816
+ "scan_operators": [
817
+ {"table": scan.table, "operator": scan.operator}
818
+ for scan in self.scan_operators.values()
819
+ ],
820
+ "join_operators": [
821
+ {"intermediate": join.join, "operator": join.operator}
822
+ for join in self.join_operators.values()
823
+ ],
824
+ "intermediate_operators": [
825
+ {"intermediate": intermediate, "operator": op}
826
+ for intermediate, op in self.intermediate_operators.items()
827
+ ],
828
+ }
829
+
830
+ global_settings: list[dict] = []
831
+ for operator, enabled in self.global_settings.items():
832
+ match operator:
833
+ case ScanOperator():
834
+ global_settings.append(
835
+ {"operator": operator, "enabled": enabled, "kind": "scan"}
836
+ )
837
+ case JoinOperator():
838
+ global_settings.append(
839
+ {"operator": operator, "enabled": enabled, "kind": "join"}
840
+ )
841
+ case IntermediateOperator():
842
+ global_settings.append(
843
+ {
844
+ "operator": operator,
845
+ "enabled": enabled,
846
+ "kind": "intermediate",
847
+ }
848
+ )
849
+ jsonized["global_settings"] = global_settings
850
+
851
+ return jsonized
852
+
853
+ def __bool__(self) -> bool:
854
+ return (
855
+ bool(self.global_settings)
856
+ or bool(self.join_operators)
857
+ or bool(self.scan_operators)
858
+ or bool(self.intermediate_operators)
859
+ )
860
+
861
+ def __iter__(self) -> Iterable[ScanOperatorAssignment | JoinOperatorAssignment]:
862
+ yield from self.scan_operators.values()
863
+ yield from self.join_operators.values()
864
+
865
+ def __contains__(self, item: TableReference | Iterable[TableReference]) -> bool:
866
+ if isinstance(item, TableReference):
867
+ return item in self.scan_operators
868
+
869
+ items = frozenset(item)
870
+ return (
871
+ item in self.scan_operators
872
+ if len(items) == 1
873
+ else items in self.join_operators
874
+ )
875
+
876
+ def __getitem__(
877
+ self,
878
+ item: TableReference | Iterable[TableReference] | ScanOperator | JoinOperator,
879
+ ) -> ScanOperatorAssignment | JoinOperatorAssignment | bool | None:
880
+ if isinstance(item, ScanOperator) or isinstance(item, JoinOperator):
881
+ return self.global_settings.get(item, None)
882
+ elif isinstance(item, TableReference):
883
+ return self.scan_operators.get(item, None)
884
+ elif isinstance(item, Iterable):
885
+ return self.join_operators.get(frozenset(item), None)
886
+ else:
887
+ return None
888
+
889
+ def __hash__(self) -> int:
890
+ return hash(
891
+ (
892
+ util.hash_dict(self.global_settings),
893
+ util.hash_dict(self.scan_operators),
894
+ util.hash_dict(self.join_operators),
895
+ )
896
+ )
897
+
898
+ def __eq__(self, other: object) -> bool:
899
+ return (
900
+ isinstance(other, type(self))
901
+ and self.global_settings == other.global_settings
902
+ and self.scan_operators == other.scan_operators
903
+ and self.join_operators == other.join_operators
904
+ )
905
+
906
    def __repr__(self) -> str:
        # Delegate to __str__: the human-readable rendering doubles as the debug representation.
        return str(self)
909
+ def __str__(self) -> str:
910
+ global_str = ", ".join(
911
+ f"{op.value}: {enabled}" for op, enabled in self.global_settings.items()
912
+ )
913
+
914
+ scans_str = ", ".join(
915
+ f"{scan.table.identifier()}: {scan.operator.value}"
916
+ for scan in self.scan_operators.values()
917
+ )
918
+
919
+ joins_keys = (
920
+ (join, " ⨝ ".join(tab.identifier() for tab in join.join))
921
+ for join in self.join_operators.values()
922
+ )
923
+ joins_str = ", ".join(
924
+ f"{key}: {join.operator.value}" for join, key in joins_keys
925
+ )
926
+
927
+ intermediates_keys = (
928
+ (intermediate, " ⨝ ".join(tab.identifier() for tab in intermediate))
929
+ for intermediate in self.intermediate_operators.keys()
930
+ )
931
+ intermediates_str = ", ".join(
932
+ f"{key}: {intermediate.value}" for intermediate, key in intermediates_keys
933
+ )
934
+
935
+ return f"global=[{global_str}] scans=[{scans_str}] joins=[{joins_str}] intermediates=[{intermediates_str}]"
936
+
937
+
938
+ def operators_from_plan(
939
+ query_plan: QueryPlan, *, include_workers: bool = False
940
+ ) -> PhysicalOperatorAssignment:
941
+ """Extracts the operator assignment from a whole query plan.
942
+
943
+ Notice that this method only adds parallel workers to the assignment if explicitly told to, since this is generally
944
+ better handled by the parameterization.
945
+ """
946
+ assignment = PhysicalOperatorAssignment()
947
+ if not query_plan.operator and query_plan.input_node:
948
+ return operators_from_plan(query_plan.input_node)
949
+
950
+ workers = query_plan.parallel_workers if include_workers else math.nan
951
+ match query_plan.operator:
952
+ case ScanOperator():
953
+ operator = ScanOperatorAssignment(
954
+ query_plan.operator,
955
+ query_plan.base_table,
956
+ workers,
957
+ )
958
+ assignment.add(operator)
959
+ case JoinOperator():
960
+ operator = JoinOperatorAssignment(
961
+ query_plan.operator,
962
+ query_plan.tables(),
963
+ parallel_workers=workers,
964
+ )
965
+ assignment.add(operator)
966
+ case _:
967
+ assignment.add(query_plan.operator, query_plan.tables())
968
+
969
+ for child in query_plan.children:
970
+ child_assignment = operators_from_plan(child)
971
+ assignment = assignment.merge_with(child_assignment)
972
+ return assignment
973
+
974
+
975
def read_operator_assignment_json(json_data: dict | str) -> PhysicalOperatorAssignment:
    """Loads an operator assignment from its JSON representation.

    Parameters
    ----------
    json_data : dict | str
        Either the JSON dictionary, or a string encoding of the dictionary (which will be parsed by *json.loads*).

    Returns
    -------
    PhysicalOperatorAssignment
        The assignment
    """
    if isinstance(json_data, str):
        json_data = json.loads(json_data)
    assignment = PhysicalOperatorAssignment()

    # Global on/off switches carry an explicit "kind" tag that selects the matching operator enum.
    for hint in json_data.get("global_settings", []):
        enabled = hint["enabled"]
        kind = hint["kind"]
        if kind == "scan":
            assignment.global_settings[ScanOperator(hint["operator"])] = enabled
        elif kind == "join":
            assignment.global_settings[JoinOperator(hint["operator"])] = enabled
        elif kind == "intermediate":
            assignment.global_settings[IntermediateOperator(hint["operator"])] = enabled
        else:
            raise ValueError(f"Unknown operator kind: {hint['kind']}")

    # Scans are keyed by their single table.
    for hint in json_data.get("scan_operators", []):
        scanned_table = parser.load_table_json(hint["table"])
        assignment.scan_operators[scanned_table] = ScanOperatorAssignment(
            ScanOperator(hint["operator"]), scanned_table
        )

    # Joins and intermediate operators are keyed by the set of joined tables.
    for hint in json_data.get("join_operators", []):
        joined_tables = frozenset(
            parser.load_table_json(tab) for tab in hint["intermediate"]
        )
        assignment.join_operators[joined_tables] = JoinOperatorAssignment(
            JoinOperator(hint["operator"]), joined_tables
        )

    for hint in json_data.get("intermediate_operators", []):
        target_tables = frozenset(
            parser.load_table_json(tab) for tab in hint["intermediate"]
        )
        assignment.intermediate_operators[target_tables] = IntermediateOperator(
            hint["operator"]
        )

    return assignment
1030
# Restricting the optimizer to all-sequential or all-parallel operators. *None* (i.e. no ExecutionMode at all) is used
# elsewhere to express "let the optimizer decide".
ExecutionMode = Literal["sequential", "parallel"]
"""
The execution mode indicates whether a query should be executed using either only sequential operators or only parallel
ones.
"""
1037
class PlanParameterization:
    """The plan parameterization stores metadata that is assigned to different parts of the plan.

    Currently, three types of parameters are supported:

    - `cardinalities` provide specific cardinality estimates for individual joins or tables. These can be used to overwrite
      the estimation of the native database system
    - `parallel_workers` indicate how many worker processes should be used to execute individual joins or table
      scans (assuming that the selected operator can be parallelized). Notice that this can also be indicated as part of
      the `PhysicalOperatorAssignment` which will take precedence over this setting.
    - `system_settings` can be used to enable or disable specific optimization or execution features of the target
      database. For example, they can be used to disable parallel execution or switch to another cardinality estimation
      method. Such settings should be used sparingly since they defeat the purpose of optimization algorithms that are
      independent of specific database systems. Using these settings can also modify properties of the connection and
      therefore affect later queries. It is the user's responsibility to reset such settings if necessary.

    In addition, the `execution_mode` can be used to control whether the optimizer should only consider sequential plans or
    parallel plans. Note that the `parallel_workers` take precedence over this setting. If the optimizer should decide
    whether a parallel execution is beneficial, this should be set to *None*.

    Although it is allowed to modify the different dictionaries directly, the more high-level methods should be used
    instead. This ensures that all potential (future) invariants are maintained.

    Attributes
    ----------
    cardinalities : dict[frozenset[TableReference], Cardinality]
        Contains the cardinalities for individual joins and scans. This is always the cardinality that is emitted by a
        specific operator. All joins are identified by the base tables that they combine. Keys of single tables correspond
        to scans. Each join should assume that all filter predicates that can be evaluated at this point have already been
        applied.
    parallel_workers : dict[frozenset[TableReference], int]
        Contains the number of parallel processes that should be used to execute a join or scan. All joins are identified
        by the base tables that they combine. Keys of single tables correspond to scans. "Processes" does not necessarily
        mean "system processes". The database system can also choose to use threads or other means of parallelization. This
        is not restricted by the join assignment.
    system_settings : dict[str, Any]
        Contains the settings for the target database system. The keys and values, as well as their usage depend entirely
        on the system. For example, in Postgres a setting like *enable_geqo = 'off'* can be used to disable the genetic
        optimizer.
    execution_mode : ExecutionMode | None
        Indicates whether the optimizer should only consider sequential plans, parallel plans, or leave the decision to the
        optimizer (*None*). The default is *None*.
    """

    def __init__(self) -> None:
        self.cardinalities: dict[frozenset[TableReference], Cardinality] = {}
        """
        Contains the cardinalities for individual joins and scans. This is always the cardinality that is emitted by a
        specific operator. All joins are identified by the base tables that they combine. Keys of single tables correspond
        to scans.
        Each join should assume that all filter predicates that can be evaluated at this point have already been applied.
        """

        self.parallel_workers: dict[frozenset[TableReference], int] = {}
        """
        Contains the number of parallel processes that should be used to execute a join or scan. All joins are identified
        by the base tables that they combine. Keys of single tables correspond to scans. "Processes" does not necessarily
        mean "system processes". The database system can also choose to use threads or other means of parallelization. This
        is not restricted by the join assignment.
        """

        self.system_settings: dict[str, Any] = {}
        """
        Contains the settings for the target database system. The keys and values, as well as their usage depend entirely
        on the system. For example, in Postgres a setting like *enable_geqo = 'off'* can be used to disable the genetic
        optimizer.
        """

        self.execution_mode: ExecutionMode | None = None
        """
        Indicates whether the optimizer should only consider sequential plans, parallel plans, or leave the decision to the
        optimizer (*None*). The default is *None*.
        """

    def add_cardinality(
        self, tables: Iterable[TableReference], cardinality: Cardinality
    ) -> None:
        """Assigns a specific cardinality hint to a (join of) tables.

        Parameters
        ----------
        tables : Iterable[TableReference]
            The tables for which the hint is generated. This can be an iterable of a single table, which denotes a scan hint.
        cardinality : Cardinality
            The estimated or known cardinality.
        """
        # Normalize the input so plain numbers are accepted as well.
        cardinality = Cardinality.of(cardinality)
        self.cardinalities[frozenset(tables)] = cardinality

    def set_workers(self, tables: Iterable[TableReference], num_workers: int) -> None:
        """Assigns a specific number of parallel workers to a (join of) tables.

        How these workers are implemented depends on the database system. They could become actual system processes, threads,
        etc.

        Parameters
        ----------
        tables : Iterable[TableReference]
            The tables for which the hint is generated. This can be an iterable of a single table, which denotes a scan hint.
        num_workers : int
            The desired number of worker processes. This denotes the total number of processes, not an additional amount. For
            some database systems this is an important distinction since one operator node will always be created. This node
            is then responsible for spawning the workers, but can also take part in the actual calculation. To prevent one-off
            errors, we standardize this number to denote the total number of workers that take part in the calculation.
        """
        self.parallel_workers[frozenset(tables)] = num_workers

    def set_system_settings(
        self, setting_name: str = "", setting_value: Any = None, **kwargs
    ) -> None:
        """Stores a specific system setting.

        This may happen in one of two ways: giving the setting name and value as two different parameters, or combining their
        assignment in the keyword parameters. While the first is limited to a single parameter, the second can be used to
        assign an arbitrary number of settings. However, this is limited to setting names that form valid keyword names.

        Parameters
        ----------
        setting_name : str, optional
            The name of the setting when using the separate key/value assignment mode. Defaults to an empty string to enable
            the integrated keyword parameter mode.
        setting_value : Any, optional
            The setting's value when using the separate key/value assignment mode. Defaults to *None* to enable the
            integrated keyword parameter mode.
        kwargs
            The key/value pairs in the integrated keyword parameter mode.

        Raises
        ------
        ValueError
            If both the `setting_name` as well as keyword arguments are given
        ValueError
            If neither the `setting_name` nor keyword arguments are given

        Examples
        --------
        Using the separate setting name and value syntax: ``set_system_settings("join_collapse_limit", 1)``
        Using the kwargs syntax: ``set_system_settings(join_collapse_limit=1, jit=False)``
        Both examples are specific to Postgres (see https://www.postgresql.org/docs/current/runtime-config-query.html).
        """
        if setting_name and kwargs:
            raise ValueError("Only setting or kwargs can be supplied")
        elif not setting_name and not kwargs:
            raise ValueError("setting_name or kwargs required!")

        if setting_name:
            self.system_settings[setting_name] = setting_value
        else:
            self.system_settings |= kwargs

    def merge_with(
        self, other_parameters: PlanParameterization
    ) -> PlanParameterization:
        """Combines the current parameters with additional hints.

        In case of assignments to the same hints, the values from the other parameters take precedence. None of the input
        parameterizations are modified.

        Parameters
        ----------
        other_parameters : PlanParameterization
            The parameterization to combine with the current parameterization

        Returns
        -------
        PlanParameterization
            The merged parameters
        """
        merged_params = PlanParameterization()
        merged_params.cardinalities = (
            self.cardinalities | other_parameters.cardinalities
        )
        merged_params.parallel_workers = (
            self.parallel_workers | other_parameters.parallel_workers
        )
        merged_params.system_settings = (
            self.system_settings | other_parameters.system_settings
        )
        # Fix: carry the execution mode over as well (it was previously lost in the merge). In line with the other
        # hints, the other parameterization takes precedence if it specifies a mode.
        merged_params.execution_mode = (
            other_parameters.execution_mode
            if other_parameters.execution_mode is not None
            else self.execution_mode
        )
        return merged_params

    def drop_workers(self) -> PlanParameterization:
        """Provides a copy of the current parameters without any parallel worker hints.

        Changes to the copy are not reflected back on this parameterization and vice-versa.

        Returns
        -------
        PlanParameterization
            The copy without any parallel worker hints
        """
        params = PlanParameterization()
        params.cardinalities = dict(self.cardinalities)
        params.system_settings = dict(self.system_settings)
        params.execution_mode = self.execution_mode
        return params

    def __json__(self) -> jsondict:
        # Only cardinalities and worker hints are serialized; system settings and the execution mode are
        # connection-specific and not part of the JSON format (see read_plan_params_json).
        return {
            "cardinality_hints": self.cardinalities,
            "parallel_worker_hints": self.parallel_workers,
        }

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return (
            f"PlanParams(cards={self.cardinalities}, "
            f"system specific={self.system_settings}, par workers={self.parallel_workers})"
        )
1249
def read_plan_params_json(json_data: dict | str) -> PlanParameterization:
    """Loads a plan parameterization from its JSON representation.

    Parameters
    ----------
    json_data : dict | str
        Either the JSON dictionary, or a string encoding of the dictionary (which will be parsed by *json.loads*).

    Returns
    -------
    PlanParameterization
        The plan parameterization
    """
    # Accept both an already-parsed dictionary and its string encoding.
    json_data = json.loads(json_data) if isinstance(json_data, str) else json_data
    params = PlanParameterization()
    # NOTE(review): frozenset(parser.load_table_json(tab)) iterates whatever load_table_json returns. If a key encodes
    # a single table, this iterates the loaded object itself instead of building a singleton set -- compare
    # read_operator_assignment_json, which maps load_table_json over each element of the key. Verify the serialized
    # key format before relying on round-trips.
    params.cardinalities = {
        frozenset(parser.load_table_json(tab)): card
        for tab, card in json_data.get("cardinality_hints", {}).items()
    }
    params.parallel_workers = {
        frozenset(parser.load_table_json(tab)): workers
        for tab, workers in json_data.get("parallel_worker_hints", {}).items()
    }
    # System settings and the execution mode are not part of the JSON format; they stay at their defaults.
    return params
1275
def update_plan(
    query_plan: QueryPlan,
    *,
    operators: Optional[PhysicalOperatorAssignment] = None,
    params: Optional[PlanParameterization] = None,
    simplify: bool = True,
) -> QueryPlan:
    """Assigns new operators and/or new estimates to a query plan, leaving the join order intact.

    Notice that this update method is not particularly smart and only operates on a per-node basis. This means that high-level
    functions that are composed of multiple operators might not be updated properly. For example, Postgres represents a hash
    join as a combination of a hash operator (which builds the actual hash table) and a follow-up hash join operator (which
    performs the probing). If the update changes the hash join to a different join, the hash operator will still exist, likely
    leading to an invalid query plan. To circumvent such problems, the query plan is by default simplified before processing.
    Simplification removes all auxiliary non-join and non-scan operators, thereby effectively only leaving those nodes with a
    corresponding operator. But, there is no free lunch and the simplification might also remove some other important
    operators, such as using hash-based or sort-based aggregation operators. Therefore, simplification can be disabled by
    setting the `simplify` parameter to *False*.

    Parameters
    ----------
    query_plan : QueryPlan
        The plan to update.
    operators : Optional[PhysicalOperatorAssignment], optional
        The new operators to use. This can be a partial assignment, in which case only the operators that are present in the
        new assignment are used and all others are left unchanged. If this parameter is not given, no operators are updated.
    params : Optional[PlanParameterization], optional
        The new parameters to use. This can be a partial assignment, in which case only the cardinalities/parallel workers in
        the new assignment are used and all others are left unchanged. If this parameter is not given, no parameters are
        updated.
    simplify : bool, optional
        Whether to simplify the query plan before updating it. For a detailed discussion, see the high-level documentation of
        this method. Simplification is enabled by default.

    Returns
    -------
    QueryPlan
        The updated query plan

    See Also
    --------
    QueryPlan.simplify
    """
    query_plan = query_plan.canonical() if simplify else query_plan

    # Each lookup falls back to the plan's current value, so partial assignments leave untouched nodes unchanged.
    updated_operator = (
        operators.get(query_plan.tables(), query_plan.operator)
        if operators
        else query_plan.operator
    )
    updated_card_est = (
        params.cardinalities.get(query_plan.tables(), query_plan.estimated_cardinality)
        if params
        else query_plan.estimated_cardinality
    )
    updated_workers = (
        params.parallel_workers.get(
            query_plan.tables(), query_plan.params.parallel_workers
        )
        if params
        else query_plan.params.parallel_workers
    )

    # Rebuild the (immutable) params/estimates containers with the overridden fields merged in.
    # NOTE(review): this relies on QueryPlan.params.items() / .estimates.items() supporting the `|` merge with a dict
    # and producing valid keyword arguments -- confirm against the PlanParams/PlanEstimates API.
    updated_params = PlanParams(
        **(query_plan.params.items() | {"parallel_workers": updated_workers})
    )
    updated_estimates = PlanEstimates(
        **(query_plan.estimates.items() | {"estimated_cardinality": updated_card_est})
    )
    # Children are updated with the default simplify=True; presumably children of a canonical plan are already
    # canonical, so this should be a no-op for them -- TODO confirm.
    updated_children = [
        update_plan(child, operators=operators, params=params)
        for child in query_plan.children
    ]

    return QueryPlan(
        query_plan.node_type,
        operator=updated_operator,
        children=updated_children,
        plan_params=updated_params,
        estimates=updated_estimates,
        measures=query_plan.measures,
        subplan=query_plan.subplan,
    )
1360
+ class HintType(Enum):
1361
+ """Contains all hint types that are supported by PostBOUND.
1362
+
1363
+ Notice that not all of these hints need to be represented in the `PlanParameterization`, since some of them concern other
1364
+ aspects such as the join order. Furthermore, not all database systems will support all operators. The availability of
1365
+ certain hints can be checked on the database system interface and should be handled as part of the optimization pre-checks.
1366
+ """
1367
+
1368
+ LinearJoinOrder = "Join order"
1369
+ JoinDirection = "Join direction"
1370
+ BushyJoinOrder = "Bushy join order"
1371
+ Operator = "Physical operators"
1372
+ Parallelization = "Par. workers"
1373
+ Cardinality = "Cardinality"