PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. postbound/__init__.py +211 -0
  2. postbound/_base.py +6 -0
  3. postbound/_bench.py +1012 -0
  4. postbound/_core.py +1153 -0
  5. postbound/_hints.py +1373 -0
  6. postbound/_jointree.py +1079 -0
  7. postbound/_pipelines.py +1121 -0
  8. postbound/_qep.py +1986 -0
  9. postbound/_stages.py +876 -0
  10. postbound/_validation.py +734 -0
  11. postbound/db/__init__.py +72 -0
  12. postbound/db/_db.py +2348 -0
  13. postbound/db/_duckdb.py +785 -0
  14. postbound/db/mysql.py +1195 -0
  15. postbound/db/postgres.py +4216 -0
  16. postbound/experiments/__init__.py +12 -0
  17. postbound/experiments/analysis.py +674 -0
  18. postbound/experiments/benchmarking.py +54 -0
  19. postbound/experiments/ceb.py +877 -0
  20. postbound/experiments/interactive.py +105 -0
  21. postbound/experiments/querygen.py +334 -0
  22. postbound/experiments/workloads.py +980 -0
  23. postbound/optimizer/__init__.py +92 -0
  24. postbound/optimizer/__init__.pyi +73 -0
  25. postbound/optimizer/_cardinalities.py +369 -0
  26. postbound/optimizer/_joingraph.py +1150 -0
  27. postbound/optimizer/dynprog.py +1825 -0
  28. postbound/optimizer/enumeration.py +432 -0
  29. postbound/optimizer/native.py +539 -0
  30. postbound/optimizer/noopt.py +54 -0
  31. postbound/optimizer/presets.py +147 -0
  32. postbound/optimizer/randomized.py +650 -0
  33. postbound/optimizer/tonic.py +1479 -0
  34. postbound/optimizer/ues.py +1607 -0
  35. postbound/qal/__init__.py +343 -0
  36. postbound/qal/_qal.py +9678 -0
  37. postbound/qal/formatter.py +1089 -0
  38. postbound/qal/parser.py +2344 -0
  39. postbound/qal/relalg.py +4257 -0
  40. postbound/qal/transform.py +2184 -0
  41. postbound/shortcuts.py +70 -0
  42. postbound/util/__init__.py +46 -0
  43. postbound/util/_errors.py +33 -0
  44. postbound/util/collections.py +490 -0
  45. postbound/util/dataframe.py +71 -0
  46. postbound/util/dicts.py +330 -0
  47. postbound/util/jsonize.py +68 -0
  48. postbound/util/logging.py +106 -0
  49. postbound/util/misc.py +168 -0
  50. postbound/util/networkx.py +401 -0
  51. postbound/util/numbers.py +438 -0
  52. postbound/util/proc.py +107 -0
  53. postbound/util/stats.py +37 -0
  54. postbound/util/system.py +48 -0
  55. postbound/util/typing.py +35 -0
  56. postbound/vis/__init__.py +5 -0
  57. postbound/vis/fdl.py +69 -0
  58. postbound/vis/graphs.py +48 -0
  59. postbound/vis/optimizer.py +538 -0
  60. postbound/vis/plots.py +84 -0
  61. postbound/vis/tonic.py +70 -0
  62. postbound/vis/trees.py +105 -0
  63. postbound-0.19.0.dist-info/METADATA +355 -0
  64. postbound-0.19.0.dist-info/RECORD +67 -0
  65. postbound-0.19.0.dist-info/WHEEL +5 -0
  66. postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
  67. postbound-0.19.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,4216 @@
1
+ """Contains the Postgres implementation of the Database interface.
2
+
3
+ In many ways the Postgres implementation can be thought of as the reference or blueprint implementation of the database
4
+ interface. This is due to two main reasons: first up, Postgres' capabilities follow a traditional architecture and its
5
+ features cover most of the general aspects of query optimization (i.e. supported operators, join orders and statistics).
6
+ Secondly, and on a more pragmatic note, Postgres was the first database system that was supported by PostBOUND, and therefore
7
+ a lot of the original Postgres interfaces eventually evolved into the more abstract database-independent interfaces.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import collections
13
+ import concurrent
14
+ import concurrent.futures
15
+ import math
16
+ import multiprocessing as mp
17
+ import os
18
+ import pathlib
19
+ import re
20
+ import subprocess
21
+ import sys
22
+ import textwrap
23
+ import threading
24
+ import time
25
+ import warnings
26
+ from collections import UserString
27
+ from collections.abc import Callable, Generator, Iterable, Sequence
28
+ from multiprocessing import connection as mp_conn
29
+ from pathlib import Path
30
+ from typing import Any, Literal, Optional
31
+
32
+ import psycopg
33
+ import psycopg.rows
34
+
35
+ from .. import util
36
+ from .._core import (
37
+ Cardinality,
38
+ IntermediateOperator,
39
+ JoinOperator,
40
+ PhysicalOperator,
41
+ ScanOperator,
42
+ UnboundColumnError,
43
+ VirtualTableError,
44
+ )
45
+ from .._hints import (
46
+ HintType,
47
+ PhysicalOperatorAssignment,
48
+ PlanParameterization,
49
+ operators_from_plan,
50
+ )
51
+ from .._jointree import JoinTree, jointree_from_plan, parameters_from_plan
52
+ from .._qep import QueryPlan, SortKey
53
+ from ..qal import formatter, transform
54
+ from ..qal._qal import (
55
+ AbstractPredicate,
56
+ ArrayAccessExpression,
57
+ BetweenPredicate,
58
+ BinaryPredicate,
59
+ CaseExpression,
60
+ CastExpression,
61
+ ColumnExpression,
62
+ ColumnReference,
63
+ CompoundOperator,
64
+ CompoundPredicate,
65
+ Explain,
66
+ FunctionExpression,
67
+ Hint,
68
+ InPredicate,
69
+ Limit,
70
+ MathExpression,
71
+ OrderBy,
72
+ OrderByExpression,
73
+ SqlExpression,
74
+ SqlQuery,
75
+ StarExpression,
76
+ StaticValueExpression,
77
+ SubqueryExpression,
78
+ TableReference,
79
+ UnaryPredicate,
80
+ WindowExpression,
81
+ )
82
+ from ..util import StateError, Version, jsondict
83
+ from ._db import (
84
+ Database,
85
+ DatabasePool,
86
+ DatabaseSchema,
87
+ DatabaseServerError,
88
+ DatabaseStatistics,
89
+ DatabaseUserError,
90
+ HintService,
91
+ HintWarning,
92
+ OptimizerInterface,
93
+ ResultSet,
94
+ UnsupportedDatabaseFeatureError,
95
+ simplify_result_set,
96
+ )
97
+
98
+ _SignificantPostgresSettings = {
99
+ # Resource consumption settings (see https://www.postgresql.org/docs/current/runtime-config-resource.html)
100
+ # Memory
101
+ "shared_buffers",
102
+ "huge_pages",
103
+ "huge_page_size",
104
+ "temp_buffers",
105
+ "max_prepared_transactions",
106
+ "work_mem",
107
+ "hash_mem_multiplier",
108
+ "maintenance_work_mem",
109
+ "autovacuum_work_mem",
110
+ "vacuum_buffer_usage_limit",
111
+ "logical_decoding_work_mem",
112
+ "max_stack_depth",
113
+ "shared_memory_type",
114
+ "dynamic_shared_memory_type",
115
+ "min_dynamic_shared_memory",
116
+ # Disk
117
+ "temp_file_limit",
118
+ # Kernel Resource Usage
119
+ "max_files_per_process",
120
+ # Cost-based Vacuum Delay
121
+ "vacuum_cost_delay",
122
+ "vacuum_cost_page_hit",
123
+ "vacuum_cost_page_miss",
124
+ "vacuum_cost_page_dirty",
125
+ "vacuum_cost_limit",
126
+ # Background Writer
127
+ "bgwriter_delay",
128
+ "bgwriter_lru_maxpages",
129
+ "bgwriter_lru_multiplier",
130
+ "bgwriter_flush_after",
131
+ # Asynchronous Behavior
132
+ "backend_flush_after",
133
+ "effective_io_concurrency",
134
+ "maintenance_io_concurrency",
135
+ "max_worker_processes",
136
+ "max_parallel_workers_per_gather",
137
+ "max_parallel_maintenance_workers",
138
+ "max_parallel_workers",
139
+ "parallel_leader_participation",
140
+ "old_snapshot_threshold",
141
+ # Query Planning Settings (see https://www.postgresql.org/docs/current/runtime-config-query.html)
142
+ # Planner Method Configuration
143
+ "enable_async_append",
144
+ "enable_bitmapscan",
145
+ "enable_gathermerge",
146
+ "enable_hashagg",
147
+ "enable_hashjoin",
148
+ "enable_incremental_sort",
149
+ "enable_indexscan",
150
+ "enable_indexonlyscan",
151
+ "enable_material",
152
+ "enable_memoize",
153
+ "enable_mergejoin",
154
+ "enable_nestloop",
155
+ "enable_parallel_append",
156
+ "enable_parallel_hash",
157
+ "enable_partition_pruning",
158
+ "enable_partitionwise_join",
159
+ "enable_partitionwise_aggregate",
160
+ "enable_presorted_aggregate",
161
+ "enable_seqscan",
162
+ "enable_sort",
163
+ "enable_tidscan",
164
+ # Planner Cost Constants
165
+ "seq_page_cost",
166
+ "random_page_cost",
167
+ "cpu_tuple_cost",
168
+ "cpu_index_tuple_cost",
169
+ "cpu_operator_cost",
170
+ "parallel_setup_cost",
171
+ "parallel_tuple_cost",
172
+ "min_parallel_table_scan_size",
173
+ "min_parallel_index_scan_size",
174
+ "effective_cache_size",
175
+ "jit_above_cost",
176
+ "jit_inline_above_cost",
177
+ "jit_optimize_above_cost",
178
+ # Genetic Query Optimizer
179
+ "geqo",
180
+ "geqo_threshold",
181
+ "geqo_effort",
182
+ "geqo_pool_size",
183
+ "geqo_generations",
184
+ "geqo_selection_bias",
185
+ "geqo_seed",
186
+ # Other Planner Options
187
+ "default_statistics_target",
188
+ "constraint_exclusion",
189
+ "cursor_tuple_fraction",
190
+ "from_collapse_limit",
191
+ "jit",
192
+ "join_collapse_limit",
193
+ "plan_cache_mode",
194
+ "recursive_worktable_factor"
195
+ # Automatic Vacuuming (https://www.postgresql.org/docs/current/runtime-config-autovacuum.html)
196
+ "autovacuum",
197
+ "autovacuum_max_workers",
198
+ "autovacuum_naptime",
199
+ "autovacuum_threshold",
200
+ "autovacuum_insert_threshold",
201
+ "autovacuum_analyze_threshold",
202
+ "autovacuum_scale_factor",
203
+ "autovacuum_analyze_scale_factor",
204
+ "autovacuum_freeze_max_age",
205
+ "autovacuum_multixact_freeze_max_age",
206
+ "autovacuum_cost_delay",
207
+ "autovacuum_cost_limit",
208
+ }
209
+ """Postgres settings that are relevant to many PostBOUND workflows.
210
+
211
+ These settings can influence performance measurements of different benchmarks. Therefore, we want to make their values
212
+ transparent in order to assess the results.
213
+
214
+ As a rule of thumb we include settings from three major categories: resource consumption (e.g. size of shared buffers),
215
+ optimizer settings (e.g. enable operators) and auto vacuum. The final category is required because it determines how good the
216
+ statistics are once a new database dump has been loaded or a data shift has been simulated. For all of these categories we
217
+ include all settings, even if they are not important right now to the best of our knowledge. This is done to prevent tedious
218
+ debugging if a setting later turns out to be important after all: if the category to which it belongs is present in our "significant
219
+ settings", it is guaranteed to be monitored.
220
+
221
+ Most notably settings regarding replication, logging and network settings are excluded, as well as settings regarding locking.
222
+ This is done because PostBOUND's database abstraction assumes read-only workloads with a single query at a time. If data shifts
223
+ are simulated, these are supposed to happen strictly before or after a read-only workload is executed and benchmarked.
224
+
225
+ All settings are up-to-date as of Postgres version 16.
226
+ """
227
+
228
+ _RuntimeChangeablePostgresSettings = {
229
+ setting for setting in _SignificantPostgresSettings
230
+ } - {
231
+ "autovacuum_max_workers",
232
+ "autovacuum_naptime",
233
+ "autovacuum_threshold",
234
+ "autovacuum_insert_threshold",
235
+ "autovacuum_analyze_threshold",
236
+ "autovacuum_scale_factor",
237
+ "autovacuum_analyze_scale_factor",
238
+ "autovacuum_freeze_max_age",
239
+ "autovacuum_multixact_freeze_max_age",
240
+ "autovacuum_cost_delay",
241
+ "autovacuum_cost_limit",
242
+ "autovacuum_work_mem",
243
+ "bgwriter_delay",
244
+ "bgwriter_lru_maxpages",
245
+ "bgwriter_lru_multiplier",
246
+ "bgwriter_flush_after",
247
+ "dynamic_shared_memory_type",
248
+ "huge_pages",
249
+ "huge_page_size",
250
+ "max_files_per_process",
251
+ "max_prepared_transactions",
252
+ "max_worker_processes",
253
+ "min_dynamic_shared_memory",
254
+ "old_snapshot_threshold",
255
+ "shared_buffers",
256
+ "shared_memory_type",
257
+ }
258
+ """These are exactly those settings from `_SignificantPostgresSettings` that can be changed at runtime."""
259
+
260
+
261
+ class PostgresSetting(str):
262
+ """Model for a single Postgres configuration such as *SET enable_nestloop = 'off';*.
263
+
264
+ This setting can be used directly as a replacement where a string is expected, or its different components can be accessed
265
+ via the `parameter` and `value` attributes.
266
+
267
+ Parameters
268
+ ----------
269
+ parameter : str
270
+ The name of the setting
271
+ value : object
272
+ The setting's current or desired value
273
+ """
274
+
275
+ def __init__(self, parameter: str, value: object) -> None:
276
+ self._param = parameter
277
+ self._val = value
278
+
279
+ def __new__(cls, parameter: str, value: object):
280
+ value = "on" if value is True else "off" if value is False else value
281
+ return super().__new__(cls, f"SET {parameter} = '{value}';")
282
+
283
+ __match_args__ = ("parameter", "value")
284
+
285
+ @property
286
+ def parameter(self) -> str:
287
+ """Gets the name of the setting.
288
+
289
+ Returns
290
+ -------
291
+ str
292
+ The name
293
+ """
294
+ return self._param
295
+
296
+ @property
297
+ def value(self) -> object:
298
+ """Gets the current or desired value of the setting.
299
+
300
+ Returns
301
+ -------
302
+ object
303
+ The raw, i.e. un-escaped value of the setting.
304
+ """
305
+ return self._val
306
+
307
+ def update(self, value: object) -> PostgresSetting:
308
+ """Creates a new setting with the same name but a different value.
309
+
310
+ Parameters
311
+ ----------
312
+ value : object
313
+ The new value
314
+
315
+ Returns
316
+ -------
317
+ PostgresSetting
318
+ The new setting
319
+ """
320
+ return PostgresSetting(self.parameter, value)
321
+
322
+ def __getnewargs__(self) -> tuple[str, object]:
323
+ return (self.parameter, self.value)
324
+
325
+
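A minimal usage sketch for `PostgresSetting` (not part of the package diff; the variable names are illustrative): the object behaves like its rendered *SET* statement while still exposing its components.

# Illustrative only: PostgresSetting is defined directly above.
nestloop_off = PostgresSetting("enable_nestloop", False)
print(nestloop_off)            # SET enable_nestloop = 'off';
print(nestloop_off.parameter)  # enable_nestloop
print(nestloop_off.value)      # False (the raw, un-escaped value)
nestloop_on = nestloop_off.update(True)  # new setting; the original is unchanged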
326
+ class PostgresConfiguration(collections.UserString):
327
+ """Model for a collection of different postgres settings that form a complete server configuration.
328
+
329
+ Each configuration is built from individual `PostgresSetting` objects. The configuration can be used directly as a replacement
330
+ when a string is expected, or its different settings can be accessed individually - either through the accessor methods, or
331
+ by using a dict-like syntax: calling ``config[setting]`` with a string setting value will provide the matching
332
+ `PostgresSetting`. Since the configuration also subclasses string, the precise behavior of `__getitem__` depends on the
333
+ argument type: string arguments provide settings whereas integer arguments result in specific characters. All other string
334
+ methods are implemented such that the normal string behavior is retained. All additional behavior is part of new methods.
335
+
336
+ Parameters
337
+ ----------
338
+ settings : Iterable[PostgresSetting]
339
+ The settings that form the configuration.
340
+
341
+ Warnings
342
+ --------
343
+ Notice that while the configuration is a *UserString*, psycopg currently does not support executing the configuration, i.e.
344
+ executing ``cursor.execute(config)`` will not work. Instead, the configuration has to be manually converted into a string
345
+ first by calling *str* as in ``cursor.execute(str(config))``. This also applies to the `execute_query()` method of the
346
+ `PostgresInterface` class, since it uses psycopg under the hood.
347
+ """
348
+
349
+ @staticmethod
350
+ def load(*args, **kwargs) -> PostgresConfiguration:
351
+ """Generates a new configuration based on (setting name, value) pairs.
352
+
353
+ Parameters
354
+ ----------
355
+ args
356
+ Ready-to-use `PostgresSetting` objects
357
+ kwargs
358
+ Additional settings
359
+
360
+ Returns
361
+ -------
362
+ PostgresConfiguration
363
+ The configuration
364
+ """
365
+ return PostgresConfiguration(
366
+ list(args) + [PostgresSetting(key, val) for key, val in kwargs.items()]
367
+ )
368
+
369
+ def __init__(self, settings: Iterable[PostgresSetting]) -> None:
370
+ self._settings = {setting.parameter: setting for setting in settings}
371
+ super().__init__(self._format())
372
+
373
+ @property
374
+ def settings(self) -> Sequence[PostgresSetting]:
375
+ """Gets the settings that are part of the configuration.
376
+
377
+ Returns
378
+ -------
379
+ Sequence[PostgresSetting]
380
+ The settings in the order in which they were originally specified.
381
+ """
382
+ return list(self._settings.values())
383
+
384
+ def parameters(self) -> Sequence[str]:
385
+ """Provides all setting names that are specified in this configuration.
386
+
387
+ Returns
388
+ -------
389
+ Sequence[str]
390
+ The setting names in the order in which they were originally specified.
391
+ """
392
+ return list(self._settings.keys())
393
+
394
+ def add(
395
+ self, setting: Optional[PostgresSetting | str] = None, value: object = None, **kwargs
396
+ ) -> PostgresConfiguration:
397
+ """Creates a new configuration with additional settings.
398
+
399
+ The setting can be supplied either as a `PostgresSetting` object or as a key-value pair.
400
+ The latter case allows both positional and keyword arguments.
401
+
402
+ Parameters
403
+ ----------
404
+ setting : PostgresSetting | str
405
+ The setting to add. This can either be a readily created `PostgresSetting` object or a string that will be used as
406
+ the setting name. In the latter case, the `value` has to be supplied as well.
407
+ value : object
408
+ The value of the setting. This is only used if `setting` is a string.
409
+ kwargs
410
+ If the setting is not specified as a string, nor as a `PostgresSetting` object, it has to be specified as keyword
411
+ arguments. The keyword argument names are used as the setting names, the values are used as the setting values.
412
+
413
+ Returns
414
+ -------
415
+ PostgresConfiguration
416
+ The updated configuration. The original config is not modified.
417
+ """
418
+ if isinstance(setting, str):
419
+ setting = PostgresSetting(setting, value)
420
+
421
+ target_settings = dict(self._settings)
422
+ if isinstance(setting, PostgresSetting):
423
+ target_settings[setting.parameter] = setting
424
+ else:
425
+ settings = {key: PostgresSetting(key, val) for key, val in kwargs.items()}
426
+ target_settings.update(settings)
427
+
428
+ return PostgresConfiguration(target_settings.values())
429
+
430
+ def remove(self, setting: PostgresSetting | str) -> PostgresConfiguration:
431
+ """Creates a new configuration without a specific setting.
432
+
433
+ Parameters
434
+ ----------
435
+ setting : PostgresSetting | str
436
+ The setting to remove
437
+
438
+ Returns
439
+ -------
440
+ PostgresConfiguration
441
+ The updated configuration. The original config is not modified.
442
+ """
443
+ parameter = (
444
+ setting.parameter if isinstance(setting, PostgresSetting) else setting
445
+ )
446
+ target_settings = dict(self._settings)
447
+ target_settings.pop(parameter, None)
448
+ return PostgresConfiguration(target_settings.values())
449
+
450
+ def update(
451
+ self, setting: PostgresSetting | str, value: object
452
+ ) -> PostgresConfiguration:
453
+ """Creates a new configuration with an updated setting.
454
+
455
+ Parameters
456
+ ----------
457
+ setting : PostgresSetting | str
458
+ The setting to update. This can either be the raw setting name, or a `PostgresSetting` object. In either case,
459
+ the updated value has to be supplied via the `value` parameter. (When supplying a `PostgresSetting`, only its
460
+ name is used.)
461
+ value : object
462
+ The updated value of the setting.
463
+
464
+ Returns
465
+ -------
466
+ PostgresConfiguration
467
+ The updated configuration. The original config is not modified.
468
+ """
469
+ match setting:
470
+ case str():
471
+ setting = PostgresSetting(setting, value)
472
+ case PostgresSetting(name, _):
473
+ setting = PostgresSetting(name, value)
474
+
475
+ target_settings = dict(self._settings)
476
+ target_settings[setting.parameter] = setting
477
+
478
+ return PostgresConfiguration(target_settings.values())
479
+
480
+ def as_dict(self) -> dict[str, object]:
481
+ """Provides all settings as setting name -> setting value mappings.
482
+
483
+ Returns
484
+ -------
485
+ dict[str, object]
486
+ The settings. Changes to this dictionary will not be reflected in the configuration object.
487
+ """
488
+ return dict(self._settings)
489
+
490
+ def _format(self) -> str:
491
+ """Provides the string representation of the configuration.
492
+
493
+ Returns
494
+ -------
495
+ str
496
+ The string representation
497
+ """
498
+ return "\n".join([str(setting) for setting in self.settings])
499
+
500
+ def __getitem__(self, key: object) -> str:
501
+ if isinstance(key, str):
502
+ return self._settings[key]
503
+ return super().__getitem__(key)
504
+
505
+ def __setitem__(self, key: object, value: object) -> None:
506
+ if isinstance(key, str):
507
+ self._settings[key] = value
508
+ self.data = self._format()
509
+ else:
510
+ super().__setitem__(key, value)
511
+
512
+
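A short, hedged sketch of how a `PostgresConfiguration` can be assembled and passed to a cursor (the setting values are placeholders; note the explicit *str* conversion mentioned in the Warnings section above).

# Illustrative only: builds on the PostgresSetting/PostgresConfiguration classes above.
config = PostgresConfiguration.load(work_mem="2GB", jit=False)
config = config.add("enable_nestloop", False)   # returns a new configuration
config = config.update("jit", True)
work_mem_setting = config["work_mem"]           # string key -> matching PostgresSetting
# psycopg cannot execute the UserString directly, so convert explicitly:
# cursor.execute(str(config))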
513
+ class PostgresConfigInterface:
514
+ """A thin wrapper that provides read-only access to Postgres configuration settings using __getitem__ syntax."""
515
+
516
+ def __init__(self, pg_instance: PostgresInterface) -> None:
517
+ self._pg = pg_instance
518
+
519
+ def __getitem__(self, key: str) -> Any:
520
+ return self._pg.execute_query(f"SHOW {key};", cache_enabled=False, raw=False)
521
+
522
+
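For illustration (assuming `pg` is an already-connected `PostgresInterface`, a hypothetical instance), the `config` attribute simply issues a *SHOW* query per lookup:

current_work_mem = pg.config["work_mem"]  # executes "SHOW work_mem;" and returns the plain value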
523
+ _PGVersionPattern = re.compile(r"^PostgreSQL (?P<pg_ver>[\d]+(\.[\d]+)?).*$")
524
+ """Regular expression to extract the Postgres server version from the *VERSION()* function.
525
+
526
+ References
527
+ ----------
528
+
529
+ .. Pattern debugging: https://regex101.com/r/UTQkfa/1
530
+ """
531
+
532
+
533
+ class PostgresInterface(Database):
534
+ """Database implementation for PostgreSQL backends.
535
+
536
+ The `config` attribute provides read-only access to the current GUC values of the server.
537
+
538
+ Parameters
539
+ ----------
540
+ connect_string : str
541
+ Connection string for `psycopg` to establish a connection to the Postgres server
542
+ system_name : str, optional
543
+ Description of the specific Postgres server, by default *Postgres*
544
+ application_name : str, optional
545
+ Identifier for the Postgres server. This will be the name that is shown in the server logs and process lists.
546
+ client_encoding : str, optional
547
+ The client encoding to use for the connection, by default *UTF8*
548
+ cache_enabled : bool, optional
549
+ Whether to enable caching of database queries, by default *False*
550
+ debug : bool, optional
551
+ Whether additional debug information should be printed during database interaction. Defaults to *False*.
552
+ """
553
+
554
+ def __init__(
555
+ self,
556
+ connect_string: str,
557
+ system_name: str = "Postgres",
558
+ *,
559
+ application_name: str = "PostBOUND",
560
+ client_encoding: str = "UTF8",
561
+ cache_enabled: bool = False,
562
+ debug: bool = False,
563
+ ) -> None:
564
+ self.connect_string = connect_string
565
+ self.debug = debug
566
+ self.config = PostgresConfigInterface(self)
567
+ self._application_name = application_name or "PostBOUND"
568
+ self._client_encoding = client_encoding
569
+ self._init_connection()
570
+
571
+ self._db_stats = PostgresStatisticsInterface(self)
572
+ self._db_schema = PostgresSchemaInterface(self)
573
+ self._hinting_backend = PostgresHintService(self)
574
+
575
+ self._timeout_executor = TimeoutQueryExecutor(self)
576
+ self._last_query_runtime = math.nan
577
+
578
+ super().__init__(system_name, cache_enabled=cache_enabled)
579
+
580
+ def schema(self) -> PostgresSchemaInterface:
581
+ return self._db_schema
582
+
583
+ def statistics(self) -> PostgresStatisticsInterface:
584
+ return self._db_stats
585
+
586
+ def hinting(self) -> PostgresHintService:
587
+ return self._hinting_backend
588
+
589
+ def execute_query(
590
+ self,
591
+ query: SqlQuery | str,
592
+ *,
593
+ cache_enabled: Optional[bool] = None,
594
+ raw: bool = False,
595
+ timeout: Optional[float] = None,
596
+ ) -> Any:
597
+ if timeout is not None and timeout > 0:
598
+ return self._timeout_executor.execute_query(
599
+ query, timeout=timeout, cache_enabled=cache_enabled, raw=raw
600
+ )
601
+
602
+ cache_enabled = cache_enabled or (cache_enabled is None and self._cache_enabled)
603
+ if isinstance(query, UserString):
604
+ query = str(query)
605
+ elif isinstance(query, SqlQuery):
606
+ query = self._hinting_backend.format_query(query)
607
+
608
+ if cache_enabled and query in self._query_cache:
609
+ query_result = self._query_cache[query]
610
+ return query_result if raw else simplify_result_set(query_result)
611
+
612
+ try:
613
+ start_time = time.perf_counter_ns()
614
+ self._cursor.execute(query)
615
+ end_time = time.perf_counter_ns()
616
+ self._last_query_runtime = (
617
+ end_time - start_time
618
+ ) / 10**9 # convert to seconds
619
+
620
+ query_result = (
621
+ self._cursor.fetchall() if self._cursor.rowcount >= 0 else None
622
+ )
623
+ if cache_enabled:
624
+ self._inflate_query_cache()
625
+ self._query_cache[query] = query_result
626
+ except (psycopg.InternalError, psycopg.OperationalError) as e:
627
+ msg = "\n".join(
628
+ [
629
+ f"At {util.timestamp()}",
630
+ "For query:",
631
+ str(query),
632
+ "Message:",
633
+ str(e),
634
+ ]
635
+ )
636
+ raise DatabaseServerError(msg, e)
637
+ except psycopg.Error as e:
638
+ msg = "\n".join(
639
+ [
640
+ f"At {util.timestamp()}",
641
+ "For query:",
642
+ str(query),
643
+ "Message:",
644
+ str(e),
645
+ ]
646
+ )
647
+ raise DatabaseUserError(msg, e)
648
+
649
+ return query_result if raw else simplify_result_set(query_result)
650
+
651
+ def execute_with_timeout(
652
+ self, query: SqlQuery | str, timeout: float = 60.0
653
+ ) -> Optional[ResultSet]:
654
+ try:
655
+ result = self.execute_query(
656
+ query, timeout=timeout, cache_enabled=False, raw=True
657
+ )
658
+ return result
659
+ except TimeoutError:
660
+ return None
661
+
662
+ def last_query_runtime(self) -> float:
663
+ return self._last_query_runtime
664
+
665
+ def time_query(self, query: SqlQuery, *, timeout: Optional[float] = None) -> float:
666
+ self.execute_query(query, cache_enabled=False, raw=True, timeout=timeout)
667
+ return self.last_query_runtime()
668
+
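A hedged usage sketch for the timeout-aware execution methods above; the connect string and SQL text are placeholders rather than anything shipped with the package.

# Illustrative only: placeholder connect string and query.
pg = PostgresInterface("dbname=imdb user=postgres")
result = pg.execute_with_timeout("SELECT count(*) FROM title", timeout=30.0)
if result is None:
    print("query timed out")
else:
    print("runtime in seconds:", pg.last_query_runtime())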
669
+ def optimizer(self) -> PostgresOptimizer:
670
+ return PostgresOptimizer(self)
671
+
672
+ def database_name(self) -> str:
673
+ self._cursor.execute("SELECT CURRENT_DATABASE();")
674
+ db_name = self._cursor.fetchone()[0]
675
+ return db_name
676
+
677
+ def database_system_version(self) -> Version:
678
+ self._cursor.execute("SELECT VERSION();")
679
+ version_string = self._cursor.fetchone()[0]
680
+ version_match = _PGVersionPattern.match(version_string)
681
+ if not version_match:
682
+ raise RuntimeError(
683
+ f"Could not extract Postgres version from string '{version_string}'"
684
+ )
685
+ pg_ver = version_match.group("pg_ver")
686
+ return Version(pg_ver)
687
+
688
+ def backend_pid(self) -> int:
689
+ """Provides the backend process ID of the current connection.
690
+
691
+ Returns
692
+ -------
693
+ int
694
+ The process ID
695
+ """
696
+ return self._connection.info.backend_pid
697
+
698
+ def data_dir(self) -> Path:
699
+ """Get the data directory of the Postgres server.
700
+
701
+ Returns
702
+ -------
703
+ Path
704
+ The data directory path
705
+ """
706
+ self._cursor.execute("SHOW data_directory;")
707
+ data_dir = self._cursor.fetchone()[0]
708
+ return Path(data_dir)
709
+
710
+ def logfile(self) -> Optional[Path]:
711
+ """Get the log file of the (local) Postgres server."""
712
+ proc_path = Path(f"/proc/{self.backend_pid()}/fd/1")
713
+ if not proc_path.exists() or not proc_path.is_symlink():
714
+ return None
715
+ return proc_path.resolve()
716
+
717
+ def describe(self) -> jsondict:
718
+ base_info = {
719
+ "system_name": self.database_system_name(),
720
+ "system_version": self.database_system_version(),
721
+ "database": self.database_name(),
722
+ "statistics_settings": {
723
+ "emulated": self._db_stats.emulated,
724
+ "cache_enabled": self._db_stats.cache_enabled,
725
+ },
726
+ "hinting_mode": self._hinting_backend.describe(),
727
+ "query_cache": self.cache_enabled,
728
+ }
729
+ self._cursor.execute("SELECT name, setting FROM pg_settings")
730
+ system_settings = self._cursor.fetchall()
731
+ base_info["system_settings"] = {
732
+ setting: value
733
+ for setting, value in system_settings
734
+ if setting in _SignificantPostgresSettings
735
+ }
736
+
737
+ schema_info: list[jsondict] = []
738
+ for table in self._db_schema.tables():
739
+ if table.full_name.startswith("pg_"):
740
+ continue # skip system tables
741
+
742
+ column_info: list[jsondict] = []
743
+
744
+ for column in self._db_schema.columns(table):
745
+ column_info.append(
746
+ {
747
+ "column": str(column),
748
+ "indexed": self.schema().has_index(column),
749
+ "foreign_keys": self._db_schema.foreign_keys_on(column),
750
+ }
751
+ )
752
+
753
+ pk_col = self._db_schema.primary_key_column(table)
754
+ schema_info.append(
755
+ {
756
+ "table": str(table),
757
+ "n_rows": self.statistics().total_rows(table, emulated=True),
758
+ "columns": column_info,
759
+ "primary_key": pk_col.name if pk_col else None,
760
+ }
761
+ )
762
+
763
+ base_info["schema_info"] = schema_info
764
+ return base_info
765
+
766
+ def reset_connection(self) -> int:
767
+ try:
768
+ self._connection.cancel()
769
+ self._cursor.close()
770
+ self._connection.close()
771
+ except psycopg.Error:
772
+ pass
773
+ return self._init_connection()
774
+
775
+ def cursor(self) -> psycopg.Cursor:
776
+ return self._cursor
777
+
778
+ def connection(self) -> psycopg.Connection:
779
+ """Provides the current database connection.
780
+
781
+ Returns
782
+ -------
783
+ psycopg.Connection
784
+ The connection
785
+ """
786
+ return self._connection
787
+
788
+ def obtain_new_local_connection(self) -> psycopg.Connection:
789
+ """Provides a new database connection to be used exclusively be the client.
790
+
791
+ The current connection maintained by the `PostgresInterface` is not affected by obtaining a new connection in any
792
+ way.
793
+
794
+ Returns
795
+ -------
796
+ psycopg.Connection
797
+ The connection
798
+ """
799
+ return psycopg.connect(self.connect_string)
800
+
801
+ def close(self) -> None:
802
+ self._cursor.close()
803
+ self._connection.close()
804
+
805
+ def prewarm_tables(
806
+ self,
807
+ tables: Optional[TableReference | Iterable[TableReference]] = None,
808
+ *more_tables: TableReference,
809
+ exclude_table_pages: bool = False,
810
+ include_primary_index: bool = True,
811
+ include_secondary_indexes: bool = True,
812
+ ) -> None:
813
+ """Prepares the Postgres buffer pool with tuples from specific tables.
814
+
815
+ Parameters
816
+ ----------
817
+ tables : Optional[TableReference | Iterable[TableReference]], optional
818
+ The tables that should be placed into the buffer pool
819
+ *more_tables : TableReference
820
+ More tables that should be placed into the buffer pool, enabling a more convenient usage of this method.
821
+ See examples for details on the usage.
822
+ exclude_table_pages : bool, optional
823
+ Whether the table data (i.e. pages containing the actual tuples) should *not* be prewarmed. This is off by default,
824
+ meaning that prewarming is applied to the data pages. This can be toggled on to only prewarm index pages (see
825
+ `include_primary_index` and `include_secondary_indexes`).
826
+ include_primary_index : bool, optional
827
+ Whether the pages of the primary key index should also be prewarmed. Enabled by default.
828
+ include_secondary_indexes : bool, optional
829
+ Whether the pages for secondary indexes should also be prewarmed. Enabled by default.
830
+
831
+ Notes
832
+ -----
833
+ If the database should prewarm more table pages than can be contained in the shared buffer, the actual contents of the
834
+ pool are not specified. Since all prewarming tasks happen sequentially, the first prewarmed relations will typically
835
+ be evicted and only the last relations (tables or indexes) are retained in the shared buffer. The precise order in
836
+ which the prewarming tasks are executed is not specified and depends on the actual relations.
837
+
838
+ Examples
839
+ --------
840
+ >>> pg.prewarm_tables([table1, table2])
841
+ >>> pg.prewarm_tables(table1, table2)
842
+ """
843
+ self._assert_active_extension("pg_prewarm")
844
+ tables: Iterable[TableReference] = list(util.enlist(tables)) + list(more_tables)
845
+ if not tables:
846
+ return
847
+ tables = set(
848
+ tab.full_name for tab in tables
849
+ ) # eliminate duplicates if tables are selected multiple times
850
+
851
+ table_indexes = (
852
+ [self._fetch_index_relnames(tab) for tab in tables]
853
+ if include_primary_index or include_secondary_indexes
854
+ else []
855
+ )
856
+ indexes_to_prewarm = {
857
+ idx
858
+ for idx, primary in util.flatten(table_indexes)
859
+ if (primary and include_primary_index)
860
+ or (not primary and include_secondary_indexes)
861
+ }
862
+ tables = (
863
+ indexes_to_prewarm if exclude_table_pages else tables | indexes_to_prewarm
864
+ )
865
+ if not tables:
866
+ return
867
+
868
+ prewarm_invocations = [f"pg_prewarm('{tab}')" for tab in tables]
869
+ prewarm_text = ", ".join(prewarm_invocations)
870
+ prewarm_query = f"SELECT {prewarm_text}"
871
+
872
+ self._cursor.execute(prewarm_query)
873
+
874
+ def cooldown_tables(
875
+ self,
876
+ tables: Optional[TableReference | Iterable[TableReference]] = None,
877
+ *more_tables: TableReference,
878
+ exclude_table_pages: bool = False,
879
+ include_primary_index: bool = True,
880
+ include_secondary_indexes: bool = True,
881
+ ) -> None:
882
+ """Removes tuples from specific tables from the Postgres buffer pool.
883
+
884
+ This method can be used to simulate a cold start for the next incoming query. It requires the *pg_temperature*
885
+ extension that is part of the pg_lab project.
886
+
887
+ Parameters
888
+ ----------
889
+ tables : Optional[TableReference | Iterable[TableReference]], optional
890
+ The tables that should be removed from the buffer pool
891
+ *more_tables : TableReference
892
+ More tables that should be removed from the buffer pool, enabling a more convenient usage of this method.
893
+ See examples for details on the usage.
894
+ exclude_table_pages : bool, optional
895
+ Whether the table data (i.e. pages containing the actual tuples) should *not* be removed. This is off by default,
896
+ meaning that the cooldown is applied to the data pages. This can be toggled on to only cooldown index pages (see
897
+ `include_primary_index` and `include_secondary_indexes`).
898
+ include_primary_index : bool, optional
899
+ Whether the pages of the primary key index should also be cooled down. Enabled by default.
900
+ include_secondary_indexes : bool, optional
901
+ Whether the pages for secondary indexes should also be cooled down. Enabled by default.
902
+
903
+ Examples
904
+ --------
905
+ >>> pg.cooldown_tables([table1, table2])
906
+ >>> pg.cooldown_tables(table1, table2)
907
+
908
+ References
909
+ ----------
910
+ pg_lab : https://github.com/rbergm/pg_lab
911
+ """
912
+ self._assert_active_extension("pg_temperature")
913
+ tables: Iterable[TableReference] = list(util.enlist(tables)) + list(more_tables)
914
+ if not tables:
915
+ return
916
+ tables = set(
917
+ tab.full_name for tab in tables
918
+ ) # eliminate duplicates if tables are selected multiple times
919
+
920
+ table_indexes = (
921
+ [self._fetch_index_relnames(tab) for tab in tables]
922
+ if include_primary_index or include_secondary_indexes
923
+ else []
924
+ )
925
+ indexes_to_cooldown = {
926
+ idx
927
+ for idx, primary in util.flatten(table_indexes)
928
+ if (primary and include_primary_index)
929
+ or (not primary and include_secondary_indexes)
930
+ }
931
+ tables = (
932
+ indexes_to_cooldown if exclude_table_pages else tables | indexes_to_cooldown
933
+ )
934
+ if not tables:
935
+ return
936
+
937
+ cooldown_invocations = [f"pg_cooldown('{tab}')" for tab in tables]
938
+ cooldown_text = ", ".join(cooldown_invocations)
939
+ cooldown_query = f"SELECT {cooldown_text}"
940
+
941
+ self._cursor.execute(cooldown_query)
942
+
943
+ def current_configuration(
944
+ self, *, runtime_changeable_only: bool = False
945
+ ) -> PostgresConfiguration:
946
+ """Provides all current configuration settings in the current Postgres connection.
947
+
948
+ Parameters
949
+ ----------
950
+ runtime_changeable_only : bool, optional
951
+ Whether only such settings that can be changed at runtime should be provided. Defaults to *False*.
952
+
953
+ Returns
954
+ -------
955
+ PostgresConfiguration
956
+ The current configuration.
957
+ """
958
+ self._cursor.execute("SELECT name, setting FROM pg_settings")
959
+ system_settings = self._cursor.fetchall()
960
+ allowed_settings = (
961
+ _RuntimeChangeablePostgresSettings
962
+ if runtime_changeable_only
963
+ else _SignificantPostgresSettings
964
+ )
965
+ configuration = {
966
+ setting: value
967
+ for setting, value in system_settings
968
+ if setting in allowed_settings
969
+ }
970
+ return PostgresConfiguration.load(**configuration)
971
+
972
+ def apply_configuration(
973
+ self, configuration: PostgresConfiguration | PostgresSetting | str
974
+ ) -> None:
975
+ """Changes specific configuration parameters of the Postgres server or current connection.
976
+
977
+ Parameters
978
+ ----------
979
+ configuration : PostgresConfiguration | PostgresSetting | str
980
+ The desired setting values. If a string is supplied directly, it already has to be a valid setting update such as
981
+ *SET geqo = FALSE;*.
982
+ """
983
+ if (
984
+ isinstance(configuration, PostgresSetting)
985
+ and configuration.parameter not in _RuntimeChangeablePostgresSettings
986
+ ):
987
+ warnings.warn(
988
+ f"Cannot apply configuration setting '{configuration.parameter}' at runtime"
989
+ )
990
+ return
991
+ elif isinstance(configuration, PostgresConfiguration):
992
+ supported_settings: list[PostgresSetting] = []
993
+ unsupported_settings: list[str] = []
994
+ for setting in configuration.settings:
995
+ if setting.parameter in _RuntimeChangeablePostgresSettings:
996
+ supported_settings.append(setting)
997
+ else:
998
+ unsupported_settings.append(setting.parameter)
999
+ if unsupported_settings:
1000
+ warnings.warn(
1001
+ f"Skipping configuration settings {unsupported_settings} "
1002
+ "because they cannot be changed at runtime"
1003
+ )
1004
+ configuration = str(PostgresConfiguration(supported_settings))
1005
+
1006
+ self._cursor.execute(configuration)
1007
+
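A hedged sketch of reading and adjusting the server configuration through the two methods above (the chosen parameter values are assumptions, not recommendations; `pg` is a hypothetical connected instance):

# Illustrative only: pg is an already-connected PostgresInterface instance.
config = pg.current_configuration(runtime_changeable_only=True)
tuned = config.update("work_mem", "4GB")
pg.apply_configuration(tuned)  # settings that cannot change at runtime would be skipped with a warning
pg.apply_configuration(PostgresSetting("jit", False))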
1008
+ def has_extension(
1009
+ self, extension_name: str, *, is_shared_object: bool = True
1010
+ ) -> bool:
1011
+ """Checks, whether the current Postgres database has a specific extension loaded and active.
1012
+
1013
+ Extensions can be either created using the *CREATE EXTENSION* command, or by loading the shared object via *LOAD*.
1014
+ For the shared object-based check to work correctly, the Postgres server has to run in the same namespace as the
1015
+ PostBOUND client.
1016
+
1017
+ Parameters
1018
+ ----------
1019
+ extension_name : str
1020
+ The name of the extension to be checked. In case of shared objects, this should be equivalent to the name of said
1021
+ object. In this case, the suffix is optional.
1022
+ is_shared_object : bool, optional
1023
+ Whether the extension is a shared object that is loaded into the Postgres server. By default this is set to *True*,
1024
+ which assumes that the extension is loaded as a shared object, rather than as a default extension.
1025
+
1026
+
1027
+ Returns
1028
+ -------
1029
+ bool
1030
+ Whether the extension is loaded and active in the current Postgres database.
1031
+ """
1032
+ match sys.platform:
1033
+ case "win32" | "cygwin":
1034
+ lib_suffix = ".dll"
1035
+ case "darwin":
1036
+ lib_suffix = ".dylib"
1037
+ case "linux":
1038
+ lib_suffix = ".so"
1039
+ case _:
1040
+ raise RuntimeError(
1041
+ f"Plaform '{sys.platform}' is not supported by extension check."
1042
+ )
1043
+
1044
+ if is_shared_object or extension_name in ("pg_hint_plan", "pg_lab"):
1045
+ shared_object_name = (
1046
+ f"{extension_name}{lib_suffix}"
1047
+ if not extension_name.endswith(lib_suffix)
1048
+ else extension_name
1049
+ )
1050
+ loaded_shared_objects = util.system.open_files(
1051
+ self._connection.info.backend_pid
1052
+ )
1053
+ return any(so.endswith(shared_object_name) for so in loaded_shared_objects)
1054
+ else:
1055
+ self._cursor.execute("SELECT extname FROM pg_extension;")
1056
+ return any(ext[0] == extension_name for ext in self._cursor.fetchall())
1057
+
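For illustration, `has_extension` distinguishes between libraries loaded as shared objects and extensions installed via *CREATE EXTENSION* (again, `pg` is a hypothetical connected instance):

pg.has_extension("pg_hint_plan")                        # checks the shared objects loaded by the backend
pg.has_extension("pg_prewarm", is_shared_object=False)  # checks the pg_extension catalog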
1058
+ def _init_connection(self) -> int:
1059
+ """Sets all default connection parameters and creates the actual database cursor.
1060
+
1061
+ Returns
1062
+ -------
1063
+ int
1064
+ The backend process ID of the new connection
1065
+ """
1066
+ self._connection: psycopg.Connection = psycopg.connect(
1067
+ self.connect_string,
1068
+ application_name=self._application_name,
1069
+ client_encoding=self._client_encoding,
1070
+ row_factory=psycopg.rows.tuple_row,
1071
+ )
1072
+ self._connection.autocommit = (
1073
+ True # pg_hint_plan hinting backend currently relies on autocommit!
1074
+ )
1075
+ self._connection.prepare_threshold = None
1076
+ self._cursor: psycopg.Cursor = self._connection.cursor()
1077
+ return self.backend_pid()
1078
+
1079
+ def _fetch_index_relnames(
1080
+ self, table: TableReference | str
1081
+ ) -> Iterable[tuple[str, bool]]:
1082
+ """Loads all physical index relations for a physical table.
1083
+
1084
+ Parameters
1085
+ ----------
1086
+ table : TableReference
1087
+ The table for which to load the indexes
1088
+
1089
+ Returns
1090
+ -------
1091
+ Iterable[tuple[str, bool]]
1092
+ All indexes as pairs *(relation name, primary)*. Relation name corresponds to the table-like object that Postgres
1093
+ created internally to store the index (e.g. for a table called *title*, this is typically called *title_pkey* for
1094
+ the primary key index). The *primary* boolean indicates whether this is the primary key index of the table.
1095
+ """
1096
+ query_template = textwrap.dedent("""
1097
+ SELECT cls.relname, idx.indisprimary
1098
+ FROM pg_index idx
1099
+ JOIN pg_class cls ON idx.indexrelid = cls.oid
1100
+ JOIN pg_class owner_cls ON idx.indrelid = owner_cls.oid
1101
+ WHERE owner_cls.relname = %s;
1102
+ """)
1103
+ table = table.full_name if isinstance(table, TableReference) else table
1104
+ self._cursor.execute(query_template, (table,))
1105
+ return list(self._cursor.fetchall())
1106
+
1107
+ def _assert_active_extension(
1108
+ self, extension_name: str, *, is_shared_object: bool = False
1109
+ ) -> None:
1110
+ """Raises an error if the current postgres database does not have the desired extension.
1111
+
1112
+ Extensions can be created using the *CREATE EXTENSION* command, or by loading the shared object via *LOAD*. In either
1113
+ case, this method can check whether they are indeed active.
1114
+
1115
+ Parameters
1116
+ ----------
1117
+ extension_name : str
1118
+ The name of the extension to be checked.
1119
+ is_shared_object : bool, optional
1120
+ Whether the extension is activated using *LOAD*. If this is the case, the shared objects owned by the database
1121
+ process rather than the internal extension catalogs will be checked. The extension name will be automatically
1122
+ suffixed with *.so* if necessary. As a special case, for checking the *pg_hint_plan* extension this parameter does
1123
+ not need to be true. This is due to the central importance of that extension for the entire Postgres hinting
1124
+ system and saves some typing in that case.
1125
+
1126
+ Raises
1127
+ ------
1128
+ StateError
1129
+ If the given extension is not active
1130
+ """
1131
+ extension_is_active = self.has_extension(
1132
+ extension_name, is_shared_object=is_shared_object
1133
+ )
1134
+ if not extension_is_active:
1135
+ raise StateError(
1136
+ f"Extension '{extension_name}' is not active in database '{self.database_name()}'"
1137
+ )
1138
+
1139
+ def __eq__(self, other: object) -> bool:
1140
+ return (
1141
+ isinstance(other, type(self))
1142
+ and self.connect_string == other.connect_string
1143
+ )
1144
+
1145
+ def __hash__(self) -> int:
1146
+ return hash(self.connect_string)
1147
+
1148
+
1149
+ class PostgresSchemaInterface(DatabaseSchema):
1150
+ """Database schema implementation for Postgres systems.
1151
+
1152
+ Parameters
1153
+ ----------
1154
+ postgres_db : PostgresInterface
1155
+ The database for which schema information should be retrieved
1156
+ """
1157
+
1158
+ def __init__(self, postgres_db: PostgresInterface) -> None:
1159
+ super().__init__(postgres_db)
1160
+
1161
+ def tables(self, *, schema: str = "public") -> set[TableReference]:
1162
+ query_template = textwrap.dedent("""
1163
+ SELECT table_name
1164
+ FROM information_schema.tables
1165
+ WHERE table_catalog = %s AND table_schema = %s""")
1166
+ self._db.cursor().execute(query_template, (self._db.database_name(), schema))
1167
+ result_set = self._db.cursor().fetchall()
1168
+ assert result_set is not None
1169
+ return set(TableReference(row[0]) for row in result_set)
1170
+
1171
+ def lookup_column(
1172
+ self,
1173
+ column: ColumnReference | str,
1174
+ candidate_tables: Iterable[TableReference],
1175
+ *,
1176
+ expect_match: bool = False,
1177
+ ) -> Optional[TableReference]:
1178
+ candidate_tables = (
1179
+ set(candidate_tables)
1180
+ if len(candidate_tables) > 5
1181
+ else list(candidate_tables)
1182
+ )
1183
+ column = column.name if isinstance(column, ColumnReference) else column
1184
+ lower_col = column.lower()
1185
+
1186
+ for table in candidate_tables:
1187
+ table_columns = self._fetch_columns(table)
1188
+ if column in table_columns or lower_col in table_columns:
1189
+ return table
1190
+
1191
+ if not expect_match:
1192
+ return None
1193
+ candidate_tables = [table.qualified_name() for table in candidate_tables]
1194
+ raise ValueError(
1195
+ f"Column '{column}' not found in candidate tables {candidate_tables}"
1196
+ )
1197
+
1198
+ def is_primary_key(self, column: ColumnReference) -> bool:
1199
+ if not column.table:
1200
+ raise UnboundColumnError(column)
1201
+ if column.table.virtual:
1202
+ raise VirtualTableError(column.table)
1203
+ index_map = self._fetch_indexes(column.table)
1204
+ return index_map.get(column.name, False)
1205
+
1206
+ def has_secondary_index(self, column: ColumnReference) -> bool:
1207
+ if not column.table:
1208
+ raise UnboundColumnError(column)
1209
+ if column.table.virtual:
1210
+ raise VirtualTableError(column.table)
1211
+ index_map = self._fetch_indexes(column.table)
1212
+
1213
+ # The index map contains an entry for each attribute that actually has an index. The value is True, if the
1214
+ # attribute (which is known to be indexed) is actually the primary key.
1215
+ # Our method should return False in two cases: 1) the attribute is not indexed at all; and 2) the attribute
1216
+ # actually is the Primary key. Therefore, by assuming it is the PK in case of absence, we get the correct
1217
+ # value.
1218
+ return not index_map.get(column.name, True)
1219
+
1220
+ def indexes_on(self, column: ColumnReference) -> set[str]:
1221
+ if not column.table:
1222
+ raise UnboundColumnError(column)
1223
+ if column.table.virtual:
1224
+ raise VirtualTableError(column.table)
1225
+ schema = column.table.schema or "public"
1226
+ query_template = textwrap.dedent("""
1227
+ SELECT cls.relname
1228
+ FROM pg_index idx
1229
+ JOIN pg_class cls ON idx.indexrelid = cls.oid
1230
+ JOIN pg_class rel ON idx.indrelid = rel.oid
1231
+ JOIN pg_attribute att ON att.attnum = ANY(idx.indkey) AND idx.indrelid = att.attrelid
1232
+ JOIN pg_namespace nsp ON cls.relnamespace = nsp.oid AND rel.relnamespace = nsp.oid
1233
+ WHERE rel.relname = %s
1234
+ AND att.attname = %s
1235
+ AND nsp.nspname = %s
1236
+ """)
1237
+
1238
+ self._db.cursor().execute(
1239
+ query_template, (column.table.full_name, column.name, schema)
1240
+ )
1241
+ result_set = self._db.cursor().fetchall()
1242
+ return {row[0] for row in result_set}
1243
+
1244
+ def foreign_keys_on(self, column: ColumnReference) -> set[ColumnReference]:
1245
+ if not column.table:
1246
+ raise UnboundColumnError(column)
1247
+ if column.table.virtual:
1248
+ raise VirtualTableError(column.table)
1249
+ schema = column.table.schema or "public"
1250
+ query_template = textwrap.dedent("""
1251
+ SELECT target.table_name, target.column_name
1252
+ FROM information_schema.key_column_usage AS fk_sources
1253
+ JOIN information_schema.table_constraints AS all_constraints
1254
+ ON fk_sources.constraint_name = all_constraints.constraint_name
1255
+ AND fk_sources.table_schema = all_constraints.table_schema
1256
+ JOIN information_schema.constraint_column_usage AS target
1257
+ ON fk_sources.constraint_name = target.constraint_name
1258
+ AND fk_sources.table_schema = target.table_schema
1259
+ WHERE fk_sources.table_name = %s
1260
+ AND fk_sources.column_name = %s
1261
+ AND fk_sources.table_schema = %s
1262
+ AND all_constraints.constraint_type = 'FOREIGN KEY'
1263
+ """)
1264
+
1265
+ self._db.cursor().execute(
1266
+ query_template, (column.table.full_name, column.name, schema)
1267
+ )
1268
+ result_set = self._db.cursor().fetchall()
1269
+ return {ColumnReference(row[1], TableReference(row[0])) for row in result_set}
1270
+
1271
+ def datatype(self, column: ColumnReference) -> str:
1272
+ if not column.table:
1273
+ raise UnboundColumnError(column)
1274
+ if column.table.virtual:
1275
+ raise VirtualTableError(column.table)
1276
+ schema = column.table.schema or "public"
1277
+ query_template = textwrap.dedent("""
1278
+ SELECT data_type FROM information_schema.columns
1279
+ WHERE table_name = %s AND column_name = %s AND table_schema = %s""")
1280
+ self._db.cursor().execute(
1281
+ query_template, (column.table.full_name, column.name, schema)
1282
+ )
1283
+ result_set = self._db.cursor().fetchone()
1284
+ return result_set[0]
1285
+
1286
+ def is_nullable(self, column: ColumnReference) -> bool:
1287
+ if not column.table:
1288
+ raise UnboundColumnError(column)
1289
+ if column.table.virtual:
1290
+ raise VirtualTableError(column.table)
1291
+ schema = column.table.schema or "public"
1292
+ query_template = textwrap.dedent("""
1293
+ SELECT is_nullable = 'YES' FROM information_schema.columns
1294
+ WHERE table_name = %s AND column_name = %s AND table_schema = %s""")
1295
+ self._db.cursor().execute(
1296
+ query_template, (column.table.full_name, column.name, schema)
1297
+ )
1298
+ result_set = self._db.cursor().fetchone()
1299
+ return result_set[0]
1300
+
1301
+ def _fetch_columns(self, table: TableReference) -> list[str]:
1302
+ """Retrieves all physical columns for a given table from the PG metadata catalogs.
1303
+
1304
+ Parameters
1305
+ ----------
1306
+ table : TableReference
1307
+ The table whose columns should be loaded
1308
+
1309
+ Returns
1310
+ -------
1311
+ list[str]
1312
+ The names of all columns
1313
+
1314
+ Raises
1315
+ ------
1316
+ VirtualTableError
1317
+ If the table is a virtual table (e.g. subquery or CTE)
1318
+ """
1319
+ if table.virtual:
1320
+ raise VirtualTableError(table)
1321
+ schema = table.schema or "public"
1322
+ query_template = "SELECT column_name FROM information_schema.columns WHERE table_name = %s AND table_schema = %s"
1323
+ self._db.cursor().execute(query_template, (table.full_name, schema))
1324
+ result_set = self._db.cursor().fetchall()
1325
+ return [col[0] for col in result_set]
1326
+
1327
+ def _fetch_indexes(self, table: TableReference) -> dict[str, bool]:
1328
+ """Retrieves all index structures for a given table based on the PG metadata catalogs.
1329
+
1330
+ Parameters
1331
+ ----------
1332
+ table : TableReference
1333
+ The table whose indexes should be loaded
1334
+
1335
+ Returns
1336
+ -------
1337
+ dict
1338
+ Contains a key for each column that has an index. The column keys map to booleans that indicate whether
1339
+ the corresponding index is a primary key index. Columns without any index do not appear in the dictionary.
1340
+
1341
+ Raises
1342
+ ------
1343
+ VirtualTableError
1344
+ If the table is a virtual table (e.g. subquery or CTE)
1345
+ """
1346
+ if table.virtual:
1347
+ raise VirtualTableError(table)
1348
+ # query adapted from https://wiki.postgresql.org/wiki/Retrieve_primary_key_columns
1349
+ table_name = table.full_name
1350
+ schema = table.schema or "public"
1351
+ index_query = textwrap.dedent("""
1352
+ SELECT attr.attname, idx.indisprimary
1353
+ FROM pg_index idx
1354
+ JOIN pg_attribute attr ON idx.indrelid = attr.attrelid AND attr.attnum = ANY(idx.indkey)
1355
+ JOIN pg_class cls ON idx.indrelid = cls.oid
1356
+ JOIN pg_namespace nsp ON cls.relnamespace = nsp.oid
1357
+ WHERE cls.relname = %s
1358
+ AND nsp.nspname = %s
1359
+ """)
1360
+ self._db.cursor().execute(index_query, (table_name, schema))
1361
+ result_set = self._db.cursor().fetchall()
1362
+ index_map = dict(result_set)
1363
+ return index_map
1364
+
1365
+ def __eq__(self, other: object) -> bool:
1366
+ return isinstance(other, type(self)) and self._db == other._db
1367
+
1368
+ def __hash__(self):
1369
+ return hash(self._db)
1370
+
1371
+
1372
+ # Postgres stores its array datatypes in a more general array-type structure (anyarray).
1373
+ # However, to extract the individual entries from such an array, they need to be cast to a typed array structure.
1374
+ # This dictionary contains the necessary casts for the actual column types.
1375
+ # For example, suppose a column contains integer values. If this column is aggregated into an anyarray entry, the
1376
+ # appropriate converter for this array is int[]. In other words, _DTypeArrayConverters["integer"] = "int[]".
1377
+ _DTypeArrayConverters = {
1378
+ "integer": "int[]",
1379
+ "text": "text[]",
1380
+ "character varying": "text[]",
1381
+ }
1382
+
1383
+
1384
+ class PostgresStatisticsInterface(DatabaseStatistics):
1385
+ """Statistics implementation for Postgres systems.
1386
+
1387
+ Parameters
1388
+ ----------
1389
+ postgres_db : PostgresInterface
1390
+ The database instance for which the statistics should be retrieved
1391
+ emulated : bool, optional
1392
+ Whether the statistics interface should operate in emulation mode. To enable reproducibility, this is *True*
1393
+ by default
1394
+ enable_emulation_fallback : bool, optional
1395
+ Whether emulation should be used for unsupported statistics when running in native mode, by default True
1396
+ cache_enabled : Optional[bool], optional
1397
+ Whether emulated statistics queries should be subject to caching, by default True. Set to *None* to use the
1398
+ caching behavior of the `db`
1399
+ """
1400
+
1401
+ def __init__(
1402
+ self,
1403
+ postgres_db: PostgresInterface,
1404
+ *,
1405
+ emulated: bool = True,
1406
+ enable_emulation_fallback: bool = True,
1407
+ cache_enabled: Optional[bool] = True,
1408
+ ) -> None:
1409
+ super().__init__(
1410
+ postgres_db,
1411
+ emulated=emulated,
1412
+ enable_emulation_fallback=enable_emulation_fallback,
1413
+ cache_enabled=cache_enabled,
1414
+ )
1415
+
1416
+ def n_pages(self, table: TableReference | str) -> int:
1417
+ query_template = "SELECT relpages FROM pg_class WHERE oid = %s::regclass"
1418
+ regclass = table.full_name if isinstance(table, TableReference) else table
1419
+ self._db.cursor().execute(query_template, (regclass,))
1420
+ result_set = self._db.cursor().fetchone()
1421
+ if not result_set:
1422
+ raise ValueError(f"Could not retrieve page count for table '{table}'")
1423
+ return result_set[0]
1424
+
1425
+ def update_statistics(
1426
+ self,
1427
+ columns: Optional[ColumnReference | Iterable[ColumnReference]] = None,
1428
+ *,
1429
+ tables: Optional[TableReference | Iterable[TableReference]] = None,
1430
+ perfect_mcv: bool = False,
1431
+ perfect_n_distinct: bool = False,
1432
+ verbose: bool = False,
1433
+ ) -> None:
1434
+ """Instructs the Postgres server to update statistics for specific columns.
1435
+
1436
+ Notice that this is one of the methods of the database interface that explicitly mutates the state of the database system.
1437
+
1438
+ Parameters
1439
+ ----------
1440
+ columns : Optional[ColumnReference | Iterable[ColumnReference]], optional
1441
+ The columns for which statistics should be updated. If no columns are given, columns are inferred based on the
1442
+ `tables` and all detected columns are used.
1443
+ tables : Optional[TableReference | Iterable[TableReference]], optional
1444
+ The table for which statistics should be updated. If `columns` are given, this parameter is completely ignored. If
1445
+ no columns and no tables are given, all tables in the current database are used.
1446
+ perfect_mcv : bool, optional
1447
+ Whether the database system should attempt to create perfect statistics. Perfect statistics means that for each of
1448
+ the columns, MCV lists are created such that each distinct value is contained within the list. For large and diverse
1449
+ columns, this might require lots of compute time as well as storage space. Notice that the database system still has the
1450
+ ultimate decision on whether to generate MCV lists in the first place. Postgres also imposes a hard limit on the
1451
+ maximum allowed length of MCV lists and histogram widths.
1452
+ perfect_n_distinct : bool, optional
1453
+ Whether to set the number of distinct values to its true value.
1454
+ verbose : bool, optional
1455
+ Whether to print some progress information to standard error.
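+
+ Examples
+ --------
+ A minimal sketch, assuming a connected `PostgresInterface` called ``pg``::
+
+ stats = PostgresStatisticsInterface(pg)
+
+ # re-compute statistics for all (non-view) tables with MCV lists as exhaustive as Postgres allows
+ stats.update_statistics(perfect_mcv=True, verbose=True)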
1456
+ """
1457
+ if not columns and not tables:
1458
+ tables = [
1459
+ tab
1460
+ for tab in self._db.schema().tables()
1461
+ if not self._db.schema().is_view(tab)
1462
+ ]
1463
+ if not columns and tables:
1464
+ tables = util.enlist(tables)
1465
+ columns = util.set_union(self._db.schema().columns(tab) for tab in tables)
1466
+
1467
+ assert columns is not None
1468
+ columns: Iterable[ColumnReference] = util.enlist(columns)
1469
+ columns_map: dict[TableReference, list[str]] = util.dicts.generate_multi(
1470
+ (col.table, col.name) for col in columns
1471
+ )
1472
+ distinct_values: dict[ColumnReference, int] = {}
1473
+
1474
+ if perfect_mcv or perfect_n_distinct:
1475
+ for column in columns:
1476
+ util.logging.print_if(
1477
+ verbose,
1478
+ util.timestamp(),
1479
+ ":: Now preparing column",
1480
+ column,
1481
+ use_stderr=True,
1482
+ )
1483
+ n_distinct = round(
1484
+ self.distinct_values(column, emulated=True, cache_enabled=True)
1485
+ )
1486
+ if perfect_n_distinct:
1487
+ distinct_values[column] = n_distinct
1488
+ if not perfect_mcv:
1489
+ continue
1490
+
1491
+ stats_target_query = textwrap.dedent(f"""
1492
+ ALTER TABLE {column.table.full_name}
1493
+ ALTER COLUMN {column.name}
1494
+ SET STATISTICS {n_distinct};
1495
+ """)
1496
+ # This query might issue a warning if the requested stats target is larger than the allowed maximum value
1497
+ # However, Postgres simply uses the maximum value in this case. To permit different maximum values in different
1498
+ # Postgres versions, we accept the warning and do not use a hard-coded maximum value with snapping logic
1499
+ # ourselves.
1500
+ self._db.cursor().execute(stats_target_query)
1501
+
1502
+ columns_str = {
1503
+ table: ", ".join(col for col in columns)
1504
+ for table, columns in columns_map.items()
1505
+ }
1506
+ tables_and_columns = ", ".join(
1507
+ f"{table.full_name}({cols})" for table, cols in columns_str.items()
1508
+ )
1509
+
1510
+ util.logging.print_if(
1511
+ verbose,
1512
+ util.timestamp(),
1513
+ ":: Now analyzing columns",
1514
+ tables_and_columns,
1515
+ use_stderr=True,
1516
+ )
1517
+ query_template = f"ANALYZE {tables_and_columns}"
1518
+ self._db.cursor().execute(query_template)
1519
+
1520
+ for column, n_distinct in distinct_values.items():
1521
+ distinct_update_query = textwrap.dedent(f"""
1522
+ ALTER TABLE {column.table.full_name}
1523
+ ALTER COLUMN {column.name}
1524
+ SET (n_distinct = {n_distinct});
1525
+ """)
1526
+ self._db.cursor().execute(distinct_update_query)
1527
+
1528
+ def _retrieve_total_rows_from_stats(self, table: TableReference) -> Optional[int]:
1529
+ count_query = (
1530
+ f"SELECT reltuples FROM pg_class WHERE oid = '{table.full_name}'::regclass"
1531
+ )
1532
+ self._db.cursor().execute(count_query)
1533
+ result_set = self._db.cursor().fetchone()
1534
+ if not result_set:
1535
+ return None
1536
+ count = result_set[0]
1537
+ return count
1538
+
1539
+ def _retrieve_distinct_values_from_stats(
1540
+ self, column: ColumnReference
1541
+ ) -> Optional[int]:
1542
+ dist_query = (
1543
+ "SELECT n_distinct FROM pg_stats WHERE tablename = %s and attname = %s"
1544
+ )
1545
+ self._db.cursor().execute(dist_query, (column.table.full_name, column.name))
1546
+ result_set = self._db.cursor().fetchone()
1547
+ if not result_set:
1548
+ return None
1549
+ dist_values = result_set[0]
1550
+
1551
+ # interpreting the n_distinct column is difficult, since different value ranges indicate different things
1552
+ # (see https://www.postgresql.org/docs/current/view-pg-stats.html)
1553
+ # If the value is >= 0, it represents the actual (approximated) number of distinct non-null values in the
1554
+ # column.
1555
+ # If the value is < 0, it represents 'the negative of the number of distinct values divided by the number of
1556
+ # rows'. Therefore, we have to correct the number of distinct values manually in this case.
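+ # For example (illustrative numbers): n_distinct = -0.5 for a table with 1,000,000 rows corresponds to roughly
+ # 500,000 distinct values.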
1557
+ if dist_values >= 0:
1558
+ return dist_values
1559
+
1560
+ # correct negative values
1561
+ n_rows = self._retrieve_total_rows_from_stats(column.table)
1562
+ return -1 * n_rows * dist_values
1563
+
1564
+ def _retrieve_min_max_values_from_stats(
1565
+ self, column: ColumnReference
1566
+ ) -> Optional[tuple[Any, Any]]:
1567
+ # Postgres does not keep track of min/max values, so we need to determine them manually
1568
+ if not self.enable_emulation_fallback:
1569
+ raise UnsupportedDatabaseFeatureError(self._db, "min/max value statistics")
1570
+ return self._calculate_min_max_values(column, cache_enabled=True)
1571
+
1572
+ def _retrieve_most_common_values_from_stats(
1573
+ self, column: ColumnReference, k: int
1574
+ ) -> Sequence[tuple[Any, int]]:
1575
+ # Postgres stores the most common values in a column of type anyarray (since in this column, many MCVs from
1576
+ # many different tables and data types are present). However, this type is not very convenient to work on.
1577
+ # Therefore, we first need to convert the anyarray to an array of the actual attribute type.
1578
+
1579
+ # determine the attributes data type to figure out how it should be converted
1580
+ attribute_query = "SELECT data_type FROM information_schema.columns WHERE table_name = %s AND column_name = %s"
1581
+ self._db.cursor().execute(
1582
+ attribute_query, (column.table.full_name, column.name)
1583
+ )
1584
+ attribute_dtype = self._db.cursor().fetchone()[0]
1585
+ attribute_converter = _DTypeArrayConverters[attribute_dtype]
1586
+
1587
+ # now, load the most frequent values. Since the frequencies are expressed as a fraction of the total number of
1588
+ # rows, we need to multiply this number again to obtain the true number of occurrences
1589
+ mcv_query = textwrap.dedent(
1590
+ """
1591
+ SELECT UNNEST(most_common_vals::text::{conv}),
1592
+ UNNEST(most_common_freqs) * (SELECT reltuples FROM pg_class WHERE oid = '{tab}'::regclass)
1593
+ FROM pg_stats
1594
+ WHERE tablename = %s AND attname = %s""".format(
1595
+ conv=attribute_converter, tab=column.table.full_name
1596
+ )
1597
+ )
1598
+ self._db.cursor().execute(mcv_query, (column.table.full_name, column.name))
1599
+ return self._db.cursor().fetchall()[:k]
1600
+
1601
+
1602
+ PostgresOptimizerSettings = {
1603
+ JoinOperator.NestedLoopJoin: "enable_nestloop",
1604
+ JoinOperator.HashJoin: "enable_hashjoin",
1605
+ JoinOperator.SortMergeJoin: "enable_mergejoin",
1606
+ ScanOperator.SequentialScan: "enable_seqscan",
1607
+ ScanOperator.IndexScan: "enable_indexscan",
1608
+ ScanOperator.IndexOnlyScan: "enable_indexonlyscan",
1609
+ ScanOperator.BitmapScan: "enable_bitmapscan",
1610
+ IntermediateOperator.Memoize: "enable_memoize",
1611
+ IntermediateOperator.Materialize: "enable_material",
1612
+ IntermediateOperator.Sort: "enable_sort",
1613
+ }
1614
+ """All (session-global) optimizer settings that modify the allowed physical operators."""
1615
+
1616
+ PGHintPlanOptimizerHints: dict[PhysicalOperator, str] = {
1617
+ JoinOperator.NestedLoopJoin: "NestLoop",
1618
+ JoinOperator.HashJoin: "HashJoin",
1619
+ JoinOperator.SortMergeJoin: "MergeJoin",
1620
+ ScanOperator.SequentialScan: "SeqScan",
1621
+ ScanOperator.IndexScan: "IndexOnlyScan",
1622
+ ScanOperator.IndexOnlyScan: "IndexOnlyScan",
1623
+ ScanOperator.BitmapScan: "BitmapScan",
1624
+ IntermediateOperator.Memoize: "Memoize",
1625
+ }
1626
+ """All physical operators that can be enforced by pg_hint_plan.
1627
+
1628
+ These settings operate on a per-relation basis and overwrite the session-global optimizer settings.
1629
+
1630
+ References
1631
+ ----------
1632
+
1633
+ .. pg_hint_plan hints: https://github.com/ossc-db/pg_hint_plan/blob/master/docs/hint_list.md
1634
+ """
1635
+
1636
+ PGLabOptimizerHints: dict[PhysicalOperator, str] = {
1637
+ JoinOperator.NestedLoopJoin: "NestLoop",
1638
+ JoinOperator.HashJoin: "HashJoin",
1639
+ JoinOperator.SortMergeJoin: "MergeJoin",
1640
+ ScanOperator.SequentialScan: "SeqScan",
1641
+ ScanOperator.IndexScan: "IdxScan",
1642
+ ScanOperator.IndexOnlyScan: "IdxScan",
1643
+ ScanOperator.BitmapScan: "BitmapScan",
1644
+ IntermediateOperator.Materialize: "Material",
1645
+ IntermediateOperator.Memoize: "Memo",
1646
+ }
1647
+ """All physical operators that can be enforced by pg_lab.
1648
+
1649
+ These settings operate on a per-relation basis and overwrite the session-global optimizer settings.
1650
+
1651
+ References
1652
+ ----------
1653
+
1654
+ .. pg_lab extension: https://github.com/rbergm/pg_lab/blob/main/docs/hinting.md
1655
+
1656
+ """
1657
+
1658
+
1659
+ PostgresJoinHints = {
1660
+ JoinOperator.NestedLoopJoin,
1661
+ JoinOperator.HashJoin,
1662
+ JoinOperator.SortMergeJoin,
1663
+ }
1664
+ """All join operators that are supported by Postgres."""
1665
+
1666
+ PostgresScanHints = {
1667
+ ScanOperator.SequentialScan,
1668
+ ScanOperator.IndexScan,
1669
+ ScanOperator.IndexOnlyScan,
1670
+ ScanOperator.BitmapScan,
1671
+ }
1672
+ """All scan operators that are supported by Postgres."""
1673
+
1674
+ PostgresPlanHints = {
1675
+ HintType.Cardinality,
1676
+ HintType.Parallelization,
1677
+ HintType.LinearJoinOrder,
1678
+ HintType.BushyJoinOrder,
1679
+ HintType.JoinDirection,
1680
+ HintType.Operator,
1681
+ }
1682
+ """All non-operator hints supported by Postgres, that can be used to enforce additional optimizer behaviour."""
1683
+
1684
+
1685
+ class PostgresExplainClause(Explain):
1686
+ """A specialized *EXPLAIN* clause implementation to handle Postgres custom syntax for query plans.
1687
+
1688
+ If *ANALYZE* is enabled, this also retrieves information about shared buffer usage (page hits and disk reads).
1689
+
1690
+ Parameters
1691
+ ----------
1692
+ original_clause : Explain
1693
+ The actual *EXPLAIN* clause. The new explain clause acts as a decorator around the original clause.
1694
+ """
1695
+
1696
+ def __init__(self, original_clause: Explain) -> None:
1697
+ super().__init__(original_clause.analyze, original_clause.target_format)
1698
+
1699
+ def __str__(self) -> str:
1700
+ explain_args = "(SETTINGS, "
1701
+ if self.analyze:
1702
+ explain_args += "ANALYZE, BUFFERS, "
1703
+ explain_args += f"FORMAT {self.target_format})"
1704
+ return f"EXPLAIN {explain_args}"
1705
+
1706
+
1707
+ class PostgresLimitClause(Limit):
1708
+ """A specialized *LIMIT* clause implementation to handle Postgres custom syntax for limits / offsets
1709
+
1710
+ Parameters
1711
+ ----------
1712
+ original_clause : Limit
1713
+ The actual *LIMIT* clause. The new limit clause acts as a decorator around the original clause.
1714
+ """
1715
+
1716
+ def __init__(self, original_clause: Limit) -> None:
1717
+ super().__init__(
1718
+ limit=original_clause.limit,
1719
+ offset=original_clause.offset,
1720
+ fetch_direction=original_clause.fetch_direction,
1721
+ )
1722
+
1723
+ def __str__(self) -> str:
1724
+ if self.fetch_direction != "first":
1725
+ return super().__str__()
1726
+
1727
+ if self.limit and self.offset:
1728
+ return f"LIMIT {self.limit} OFFSET {self.offset}"
1729
+ elif self.limit:
1730
+ return f"LIMIT {self.limit}"
1731
+ elif self.offset:
1732
+ return f"OFFSET {self.offset}"
1733
+ else:
1734
+ return ""
1735
+
1736
+
1737
+ def _replace_postgres_cast_expressions(expression: SqlExpression) -> SqlExpression:
1738
+ """Wraps a given expression by a `_PostgresCastExpression` if necessary.
1739
+
1740
+ This is the replacement method required by the `replace_expressions` transformation. It wraps all `CastExpression`
1741
+ instances by a `_PostgresCastExpression` and leaves all other expressions intact.
1742
+
1743
+ Parameters
1744
+ ----------
1745
+ expression : SqlExpression
1746
+ The expression to check
1747
+
1748
+ Returns
1749
+ -------
1750
+ SqlExpression
1751
+ A potentially wrapped version of the original expression
1752
+
1753
+ See Also
1754
+ --------
1755
+ transform.replace_expressions
1756
+ """
1757
+ target = type(expression)
1758
+ match expression:
1759
+ case StaticValueExpression() | ColumnExpression() | StarExpression():
1760
+ return expression
1761
+ case SubqueryExpression(query):
1762
+ replaced_subquery = transform.replace_expressions(
1763
+ query, _replace_postgres_cast_expressions
1764
+ )
1765
+ return target(replaced_subquery)
1766
+ case CaseExpression(cases, else_expr):
1767
+ replaced_cases: list[tuple[AbstractPredicate, SqlExpression]] = []
1768
+ for condition, result in cases:
1769
+ replaced_condition = _replace_postgres_cast_expressions(condition)
1770
+ replaced_result = _replace_postgres_cast_expressions(result)
1771
+ replaced_cases.append((replaced_condition, replaced_result))
1772
+ replaced_else = (
1773
+ _replace_postgres_cast_expressions(else_expr) if else_expr else None
1774
+ )
1775
+ return target(replaced_cases, else_expr=replaced_else)
1776
+ case CastExpression(cast, typ, params):
1777
+ replaced_cast = _replace_postgres_cast_expressions(cast)
1778
+ # return _PostgresCastExpression(replaced_cast, typ, type_params=params)
1779
+ return CastExpression(replaced_cast, typ, params)
1780
+ case MathExpression(op, lhs, rhs):
1781
+ replaced_lhs = _replace_postgres_cast_expressions(lhs)
1782
+ rhs = util.enlist(rhs) if rhs else []
1783
+ replaced_rhs = [_replace_postgres_cast_expressions(expr) for expr in rhs]
1784
+ return target(op, replaced_lhs, replaced_rhs)
1785
+ case ArrayAccessExpression(array, ind, lo, hi):
1786
+ replaced_arr = _replace_postgres_cast_expressions(array)
1787
+ replaced_ind = (
1788
+ _replace_postgres_cast_expressions(ind) if ind is not None else None
1789
+ )
1790
+ replaced_lo = (
1791
+ _replace_postgres_cast_expressions(lo) if lo is not None else None
1792
+ )
1793
+ replaced_hi = (
1794
+ _replace_postgres_cast_expressions(hi) if hi is not None else None
1795
+ )
1796
+ return target(
1797
+ replaced_arr,
1798
+ idx=replaced_ind,
1799
+ lower_idx=replaced_lo,
1800
+ upper_idx=replaced_hi,
1801
+ )
1802
+ case FunctionExpression(fn, args, distinct, cond):
1803
+ replaced_args = [_replace_postgres_cast_expressions(arg) for arg in args]
1804
+ replaced_cond = _replace_postgres_cast_expressions(cond) if cond else None
1805
+ return FunctionExpression(
1806
+ fn, replaced_args, distinct=distinct, filter_where=replaced_cond
1807
+ )
1808
+ case WindowExpression(fn, parts, ordering, cond):
1809
+ replaced_fn = _replace_postgres_cast_expressions(fn)
1810
+ replaced_parts = [
1811
+ _replace_postgres_cast_expressions(part) for part in parts
1812
+ ]
1813
+ replaced_cond = _replace_postgres_cast_expressions(cond) if cond else None
1814
+
1815
+ replaced_order_exprs: list[OrderByExpression] = []
1816
+ for order in ordering or []:
1817
+ replaced_expr = _replace_postgres_cast_expressions(order.column)
1818
+ replaced_order_exprs.append(
1819
+ OrderByExpression(replaced_expr, order.ascending, order.nulls_first)
1820
+ )
1821
+ replaced_ordering = (
1822
+ OrderBy(replaced_order_exprs) if replaced_order_exprs else None
1823
+ )
1824
+
1825
+ return target(
1826
+ replaced_fn,
1827
+ partitioning=replaced_parts,
1828
+ ordering=replaced_ordering,
1829
+ filter_condition=replaced_cond,
1830
+ )
1831
+ case BinaryPredicate(op, lhs, rhs):
1832
+ replaced_lhs = _replace_postgres_cast_expressions(lhs)
1833
+ replaced_rhs = _replace_postgres_cast_expressions(rhs)
1834
+ return target(op, replaced_lhs, replaced_rhs)
1835
+ case BetweenPredicate(col, lo, hi):
1836
+ replaced_col = _replace_postgres_cast_expressions(col)
1837
+ replaced_lo = _replace_postgres_cast_expressions(lo)
1838
+ replaced_hi = _replace_postgres_cast_expressions(hi)
1839
+ return BetweenPredicate(replaced_col, (replaced_lo, replaced_hi))
1840
+ case InPredicate(col, vals):
1841
+ replaced_col = _replace_postgres_cast_expressions(col)
1842
+ replaced_vals = [_replace_postgres_cast_expressions(val) for val in vals]
1843
+ return target(replaced_col, replaced_vals)
1844
+ case UnaryPredicate(col, op):
1845
+ replaced_col = _replace_postgres_cast_expressions(col)
1846
+ return target(replaced_col, op)
1847
+ case CompoundPredicate(op, children) if op in {
1848
+ CompoundOperator.And,
1849
+ CompoundOperator.Or,
1850
+ }:
1851
+ replaced_children = [
1852
+ _replace_postgres_cast_expressions(child) for child in children
1853
+ ]
1854
+ return target(op, replaced_children)
1855
+ case CompoundPredicate(op, child) if op == CompoundOperator.Not:
1856
+ replaced_child = _replace_postgres_cast_expressions(child)
1857
+ return target(op, replaced_child)
1858
+ case _:
1859
+ raise ValueError(
1860
+ f"Unsupported expression type {type(expression)}: {expression}"
1861
+ )
1862
+
1863
+
1864
+ PostgresHintingBackend = Literal["pg_hint_plan", "pg_lab", "none"]
1865
+ """The hinting backend being used.
1866
+
1867
+ If pg_lab is available, this is the preferred extension. Otherwise, pg_hint_plan is used as a fallback.
1868
+ If the hint service is inactive, the backend is set to _none_.
1869
+ """
1870
+
1871
+
1872
+ def _walk_join_order(node: JoinTree) -> str:
1873
+ if node.is_scan():
1874
+ return node.base_table.identifier()
1875
+
1876
+ outer = _walk_join_order(node.outer_child)
1877
+ inner = _walk_join_order(node.inner_child)
1878
+ return f"({outer} {inner})"
1879
+
1880
+
1881
+ def _generate_pghintplan_hints(
1882
+ query: SqlQuery,
1883
+ join_order: Optional[JoinTree],
1884
+ phys_ops: Optional[PhysicalOperatorAssignment],
1885
+ plan_params: Optional[PlanParameterization],
1886
+ *,
1887
+ pg_instance: PostgresInterface,
1888
+ ) -> Hint:
1889
+ hints: list[str] = []
1890
+ prep_statements: list[str] = []
1891
+ used_parallel: bool = False
1892
+
1893
+ geqo_thresh: str = pg_instance.config["geqo_threshold"]
1894
+ if len(query.tables()) > int(geqo_thresh):
1895
+ warnings.warn(
1896
+ "Temporarily disabling GEQO. pg_hint_plan only works with the DP optimizer.",
1897
+ category=HintWarning,
1898
+ )
1899
+ hints.append("Set(geqo off)")
1900
+
1901
+ if join_order and len(join_order) > 1:
1902
+ join_str = _walk_join_order(join_order)
1903
+ hints.append(f"Leading({join_str})")
1904
+
1905
+ if phys_ops:
1906
+ for scan in phys_ops.scan_operators.values():
1907
+ op = PGHintPlanOptimizerHints[scan.operator]
1908
+ tab = scan.table.identifier()
1909
+ hints.append(f"{op}({tab})")
1910
+ if scan.parallel_workers > 1 and not used_parallel:
1911
+ hints.append(f"Parallel({tab} {scan.parallel_workers} hard)")
1912
+ used_parallel = True
1913
+ elif used_parallel:
1914
+ warnings.warn(
1915
+ "Cannot set multiple parallel hints for Postgres. Ignoring additional hints.",
1916
+ category=HintWarning,
1917
+ )
1918
+
1919
+ for join in phys_ops.join_operators.values():
1920
+ op = PGHintPlanOptimizerHints[join.operator]
1921
+ intermediate = " ".join(tab.identifier() for tab in join.intermediate)
1922
+ hints.append(f"{op}({intermediate})")
1923
+ if join.parallel_workers > 1 and not used_parallel:
1924
+ warnings.warn(
1925
+ "Cannot directly set parallel workers on a join with pg_hint_plan. "
1926
+ "Setting on all base tables instead.",
1927
+ category=HintWarning,
1928
+ )
1929
+ for tab in join.intermediate:
1930
+ hints.append(
1931
+ f"Parallel({tab.identifier()} {join.parallel_workers} hard)"
1932
+ )
1933
+ elif used_parallel:
1934
+ warnings.warn(
1935
+ "Cannot set multiple parallel hints for Postgres. Ignoring additional hints.",
1936
+ category=HintWarning,
1937
+ )
1938
+
1939
+ for tabs, intermediate_op in phys_ops.intermediate_operators.items():
1940
+ op = PGHintPlanOptimizerHints.get(intermediate_op)
1941
+ if not op:
1942
+ warnings.warn(
1943
+ f"Cannot enforce operator {intermediate_op} with pg_hint_plan. Ignoring this hint",
1944
+ category=HintWarning,
1945
+ )
1946
+ continue
1947
+ intermediate = " ".join(tab.identifier() for tab in tabs)
1948
+ hints.append(f"{op}({intermediate})")
1949
+
1950
+ for op, val in phys_ops.global_settings.items():
1951
+ setting = PostgresOptimizerSettings[op]
1952
+ hints.append(f"Set({setting} {val})")
1953
+
1954
+ if plan_params:
1955
+ for tabs, card in plan_params.cardinalities.items():
1956
+ if card.isnan():
1957
+ continue
1958
+
1959
+ intermediate = " ".join(tab.identifier() for tab in tabs)
1960
+ if card.isinf():
1961
+ warnings.warn(
1962
+ f"Ignoring infinite cardinality for intermediate {intermediate}",
1963
+ category=HintWarning,
1964
+ )
1965
+ continue
1966
+
1967
+ hints.append(f"Rows({intermediate} #{card.value})")
1968
+
1969
+ for tabs, workers in plan_params.parallel_workers.items():
1970
+ if workers == 1:
1971
+ continue
1972
+ elif used_parallel:
1973
+ warnings.warn(
1974
+ "Cannot set multiple parallel hints for Postgres. Ignoring additional hints.",
1975
+ category=HintWarning,
1976
+ )
1977
+ continue
1978
+
1979
+ intermediate = " ".join(tab.identifier() for tab in tabs)
1980
+ hints.append(f"Parallel({intermediate} {workers} hard)")
1981
+ used_parallel = True
1982
+
1983
+ for setting, val in plan_params.system_settings.items():
1984
+ # TODO: we could be smart here and differentiate between settings that only affect the optimizer and settings
1985
+ # that also affect the execution engine. The former can be set in pg_hint_plan via Set(...), while the latter
1986
+ # must be set via a preparatory SET statement. We should avoid this second case if at all possible since it
1987
+ # affects the entire session and not just the current query.
1988
+ # For now, we mitigate this issue in a different way: we emit SET LOCAL statements which only modify the
1989
+ # current transaction. Since the Postgres interface runs in autocommit mode, each query is executed within
1990
+ # its own transaction. Therefore, all changes are reverted immediately after the query has finished.
1991
+ prep_statements.append(f"SET LOCAL {setting} TO '{val}';")
1992
+
1993
+ if plan_params.execution_mode is not None:
1994
+ warnings.warn(
1995
+ "pg_hint_plan does not support execution mode hints",
1996
+ category=HintWarning,
1997
+ )
1998
+
1999
+ hints = [f" {line}" for line in hints]
2000
+ hints.insert(0, "/*+")
2001
+ hints.append(" */")
2002
+
2003
+ return Hint("\n".join(prep_statements), "\n".join(hints))
2004
+
2005
+
2006
+ def _generate_pglab_hints(
2007
+ join_order: Optional[JoinTree],
2008
+ phys_ops: Optional[PhysicalOperatorAssignment],
2009
+ plan_params: Optional[PlanParameterization],
2010
+ ) -> Hint:
2011
+ hints: list[str] = []
2012
+ prep_statements: list[str] = []
2013
+
2014
+ has_worker_params = plan_params and plan_params.parallel_workers
2015
+ used_parallel = False
2016
+
2017
+ if has_worker_params and not phys_ops:
2018
+ warnings.warn(
2019
+ "pg_lab can only force parallel execution of nodes with known operators. Ignoring worker hints.",
2020
+ category=HintWarning,
2021
+ )
2022
+ elif has_worker_params:
2023
+ has_dangling_worker_hints = any(
2024
+ intermediate not in phys_ops
2025
+ for intermediate in plan_params.parallel_workers
2026
+ )
2027
+ if has_dangling_worker_hints:
2028
+ warnings.warn(
2029
+ "pg_lab can only force parallel execution of nodes with known operators. Ignoring additional hints.",
2030
+ category=HintWarning,
2031
+ )
2032
+ phys_ops = phys_ops.integrate_workers_from(plan_params)
2033
+
2034
+ hints.append("Config(plan_mode=anchored)")
2035
+
2036
+ if join_order and len(join_order) > 1:
2037
+ join_str = _walk_join_order(join_order)
2038
+ hints.append(f"JoinOrder({join_str})")
2039
+
2040
+ if phys_ops:
2041
+ for scan in phys_ops.scan_operators.values():
2042
+ op = PGLabOptimizerHints[scan.operator]
2043
+ table = scan.table.identifier()
2044
+
2045
+ if scan.parallel_workers > 1 and not used_parallel:
2046
+ # TODO: check for off-by-one errors!!!
2047
+ hint = f"{op}({table} (workers={scan.parallel_workers}))"
2048
+ used_parallel = True
2049
+ elif scan.parallel_workers > 1 and used_parallel:
2050
+ warnings.warn(
2051
+ "Cannot set multiple parallel hints for Postgres. Ignoring additional hints.",
2052
+ category=HintWarning,
2053
+ )
2054
+ else:
2055
+ hint = f"{op}({table})"
2056
+ hints.append(hint)
2057
+
2058
+ for join in phys_ops.join_operators.values():
2059
+ op = PGLabOptimizerHints[join.operator]
2060
+ intermediate = " ".join(tab.identifier() for tab in join.intermediate)
2061
+
2062
+ if join.parallel_workers > 1 and not used_parallel:
2063
+ hint = f"{op}({intermediate} (workers={join.parallel_workers}))"
2064
+ used_parallel = True
2065
+ elif join.parallel_workers > 1 and used_parallel:
2066
+ warnings.warn(
2067
+ "Cannot set multiple parallel hints for Postgres. Ignoring additional hints.",
2068
+ category=HintWarning,
2069
+ )
2070
+ else:
2071
+ hint = f"{op}({intermediate})"
2072
+ hints.append(hint)
2073
+
2074
+ for tabs, intermediate_op in phys_ops.intermediate_operators.items():
2075
+ op = PGLabOptimizerHints[intermediate_op]
2076
+ intermediate = " ".join(tab.identifier() for tab in tabs)
2077
+ hints.append(f"{op}({intermediate})")
2078
+
2079
+ for op, enabled in phys_ops.global_settings.items():
2080
+ setting = PostgresOptimizerSettings[op]
2081
+ value = "on" if enabled else "off"
2082
+ hints.append(f"Set({setting} = '{value}')")
2083
+
2084
+ if plan_params:
2085
+ for tabs, card in plan_params.cardinalities.items():
2086
+ if card.isnan():
2087
+ continue
2088
+
2089
+ intermediate = " ".join(tab.identifier() for tab in tabs)
2090
+ if card.isinf():
2091
+ warnings.warn(
2092
+ f"Ignoring infinite cardinality for intermediate {intermediate}",
2093
+ category=HintWarning,
2094
+ )
2095
+ continue
2096
+
2097
+ hints.append(f"Card({intermediate} #{card})")
2098
+
2099
+ for setting, val in plan_params.system_settings.items():
2100
+ hints.append(f"Set({setting} = '{val}')")
2101
+
2102
+ if plan_params.execution_mode is not None:
2103
+ mode = (
2104
+ "sequential"
2105
+ if plan_params.execution_mode == "sequential"
2106
+ else "parallel"
2107
+ )
2108
+ hints.append(f"Config(exec_mode={mode})")
2109
+
2110
+ hints = [f" {line}" for line in hints]
2111
+ hints.insert(0, "/*=pg_lab=")
2112
+ hints.append(" */")
2113
+
2114
+ return Hint("\n".join(prep_statements), "\n".join(hints))
2115
+
2116
+
2117
+ def _extract_plan_join_order(plan: QueryPlan) -> str:
2118
+ if plan.is_scan():
2119
+ return plan.base_table.identifier()
2120
+ elif plan.input_node:
2121
+ return _extract_plan_join_order(plan.input_node)
2122
+
2123
+ outer = _extract_plan_join_order(plan.outer_child)
2124
+ inner = _extract_plan_join_order(plan.inner_child)
2125
+ return f"({outer} {inner})"
2126
+
2127
+
2128
+ def _iter_plan_bfs(plan: QueryPlan) -> Generator[QueryPlan, None, None]:
2129
+ queue = collections.deque([plan])
2130
+ while queue:
2131
+ node = queue.popleft()
2132
+ queue.extend(node.children)
2133
+ yield node
2134
+
2135
+
2136
+ def _generate_pglab_plan(
2137
+ plan: QueryPlan,
2138
+ ) -> Hint:
2139
+ hints: list[str] = ["Config(plan_mode=full)"]
2140
+ join_order = _extract_plan_join_order(plan)
2141
+ hints.append(f"JoinOrder({join_order})")
2142
+
2143
+ used_parallel = False
2144
+ in_upperrel = True
2145
+ par_workers: Optional[int] = None
2146
+ for node in _iter_plan_bfs(plan):
2147
+ if node.is_scan() or node.is_join():
2148
+ in_upperrel = False
2149
+
2150
+ par_workers = (
2151
+ node.parallel_workers if node.parallel_workers > 0 else par_workers
2152
+ )
2153
+ if in_upperrel and par_workers and not used_parallel:
2154
+ hints.append(f"Result(workers={par_workers})")
2155
+ used_parallel = True
2156
+ par_workers = None
2157
+ elif in_upperrel and par_workers and used_parallel:
2158
+ warnings.warn(
2159
+ "Cannot set multiple parallel hints for Postgres. Ignoring additional hints.",
2160
+ category=HintWarning,
2161
+ )
2162
+
2163
+ operator = PGLabOptimizerHints.get(node.operator)
2164
+ intermediate = " ".join(tab.identifier() for tab in node.tables())
2165
+
2166
+ if operator:
2167
+ if par_workers and not used_parallel:
2168
+ metadata = f" (workers={par_workers})"
2169
+ par_workers = None
2170
+ used_parallel = True
2171
+ elif par_workers and used_parallel:
2172
+ metadata = ""
2173
+ warnings.warn(
2174
+ "Cannot set multiple parallel hints for Postgres. Ignoring additional hints.",
2175
+ category=HintWarning,
2176
+ )
2177
+ else:
2178
+ metadata = ""
2179
+
2180
+ hints.append(f"{operator}({intermediate}{metadata})")
2181
+
2182
+ card = node.actual_cardinality or node.estimated_cardinality
2183
+ if operator and card.is_valid():
2184
+ hints.append(f"Card({intermediate} #{card})")
2185
+
2186
+ hints = [f" {line}" for line in hints]
2187
+ hints.insert(0, "/*=pg_lab=")
2188
+ hints.append(" */")
2189
+ return Hint("", "\n".join(hints))
2190
+
2191
+
2192
+ class PostgresHintService(HintService):
2193
+ """Postgres-specific implementation of the hinting capabilities.
2194
+
2195
+ Most importantly, this service implements a mapping from the abstract optimization decisions (join order + operators) to
2196
+ their counterparts in the hinting backend and integrates Postgres' few deviations from standard SQL syntax (*CAST*
2197
+ expressions and *LIMIT* clauses).
2198
+
2199
+ The hinting service supports two different backends: pg_lab and pg_hint_plan. The former is the preferred option
2200
+ since it provides cardinality hints for base joins and does not require management of the GeQO optimizer.
2201
+
2202
+ Notice that by delegating the adaptation of Postgres' native optimizer to the pg_hint_plan extension, a couple of
2203
+ undesired side-effects have to be accepted:
2204
+
2205
+ 1. forcing a join order also involves forcing a specific join direction. Our implementation applies a couple of heuristics
2206
+ to mitigate a bad impact on performance
2207
+ 2. the extension only instruments the dynamic programming-based optimizer. If the *geqo_threshold* is reached and the
2208
+ genetic optimizer takes over, no modifications are applied. Therefore, it is best to disable GeQO while working with
2209
+ Postgres. At the same time, this means that certain scenarios like custom cardinality estimation for the genetic
2210
+ optimizer cannot currently be tested
2211
+
2212
+ Parameters
2213
+ ----------
2214
+ postgres_db : PostgresInterface
2215
+ A postgres database with an active hinting backend (pg_hint_plan or pg_lab)
2216
+
2217
+ Raises
2218
+ ------
2219
+ ValueError
2220
+ If the supplied `postgres_db` does not have a supported hinting backend enabled.
2221
+
2222
+ See Also
2223
+ --------
2224
+ _generate_pg_join_order_hint
2225
+
2226
+ References
2227
+ ----------
2228
+
2229
+ .. pg_hint_plan extension: https://github.com/ossc-db/pg_hint_plan
2230
+ .. Postgres query planning configuration: https://www.postgresql.org/docs/current/runtime-config-query.html
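+
+ Examples
+ --------
+ A minimal sketch, assuming a connected `PostgresInterface` called ``pg``, a parsed `SqlQuery` called ``query`` and a
+ pre-computed `PhysicalOperatorAssignment` called ``operators``::
+
+ hinting = PostgresHintService(pg)
+ hinted_query = hinting.generate_hints(query, physical_operators=operators)
+ pg.execute_query(hinted_query)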
2231
+ """
2232
+
2233
+ def __init__(self, postgres_db: PostgresInterface) -> None:
2234
+ self._postgres_db = postgres_db
2235
+ self._inactive = True
2236
+ self._backend = "none"
2237
+ self._infer_pg_backend()
2238
+
2239
+ def _get_backend(self) -> PostgresHintingBackend:
2240
+ return self._backend
2241
+
2242
+ def _set_backend(self, backend_name: PostgresHintingBackend) -> None:
2243
+ self._inactive = backend_name == "none"
2244
+ self._backend = backend_name
2245
+
2246
+ backend = property(_get_backend, _set_backend, doc="The hinting backend in use.")
2247
+
2248
+ def generate_hints(
2249
+ self,
2250
+ query: SqlQuery,
2251
+ plan: Optional[QueryPlan] = None,
2252
+ *,
2253
+ join_order: Optional[JoinTree] = None,
2254
+ physical_operators: Optional[PhysicalOperatorAssignment] = None,
2255
+ plan_parameters: Optional[PlanParameterization] = None,
2256
+ ) -> SqlQuery:
2257
+ self._assert_active_backend()
2258
+
2259
+ adapted_query = query
2260
+ if adapted_query.explain and not isinstance(
2261
+ adapted_query.explain, PostgresExplainClause
2262
+ ):
2263
+ adapted_query = transform.replace_clause(
2264
+ adapted_query, PostgresExplainClause(adapted_query.explain)
2265
+ )
2266
+ if adapted_query.limit_clause and not isinstance(
2267
+ adapted_query.limit_clause, PostgresLimitClause
2268
+ ):
2269
+ adapted_query = transform.replace_clause(
2270
+ adapted_query, PostgresLimitClause(adapted_query.limit_clause)
2271
+ )
2272
+
2273
+ has_param = any(
2274
+ param is not None
2275
+ for param in (join_order, physical_operators, plan_parameters)
2276
+ )
2277
+ if plan is not None and has_param:
2278
+ raise ValueError(
2279
+ "Can only hint an entire query plan, or individual parts, not both."
2280
+ )
2281
+
2282
+ match self._backend:
2283
+ case "pg_hint_plan":
2284
+ if plan is not None:
2285
+ join_order = jointree_from_plan(plan)
2286
+ physical_operators = operators_from_plan(
2287
+ plan, include_workers=False
2288
+ )
2289
+ plan_parameters = parameters_from_plan(
2290
+ plan, target_cardinality="actual", fallback_estimated=True
2291
+ )
2292
+
2293
+ hints = _generate_pghintplan_hints(
2294
+ query,
2295
+ join_order,
2296
+ physical_operators,
2297
+ plan_parameters,
2298
+ pg_instance=self._postgres_db,
2299
+ )
2300
+ case "pg_lab" if plan is not None:
2301
+ hints = _generate_pglab_plan(plan)
2302
+ case "pg_lab":
2303
+ hints = _generate_pglab_hints(
2304
+ join_order,
2305
+ physical_operators,
2306
+ plan_parameters,
2307
+ )
2308
+
2309
+ query = transform.add_clause(adapted_query, hints)
2310
+ return query
2311
+
2312
+ def format_query(self, query: SqlQuery) -> str:
2313
+ if query.explain:
2314
+ query = transform.replace_clause(
2315
+ query, PostgresExplainClause(query.explain)
2316
+ )
2317
+ return formatter.format_quick(query, flavor="postgres")
2318
+
2319
+ def supports_hint(self, hint: PhysicalOperator | HintType) -> bool:
2320
+ self._assert_active_backend()
2321
+ return hint in PostgresJoinHints | PostgresScanHints | PostgresPlanHints
2322
+
2323
+ def describe(self) -> dict[str, str]:
2324
+ """Provides a JSON-serializable description of the hint service.
2325
+
2326
+ Returns
2327
+ -------
2328
+ dict[str, str]
2329
+ Information about the hinting backend
2330
+ """
2331
+ return {"backend": self._backend}
2332
+
2333
+ def _infer_pg_backend(self) -> None:
2334
+ """Determines the hinting backend that is provided by the current Postgres instance."""
2335
+
2336
+ # We first try the easy route: checking whether any of the settings related to the hinting backends are available and
2337
+ # activated. If this is the case, we are already done.
2338
+ # Otherwise, we need to become more creative and rely on more advanced heuristics.
2339
+ # Note that on recent installations of Postgres/pg_hint_plan or pg_lab, we can expect that the easy route does indeed
2340
+ # work. It is just on older versions that the settings were not available.
2341
+
2342
+ cur = self._postgres_db.cursor()
2343
+ try:
2344
+ cur.execute("SHOW pg_hint_plan.enable_hint;")
2345
+ res = cur.fetchone()
2346
+ if res and res[0] == "on":
2347
+ util.logging.print_if(
2348
+ self._postgres_db.debug,
2349
+ "Using pg_hint_plan hinting backend",
2350
+ file=sys.stderr,
2351
+ )
2352
+ self._inactive = False
2353
+ self._backend = "pg_hint_plan"
2354
+ return
2355
+ except psycopg.errors.UndefinedObject:
2356
+ pass
2357
+
2358
+ try:
2359
+ cur.execute("SHOW enable_pglab;")
2360
+ res = cur.fetchone()
2361
+ if res and res[0] == "on":
2362
+ util.logging.print_if(
2363
+ self._postgres_db.debug,
2364
+ "Using pg_lab hinting backend",
2365
+ file=sys.stderr,
2366
+ )
2367
+ self._inactive = False
2368
+ self._backend = "pg_lab"
2369
+ return
2370
+ except psycopg.errors.UndefinedObject:
2371
+ pass
2372
+
2373
+ # At this point the easy route failed and we need to rely on more advanced heuristics.
2374
+ # Specifically, we try to check whether a shared library related to one of the backends is currently loaded
2375
+ # in the backend process. See the later comment for the reasoning.
2376
+ #
2377
+ # All code below should be considered legacy and we might in fact remove it entirely in future versions of PostBOUND.
2378
+
2379
+ if os.name != "posix":
2380
+ warnings.warn(
2381
+ "It seems you are running PostBOUND on a non-POSIX system. "
2382
+ "Please beware that PostBOUND is currently not intended to run on different systems and "
2383
+ "there might be (many) dragons. "
2384
+ "Proceed at your own risk. "
2385
+ "We assume that the Postgres server has pg_hint_plan enabled. "
2386
+ "Please set the backend property to pg_lab manually if you are using pg_lab."
2387
+ )
2388
+ self._backend = "pg_hint_plan"
2389
+ self._inactive = False
2390
+ return
2391
+
2392
+ connection = self._postgres_db.connection()
2393
+ backend_pid = connection.info.backend_pid
2394
+ hostname = connection.info.host
2395
+
2396
+ # Postgres does not provide a direct method to determine which extensions are currently active if they have only
2397
+ # been loaded as a shared library (as is the case for both pg_hint_plan and pg_lab). Therefore, we have to rely on
2398
+ # the assumption that the Postgres server is running on the same (virtual) machine as our PostBOUND process and can
2399
+ # rely on the operating system to determine open files of the backend process (which will include the shared libraries)
2400
+
2401
+ if sys.platform == "darwin":
2402
+ pg_candidates = subprocess.run(
2403
+ ["lsof -p " + str(backend_pid) + " | awk '/postgres/{print $1}'"],
2404
+ capture_output=True,
2405
+ shell=True,
2406
+ text=True,
2407
+ )
2408
+ else:
2409
+ pg_candidates = subprocess.run(
2410
+ ["ps -aux | awk '/" + str(backend_pid) + "/{print $11}'"],
2411
+ capture_output=True,
2412
+ shell=True,
2413
+ text=True,
2414
+ )
2415
+ found_pg = any(
2416
+ candidate.lower().startswith("postgres")
2417
+ for candidate in pg_candidates.stdout.split()
2418
+ )
2419
+
2420
+ # There are some rare edge cases where our heuristics fail. We have to accept them for now, but should improve the
2421
+ # backend detection in the future. Most importantly, the heuristic will pass if we are connected to a remote server
2422
+ # on localhost (e.g. via SSH tunneling or WSL instances) and there is a different Postgres server running on the same
2423
+ # machine as the PostBOUND process. In this case, our heuristics assume that these are the same servers.
2424
+ # In the future, we might want to check the ports as well, but this probably requires superuser privileges
2425
+ # (for netstat).
2426
+
2427
+ if hostname not in ["localhost", "127.0.0.1", "::1"] or not found_pg:
2428
+ warnings.warn(
2429
+ "It seems you are connecting to a remote Postgres instance. "
2430
+ "PostBOUND cannot infer the hinting backend for such connections. "
2431
+ "We assume that the this server has pg_hint_plan enabled. "
2432
+ "Please set the backend property to pg_lab manually if you are using pg_lab."
2433
+ )
2434
+ self._backend = "pg_hint_plan"
2435
+ self._inactive = False
2436
+ return
2437
+
2438
+ lib_ext = "dylib" if sys.platform == "darwin" else "so"
2439
+ active_extensions = util.system.open_files(backend_pid)
2440
+ if any(ext.endswith(f"pg_lab.{lib_ext}") for ext in active_extensions):
2441
+ util.logging.print_if(
2442
+ self._postgres_db.debug, "Using pg_lab hinting backend", file=sys.stderr
2443
+ )
2444
+ self._inactive = False
2445
+ self._backend = "pg_lab"
2446
+ elif any(ext.endswith(f"pg_hint_plan.{lib_ext}") for ext in active_extensions):
2447
+ util.logging.print_if(
2448
+ self._postgres_db.debug,
2449
+ "Using pg_hint_plan hinting backend",
2450
+ file=sys.stderr,
2451
+ )
2452
+ self._inactive = False
2453
+ self._backend = "pg_hint_plan"
2454
+ else:
2455
+ warnings.warn(
2456
+ "No supported hinting backend found. "
2457
+ "Please ensure that either pg_hint_plan or pg_lab is available in your Postgres instance."
2458
+ )
2459
+ self._inactive = True
2460
+ self._backend = "none"
2461
+
2462
+ def _assert_active_backend(self) -> None:
2463
+ """Ensures that a proper hinting backend is available.
2464
+
2465
+ Raises
2466
+ ------
2467
+ ValueError
2468
+ If no backend is available.
2469
+ """
2470
+ if self._inactive:
2471
+ connection_pid = self._postgres_db._connection.info.backend_pid
2472
+ raise ValueError(
2473
+ f"No supported hinting backend found for backend with PID {connection_pid}"
2474
+ )
2475
+
2476
+ def __repr__(self) -> str:
2477
+ return f"PostgresHintService(db={self._postgres_db} backend={self._backend})"
2478
+
2479
+ def __str__(self) -> str:
2480
+ return repr(self)
2481
+
2482
+
2483
+ class PostgresOptimizer(OptimizerInterface):
2484
+ """Optimizer introspection for Postgres.
2485
+
2486
+ Parameters
2487
+ ----------
2488
+ postgres_instance : PostgresInterface
2489
+ The database whose optimizer should be introspected
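+
+ Examples
+ --------
+ A minimal sketch, assuming a connected `PostgresInterface` called ``pg`` and a parsed `SqlQuery` called ``query``::
+
+ optimizer = PostgresOptimizer(pg)
+ plan = optimizer.query_plan(query)
+ estimated_rows = optimizer.cardinality_estimate(query)
+ estimated_cost = optimizer.cost_estimate(query)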
2490
+ """
2491
+
2492
+ def __init__(self, postgres_instance: PostgresInterface) -> None:
2493
+ self._pg_instance = postgres_instance
2494
+
2495
+ def query_plan(self, query: SqlQuery | str) -> QueryPlan:
2496
+ if isinstance(query, SqlQuery):
2497
+ query = transform.as_explain(query)
2498
+ query = self._pg_instance._hinting_backend.format_query(query)
2499
+ else:
2500
+ query = self._explainify(query)
2501
+ raw_query_plan: list = self._pg_instance.execute_query(
2502
+ query, cache_enabled=False
2503
+ )
2504
+ query_plan = PostgresExplainPlan(raw_query_plan[0])
2505
+ return query_plan.as_qep()
2506
+
2507
+ def analyze_plan(
2508
+ self, query: SqlQuery, *, timeout: Optional[float] = None
2509
+ ) -> Optional[QueryPlan]:
2510
+ query = transform.as_explain_analyze(query)
2511
+
2512
+ try:
2513
+ raw_query_plan: dict = self._pg_instance.execute_query(
2514
+ query, cache_enabled=False, raw=True, timeout=timeout
2515
+ )[0]
2516
+ except TimeoutError:
2517
+ return None
2518
+
2519
+ query_plan = PostgresExplainPlan(raw_query_plan)
2520
+ return query_plan.as_qep()
2521
+
2522
+ def cardinality_estimate(self, query: SqlQuery | str) -> Cardinality:
2523
+ if isinstance(query, SqlQuery):
2524
+ query = transform.as_explain(query)
2525
+ query = self._pg_instance._hinting_backend.format_query(query)
2526
+ else:
2527
+ query = self._explainify(query)
2528
+ query_plan = self._pg_instance.execute_query(query, cache_enabled=False)
2529
+ estimate: int = query_plan[0]["Plan"]["Plan Rows"]
2530
+ return Cardinality(estimate)
2531
+
2532
+ def cost_estimate(self, query: SqlQuery | str) -> float:
2533
+ if isinstance(query, SqlQuery):
2534
+ query = transform.as_explain(query)
2535
+ query = self._pg_instance._hinting_backend.format_query(query)
2536
+ else:
2537
+ query = self._explainify(query)
2538
+ query_plan = self._pg_instance.execute_query(query, cache_enabled=False)
2539
+ estimate: float = query_plan[0]["Plan"]["Total Cost"]
2540
+ return estimate
2541
+
2542
+ def configure_operator(self, operator: PhysicalOperator, *, enabled: bool) -> None:
2543
+ """Enables or disables a specific physical operator for the current Postgres connection.
2544
+
2545
+ Parameters
2546
+ ----------
2547
+ operator : PhysicalOperator
2548
+ The operator to configure.
2549
+ enabled : bool
2550
+ Whether the operator should be allowed or not.
2551
+
2552
+ References
2553
+ ----------
2554
+ https://www.postgresql.org/docs/current/runtime-config-query.html
2555
+ """
2556
+ setting_name = PostgresOptimizerSettings.get(operator)
2557
+ if not setting_name:
2558
+ raise ValueError(
2559
+ f"Cannot configure operator {operator} as it is not supported by Postgres"
2560
+ )
2561
+ status = "on" if enabled else "off"
2562
+ self._pg_instance.cursor().execute(f"SET {setting_name} TO {status}")
2563
+
2564
+ def _explainify(self, query: str) -> str:
2565
+ if not query.upper().startswith("EXPLAIN (FORMAT JSON)"):
2566
+ query = f"EXPLAIN (FORMAT JSON) {query}"
2567
+ return query
2568
+
2569
+
2570
+ def _reconnect(name: str, *, pool: DatabasePool) -> PostgresInterface:
2571
+ """Fetches a connection from the database pool.
2572
+
2573
+ If the connection is in a bad state (e.g. because the user called close() before), it is re-established.
2574
+
2575
+ Parameters
2576
+ ----------
2577
+ name : str
2578
+ The name of the database connection in the pool.
2579
+ pool : DatabasePool
2580
+ The current pool.
2581
+ """
2582
+ current_instance: PostgresInterface = pool.retrieve_database(name)
2583
+
2584
+ status = current_instance._connection.info.status
2585
+ if status != psycopg.pq.ConnStatus.OK:
2586
+ # Actually there are a lot of other ConnStatus values beyond OK and Bad
2587
+ # We could handle them explicitly here, or we might just define anything that is not OK as bad.
2588
+ # The latter seems much simpler so let's just do this for now.
2589
+ current_instance.reset_connection()
2590
+
2591
+ return current_instance
2592
+
2593
+
2594
+ def connect(
2595
+ *,
2596
+ name: str = "postgres",
2597
+ application_name: str = "",
2598
+ connect_string: str | None = None,
2599
+ config_file: str | Path | None = ".psycopg_connection",
2600
+ encoding: str = "UTF8",
2601
+ cache_enabled: bool = False,
2602
+ refresh: bool = False,
2603
+ private: bool = False,
2604
+ debug: bool = False,
2605
+ ) -> PostgresInterface:
2606
+ """Convenience function to seamlessly connect to a Postgres instance.
2607
+
2608
+ This function obtains a connect-string to the database according to the following rules:
2609
+
2610
+ 1. if the connect-string is supplied directly via the `connect_string` parameter, this is used
2611
+ 2. if the connect-string is not supplied, it is read from the file indicated by `config_file`. This file has to be located
2612
+ in the current working directory, or the file name has to describe the path to that file.
2613
+ 3. if the `config_file` does not exist, an error is raised
2614
+
2615
+ After a connection to the Postgres instance has been obtained, it is registered automatically on the current
2616
+ `DatabasePool` instance. This can be changed via the `private` parameter.
2617
+
2618
+ Parameters
2619
+ ----------
2620
+ name : str, optional
2621
+ A name to identify the current connection if multiple connections to different Postgres instances should be maintained.
2622
+ This is used to register the instance on the `DatabasePool`. Defaults to *postgres*.
2623
+ application_name : str, optional
2624
+ Identifier for the Postgres server. This will be the name that is shown in the server logs and process lists.
2625
+ connect_string : str | None, optional
2626
+ A Psycopg-compatible connect string for the database. Supplying this parameter overwrites any other connection
2627
+ data
2628
+ config_file : str | Path | None, optional
2629
+ A file containing a Psycopg-compatible connect string for the database. This is the default and preferred method of
2630
+ connecting to a Postgres database. Defaults to *.psycopg_connection*
2631
+ encoding : str, optional
2632
+ The client encoding of the connection. Defaults to *UTF8*.
2633
+ cache_enabled : bool, optional
2634
+ Controls the default caching behaviour of the Postgres instance. Caching of general queries is disabled by default,
2635
+ whereas queries from the statistics interface are cached by default.
2636
+ refresh : bool, optional
2637
+ If true, a new connection to the database will always be established, even if a connection to the same database is
2638
+ already pooled. The registration key will be suffixed to prevent collisions. By default, the current connection is
2639
+ re-used. If that is the case, no further information (e.g. config strings) is read and only the `name` is accessed.
2640
+ private : bool, optional
2641
+ If true, skips registration of the new instance on the `DatabasePool`. Registration is performed by default.
2642
+
2643
+ Returns
2644
+ -------
2645
+ PostgresInterface
2646
+ The Postgres database object
2647
+
2648
+ Raises
2649
+ ------
2650
+ ValueError
2651
+ If neither a config file nor a connect string was given, or if the connect file should be used but does not exist
2652
+
2653
+ References
2654
+ ----------
2655
+
2656
+ .. Psycopg v3: https://www.psycopg.org/psycopg3/ This is used internally by the Postgres interface to interact with the
2657
+ database
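+
+ Examples
+ --------
+ A minimal sketch (the connect string is purely illustrative)::
+
+ pg = connect()  # reads .psycopg_connection from the working directory
+ pg = connect(name="imdb", connect_string="dbname=imdb user=postgres host=localhost")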
2658
+ """
2659
+ db_pool = DatabasePool.get_instance()
2660
+ if name in db_pool and not refresh:
2661
+ return _reconnect(name, pool=db_pool)
2662
+
2663
+ if config_file and not connect_string:
2664
+ config_file = Path(config_file)
2665
+ if not config_file.is_file():
2666
+ wdir = os.getcwd()
2667
+ raise ValueError(
2668
+ f"Failed to obtain a database connection. Tried to read the config file '{config_file}' from "
2669
+ f"your current working directory, but the file was not found. Your working directory is {wdir}. "
2670
+ "Please either supply the connect string directly to the connect() method, or ensure that the "
2671
+ "config file exists."
2672
+ )
2673
+ with open(config_file, "r") as f:
2674
+ connect_string = f.readline().strip()
2675
+ elif not connect_string:
2676
+ raise ValueError(
2677
+ "Failed to obtain a database connection. Please either supply the connect string directly to the "
2678
+ "connect() method, or put a configuration file in your working directory. See the documentation of "
2679
+ "the connect() method for more details."
2680
+ )
2681
+
2682
+ postgres_db = PostgresInterface(
2683
+ connect_string,
2684
+ system_name=name,
2685
+ client_encoding=encoding,
2686
+ cache_enabled=cache_enabled,
2687
+ debug=debug,
2688
+ )
2689
+ if not private:
2690
+ orig_name = name
2691
+ instance_idx = 2
2692
+ while name in db_pool:
2693
+ name = f"{orig_name} - {instance_idx}"
2694
+ instance_idx += 1
2695
+ db_pool.register_database(name, postgres_db)
2696
+ return postgres_db
2697
+
2698
+
2699
+ def start(pgdata: str | Path = "", *, logfile: str | Path = "") -> None:
2700
+ """Starts a local Postgres server.
2701
+
2702
+ This function assumes that *pg_ctl* is available on the system PATH and either the server's data directory is specified
2703
+ explicitly, or set via the *PGDATA* environment variable.
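+
+ Examples
+ --------
+ A minimal sketch (the data directory path is purely illustrative)::
+
+ start("/path/to/pgdata", logfile="/path/to/postgres.log")
+ assert is_running("/path/to/pgdata")
+ stop("/path/to/pgdata")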
2704
+ """
2705
+ if os.system("which pg_ctl") != 0:
2706
+ raise ValueError("Cannot start Postgres server: pg_ctl is not on PATH")
2707
+
2708
+ pgdata = pgdata or os.environ.get("PGDATA", "")
2709
+ pgdata = Path(pgdata).expanduser()
2710
+ if not pgdata:
2711
+ raise ValueError(
2712
+ "Cannot start Postgres server: Must either supply pgdata argument or set PGDATA environment variable"
2713
+ )
2714
+
2715
+ args = ["pg_ctl", "-D", pgdata]
2716
+ if logfile:
2717
+ args.extend(["-l", logfile])
2718
+ args.append("start")
2719
+
2720
+ subprocess.run(args, check=True)
2721
+
2722
+
2723
+ def stop(pgdata: str | Path = "", *, raise_on_error: bool = False) -> None:
2724
+ """Stops a running (local) Postgres server.
2725
+
2726
+ This function assumes that *pg_ctl* is available on the system PATH and either the server's data directory is specified
2727
+ explicitly, or set via the *PGDATA* environment variable.
2728
+
2729
+ If the server cannot be stopped due to whatever reason, an error can be raised by setting the corresponding parameter.
2730
+ Otherwise, it is silently ignored.
2731
+ """
2732
+ if os.system("which pg_ctl") != 0:
2733
+ raise ValueError("Cannot stop Postgres server: pg_ctl is not on PATH")
2734
+
2735
+ pgdata = pgdata or os.environ.get("PGDATA", "")
2736
+ pgdata = Path(pgdata).expanduser()
2737
+ if not pgdata:
2738
+ raise ValueError(
2739
+ "Cannot stop Postgres server: Must either supply pgdata argument or set PGDATA environment variable"
2740
+ )
2741
+
2742
+ subprocess.run(["pg_ctl", "-D", pgdata, "stop"], check=raise_on_error)
2743
+
2744
+
2745
+ def is_running(pgdata: str | Path = "") -> bool:
2746
+ """Checks, whether a local Postgres server is currently running.
2747
+
2748
+ This function assumes that *pg_ctl* is available on the system PATH. A data directory can be supplied to check whether
2749
+ a server is running for the specific database. If *pgdata* is not supplied, the *PGDATA* environment variable is used as
2750
+ a fallback.
2751
+ """
2752
+ if os.system("which pg_ctl") != 0:
2753
+ raise ValueError("Cannot start Postgres server: pg_ctl is not on PATH")
2754
+
2755
+ cmd = ["pg_ctl"]
2756
+ pgdata = pgdata or os.environ.get("PGDATA", "")
2757
+ if pgdata:
2758
+ cmd.extend(["-D", pgdata])
2759
+ cmd.append("status")
2760
+
2761
+ res = subprocess.run(cmd)
2762
+ return res.returncode == 0
2763
+
2764
+
2765
+ def _parallel_query_initializer(
2766
+ connect_string: str, local_data: threading.local, verbose: bool = False
2767
+ ) -> None:
2768
+ """Internal function for the `ParallelQueryExecutor` to setup worker connections.
2769
+
2770
+ Parameters
2771
+ ----------
2772
+ connect_string : str
2773
+ Connection info to establish a network connection to the Postgres instance. Delegates to Psycopg
2774
+ local_data : threading.local
2775
+ Data object to store the opened connection
2776
+ verbose : bool, optional
2777
+ Whether to print logging information, by default *False*
2778
+
2779
+ References
2780
+ ----------
2781
+
2782
+ .. Psycopg v3: https://www.psycopg.org/psycopg3/ This is used internally by the Postgres interface to interact with the
2783
+ database
2784
+ """
2785
+ log = util.make_logger(verbose)
2786
+ tid = threading.get_ident()
2787
+ connection = psycopg.connect(
2788
+ connect_string, application_name=f"PostBOUND parallel worker ID {tid}"
2789
+ )
2790
+ connection.autocommit = True
2791
+ local_data.connection = connection
2792
+ log(f"[worker id={tid}, ts={util.timestamp()}] Connected")
2793
+
2794
+
2795
+ def _parallel_query_worker(
2796
+ query: str | SqlQuery,
2797
+ local_data: threading.local,
2798
+ timeout: Optional[int] = None,
2799
+ verbose: bool = False,
2800
+ ) -> tuple[SqlQuery | str, Any]:
2801
+ """Internal function for the `ParallelQueryExecutor` to run individual queries.
2802
+
2803
+ Parameters
2804
+ ----------
2805
+ query : str | SqlQuery
2806
+ The query to execute. The parallel executor does not make use of caching whatsoever, so no additional parameters are
2807
+ required.
2808
+ local_data : threading.local
2809
+ Data object that contains the database connection to use. This should have been initialized by
2810
+ `_parallel_query_initializer`
2811
+ timeout : Optional[int], optional
2812
+ The number of seconds to wait until the calculation is aborted. Defaults to *None*, which indicates no timeout. In
2813
+ case of timeout, *None* is returned.
2814
+ verbose : bool, optional
2815
+ Whether to print logging information, by default *False*
2816
+
2817
+ Returns
2818
+ -------
2819
+ tuple[SqlQuery | str, Any]
2820
+ A tuple of the original query and the (simplified) result set. See `Database.execute_query` for an outline of the
2821
+ simplification process. This method applies the same rules. The query is also provided to distinguish the different
2822
+ result sets that arrive in parallel.
2823
+ """
2824
+ log = util.make_logger(verbose)
2825
+ connection: psycopg.connection.Connection = local_data.connection
2826
+ connection.rollback()
2827
+ cursor = connection.cursor()
2828
+ if timeout:
2829
+ cursor.execute(f"SET statement_timeout = '{timeout}s';")
2830
+
2831
+ log(
2832
+ f"[worker id={threading.get_ident()}, ts={util.timestamp()}] Now executing query {query}"
2833
+ )
2834
+ try:
2835
+ cursor.execute(str(query))
2836
+ log(
2837
+ f"[worker id={threading.get_ident()}, ts={util.timestamp()}] Executed query {query}"
2838
+ )
2839
+ except psycopg.errors.QueryCanceled as e:
2840
+ if "canceling statement due to statement timeout" in e.args:
2841
+ log(
2842
+ f"[worker id={threading.get_ident()}, ts={util.timestamp()}] Query {query} timed out"
2843
+ )
2844
+ return query, None
2845
+ else:
2846
+ raise e
2847
+
2848
+ result_set = cursor.fetchall()
2849
+ cursor.close()
2850
+
2851
+ return query, result_set
2852
+
2853
+
2854
+ class ParallelQueryExecutor:
2855
+ """The ParallelQueryExecutor provides mechanisms to conveniently execute queries in parallel.
2856
+
2857
+ The parallel execution happens by maintaining a number of worker threads that execute the incoming queries.
2858
+ The number of input queries can exceed the worker pool size, potentially by a large margin. If that is the case,
2859
+ input queries will be buffered until a worker is available.
2860
+
2861
+ This parallel executor has nothing to do with the Database interface and acts entirely independently and
2862
+ Postgres-specific.
2863
+
2864
+ Parameters
2865
+ ----------
2866
+ connect_string : str
2867
+ Connection info to establish a network connection to the Postgres instance. Delegates to Psycopg
2868
+ n_threads : Optional[int], optional
2869
+ The maximum number of parallel workers to use. If this is not specified, uses ``os.cpu_count()`` many workers.
2870
+ timeout : Optional[int], optional
2871
+ The number of seconds to wait until an individual query is aborted. Timeouts do not affect other queries (neither those
2872
+ running in parallel nor those running afterwards on the same worker). In case of a timeout, the query's entry in the
2873
+ result set will be *None*.
2874
+ verbose : bool, optional
2875
+ Whether to print logging information during the query execution. This is off by default.
2876
+
2877
+ See Also
2878
+ --------
2879
+ Database
2880
+ PostgresInterface
2881
+
2882
+ References
2883
+ ----------
2884
+
2885
+ .. Psycopg v3: https://www.psycopg.org/psycopg3/ This is used internally by the Postgres interface to interact with the
2886
+ database
2887
+ """
2888
+
2889
+ def __init__(
2890
+ self,
2891
+ connect_string: str,
2892
+ n_threads: Optional[int] = None,
2893
+ *,
2894
+ timeout: Optional[int] = None,
2895
+ verbose: bool = False,
2896
+ ) -> None:
2897
+ self._n_threads = (
2898
+ n_threads if n_threads is not None and n_threads > 0 else os.cpu_count()
2899
+ )
2900
+ self._connect_string = connect_string
2901
+ self._timeout = timeout
2902
+ self._verbose = verbose
2903
+
2904
+ self._thread_data = threading.local()
2905
+ self._thread_pool = concurrent.futures.ThreadPoolExecutor(
2906
+ max_workers=self._n_threads,
2907
+ initializer=_parallel_query_initializer,
2908
+ initargs=(
2909
+ self._connect_string,
2910
+ self._thread_data,
2911
+ ),
2912
+ )
2913
+ self._tasks: list[concurrent.futures.Future] = []
2914
+ self._results: list[Any] = []
2915
+ self._queries: dict[concurrent.futures.Future, SqlQuery | str] = {}
2916
+
2917
+ def queue_query(self, query: SqlQuery | str) -> None:
2918
+ """Adds a new query to the queue, to be executed as soon as possible.
2919
+
2920
+ If a timeout was specified when creating the executor, this timeout will be applied to the query.
2921
+
2922
+ Parameters
2923
+ ----------
2924
+ query : SqlQuery | str
2925
+ The query to execute
2926
+ """
2927
+ future = self._thread_pool.submit(
2928
+ _parallel_query_worker,
2929
+ query,
2930
+ self._thread_data,
2931
+ self._timeout,
2932
+ self._verbose,
2933
+ )
2934
+ self._tasks.append(future)
2935
+ self._queries[future] = query
2936
+
2937
+ def drain_queue(
2938
+ self,
2939
+ timeout: Optional[float] = None,
2940
+ *,
2941
+ callback: Optional[Callable[[SqlQuery | str, ResultSet | None], None]] = None,
2942
+ ) -> None:
2943
+ """Blocks, until all queries currently queued have terminated.
2944
+
2945
+ Parameters
2946
+ ----------
2947
+ timeout : Optional[float], optional
2948
+ The number of seconds to wait until the calculation is aborted. Defaults to *None*, which indicates no timeout,
2949
+ i.e. wait forever. Note that in contrast to the timeout specified when creating the executor, this timeout
2950
+ applies to the entire queue and not to individual queries. For example, one can set the per-query timeout to 1s
2951
+ which means that each query can be executed for at most 1 second. If an additional timeout of 10s is specified
2952
+ on the queue, the entire queue will be aborted if it takes longer than 10 seconds to complete.
2953
+ callback : Optional[Callable[[SqlQuery | str, ResultSet | None], None]], optional
2954
+ A callback to be executed with each query that completes. The callback receives the query that was executed and
2955
+ the corresponding (raw) result set as arguments. If the query ran into a timeout, the result set is *None*.
2956
+
2957
+ Raises
2958
+ ------
2959
+ TimeoutError or concurrent.futures.TimeoutError
2960
+ If some queries have not completed after the given `timeout`.
2961
+ """
2962
+ for future in concurrent.futures.as_completed(self._tasks, timeout=timeout):
2963
+ result_set = future.result()
2964
+ self._results.append(result_set)
2965
+
2966
+ if not callback:
2967
+ continue
2968
+
2969
+ query = self._queries[future]
2970
+ callback(query, result_set)
2971
+
2972
+ def result_set(self) -> dict[str | SqlQuery, ResultSet | None]:
2973
+ """Provides the results of all queries that have terminated already, mapping query -> result set
2974
+
2975
+ Returns
2976
+ -------
2977
+ dict[str | SqlQuery, ResultSet | None]
2978
+ The query results. The raw result sets are provided without any simplification. If the query timed out, the result
2979
+ set is *None* (in contrast to empty result sets like `[]`).
2980
+ """
2981
+ return dict(self._results)
2982
+
2983
+ def close(self) -> None:
2984
+ """Terminates all worker threads. The executor is essentially useless afterwards."""
2985
+ self._thread_pool.shutdown()
2986
+
2987
+ def __repr__(self) -> str:
2988
+ return str(self)
2989
+
2990
+ def __str__(self) -> str:
2991
+ running_workers = [future for future in self._tasks if future.running()]
2992
+ completed_workers = [future for future in self._tasks if future.done()]
2993
+
2994
+ return (
2995
+ f"Concurrent query pool of {self._n_threads} workers, {len(self._tasks)} tasks "
2996
+ f"(run={len(running_workers)} fin={len(completed_workers)})"
2997
+ )
2998
+
2999
+
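The intended workflow of the executor is queue, drain, collect. The following minimal sketch illustrates this; the connect
string and queries are placeholders and not part of the module:

    from postbound.db.postgres import ParallelQueryExecutor

    # Placeholder connection info -- adjust to the actual Postgres instance.
    executor = ParallelQueryExecutor("dbname=imdb user=postbound", n_threads=4, timeout=60)

    for query in ("SELECT COUNT(*) FROM title", "SELECT MIN(production_year) FROM title"):
        executor.queue_query(query)

    # Block until every queued query has finished (or hit its per-query timeout).
    executor.drain_queue(callback=lambda q, res: print(f"finished: {q}"))

    results = executor.result_set()  # maps each query to its raw result set, None on timeout
    executor.close()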
3000
+ def _timeout_query_worker(
3001
+ query: SqlQuery | str,
3002
+ *,
3003
+ pg_config: dict,
3004
+ result_send: mp_conn.Connection,
3005
+ err_send: mp_conn.Connection,
3006
+ backend_send: mp_conn.Connection,
3007
+ **kwargs,
3008
+ ) -> None:
3009
+ """Internal function to the `TimeoutQueryExecutor` to run individual queries.
3010
+
3011
+ Query results are sent via the `result_send` pipe, not as a return value. In case of any errors, these are sent via the
3012
+ `err_send` pipe. Therefore, it is best to check the `err_send` pipe first, before reading from the `result_send` pipe.
3013
+
3014
+ Parameters
3015
+ ----------
3016
+ query : SqlQuery | str
3017
+ Query to execute
3018
+ pg_config : dict
3019
+ Picklable representation of the current Postgres connection. This is used to re-establish the connection in the parallel
3020
+ worker.
3021
+ result_send : mp_conn.Connection
3022
+ Pipe connection to send the query result
3023
+ err_send : mp_conn.Connection
3024
+ Pipe connection to send any errors that occurred during the query execution
3025
+ backend_send : mp_conn.Connection
3026
+ Pipe connection to send the backend PID
3027
+ kwargs : Any
3028
+ Additional parameters to pass to the `PostgresInterface.execute_query` method.
3029
+ """
3030
+ try:
3031
+ connect_string = pg_config["connect_string"]
3032
+ cache_enabled = pg_config.get("cache_enabled", False)
3033
+ pg_instance = PostgresInterface(
3034
+ connect_string,
3035
+ application_name="PostBOUND Timeout Worker",
3036
+ cache_enabled=cache_enabled,
3037
+ )
3038
+ backend_send.send(pg_instance.backend_pid())
3039
+ pg_instance.apply_configuration(pg_config["config"])
3040
+
3041
+ result = pg_instance.execute_query(query, **kwargs)
3042
+ runtime = pg_instance.last_query_runtime()
3043
+
3044
+ result_send.send({"query_result": result, "runtime": runtime})
3045
+ except Exception as e:
3046
+ err_send.send(e)
3047
+ finally:
3048
+ pg_instance.close()
3049
+
3050
+
3051
+ class TimeoutQueryExecutor:
3052
+ """The TimeoutQueryExecutor provides a mechanism to execute queries with a timeout attached.
3053
+
3054
+ If the query takes longer than the designated timeout, its execution is cancelled. The query execution itself is delegated
3055
+ to the `PostgresInterface`, so all its rules still apply. At the same time, using the timeout executor service can
3056
+ invalidate some of the state that is exposed by the database interface (see *Warnings* below). Therefore, the relevant
3057
+ variables should be refreshed once the timeout executor was used.
3058
+
3059
+ In addition to calling the `execute_query` method directly, the executor also implements *__call__* for more convenient
3060
+ access. Both methods accept the same parameters.
3061
+
3062
+ Parameters
3063
+ ----------
3064
+ postgres_instance : Optional[PostgresInterface], optional
3065
+ Database to execute the queries. If omitted, this is inferred from the `DatabasePool`.
3066
+
3067
+ Warnings
3068
+ --------
3069
+ When a query gets cancelled due to the timeout being reached, the current cursor as well as database connection might be
3070
+ refreshed. Any direct references to these instances should no longer be used.
3071
+ """
3072
+
3073
+ def __init__(self, postgres_instance: Optional[PostgresInterface] = None) -> None:
3074
+ self._pg_instance = (
3075
+ postgres_instance
3076
+ if postgres_instance is not None
3077
+ else DatabasePool.get_instance().current_database()
3078
+ )
3079
+ self._timeout_watchdog = psycopg.connect(
3080
+ self._pg_instance.connect_string,
3081
+ application_name="PostBOUND Timeout Watchdog",
3082
+ )
3083
+
3084
+ def execute_query(self, query: SqlQuery | str, timeout: float, **kwargs) -> Any:
3085
+ """Runs a query on the database connection, cancelling if it takes longer than a specific timeout.
3086
+
3087
+ Parameters
3088
+ ----------
3089
+ query : SqlQuery | str
3090
+ Query to execute
3091
+ timeout : float
3092
+ Maximum query execution time in seconds.
3093
+ **kwargs
3094
+ Additional parameters to pass to the `PostgresInterface.execute_query` method.
3095
+
3096
+ Returns
3097
+ -------
3098
+ Any
3099
+ The query result if it terminated timely. Rules from `PostgresInterface.execute_query` apply.
3100
+
3101
+ Raises
3102
+ ------
3103
+ TimeoutError
3104
+ If the query execution was not finished after `timeout` seconds.
3105
+
3106
+ See Also
3107
+ --------
3108
+ PostgresInterface.execute_query
3109
+ PostgresInterface.reset_connection
3110
+ """
3111
+ result_recv, result_send = mp.Pipe(False)
3112
+ error_recv, error_send = mp.Pipe(False)
3113
+ backend_recv, backend_send = mp.Pipe(False)
3114
+ query_worker = mp.Process(
3115
+ target=_timeout_query_worker,
3116
+ args=(query,),
3117
+ kwargs={
3118
+ "pg_config": self._pg_fingerprint(),
3119
+ "result_send": result_send,
3120
+ "err_send": error_send,
3121
+ "backend_send": backend_send,
3122
+ **kwargs,
3123
+ },
3124
+ )
3125
+
3126
+ query_worker.start()
3127
+ query_worker.join(timeout)
3128
+
3129
+ # We perform the timeout check before doing anything else to make sure that the worker process cannot terminate
3130
+ # immediately after the timeout has been reached. E.g., suppose that the query is still running after calling join().
3131
+ # If we proceeded to check the error pipe first, the query would have more time to terminate while we are performing
3132
+ # our error checks. This might result in an involuntary increase in the timeout duration. By keeping the timeout check
3133
+ # as close to the join() call as possible, we minimize this risk.
3134
+ timed_out = query_worker.is_alive()
3135
+ query_worker.terminate()
3136
+ query_worker.join()
3137
+
3138
+ # Now that we know whether the worker timed out or not, we need to make sure that it actually terminated properly
3139
+ # (or timed out). In case of an error, we just propagate it to the client.
3140
+ if error_recv.poll():
3141
+ self._pg_instance._last_query_runtime = math.nan
3142
+ self._abort_backend(backend_recv.recv())
3143
+ err = error_recv.recv()
3144
+
3145
+ query_worker.close()
3146
+ result_send.close()
3147
+ result_recv.close()
3148
+ error_send.close()
3149
+ error_recv.close()
3150
+
3151
+ raise err
3152
+
3153
+ # At this point we know that the worker either terminated in time or that it timed out, but it did not error.
3154
+ # Both the timeout and the termination case can be handled in a pretty straightforward manner.
3155
+ if timed_out:
3156
+ self._abort_backend(backend_recv.recv())
3157
+ query_result = None
3158
+ self._pg_instance._last_query_runtime = timeout
3159
+ else:
3160
+ raw_result = result_recv.recv()
3161
+ query_result = raw_result["query_result"]
3162
+ self._pg_instance._last_query_runtime = raw_result["runtime"]
3163
+
3164
+ query_worker.close()
3165
+ result_send.close()
3166
+ result_recv.close()
3167
+ error_send.close()
3168
+ error_recv.close()
3169
+
3170
+ if timed_out:
3171
+ raise TimeoutError(query)
3172
+ else:
3173
+ return query_result
3174
+
3175
+ def _pg_fingerprint(self) -> dict:
3176
+ """Generate a pickable representation of the current Postgres connection."""
3177
+ return {
3178
+ "connect_string": self._pg_instance.connect_string,
3179
+ "cache_enabled": self._pg_instance.cache_enabled,
3180
+ "config": self._pg_instance.current_configuration(
3181
+ runtime_changeable_only=True
3182
+ ),
3183
+ }
3184
+
3185
+ def _abort_backend(self, pid: int) -> None:
3186
+ with self._timeout_watchdog.cursor() as cursor:
3187
+ cursor.execute(f"SELECT pg_cancel_backend({pid});")
3188
+ self._timeout_watchdog.rollback()
3189
+
3190
+ def __call__(self, query: SqlQuery | str, timeout: float, **kwargs) -> Any:
3191
+ return self.execute_query(query, timeout, **kwargs)
3192
+
3193
+
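For illustration, the executor can be driven as sketched below. The connect string and query are placeholders, and
constructing the `PostgresInterface` directly from a connect string simply mirrors what `_timeout_query_worker` does above:

    from postbound.db.postgres import PostgresInterface, TimeoutQueryExecutor

    pg = PostgresInterface("dbname=imdb user=postbound")  # placeholder connect string
    run_with_timeout = TimeoutQueryExecutor(pg)

    try:
        # __call__ delegates to execute_query; cancel the query after 30 seconds
        result = run_with_timeout("SELECT COUNT(*) FROM title t, cast_info ci WHERE t.id = ci.movie_id", 30.0)
    except TimeoutError:
        result = None  # the backend was cancelled via pg_cancel_backend()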
3194
+ PostgresExplainJoinNodes = {
3195
+ "Nested Loop": JoinOperator.NestedLoopJoin,
3196
+ "Hash Join": JoinOperator.HashJoin,
3197
+ "Merge Join": JoinOperator.SortMergeJoin,
3198
+ }
3199
+ """A mapping from Postgres EXPLAIN node names to the corresponding join operators."""
3200
+
3201
+ PostgresExplainScanNodes = {
3202
+ "Seq Scan": ScanOperator.SequentialScan,
3203
+ "Index Scan": ScanOperator.IndexScan,
3204
+ "Index Only Scan": ScanOperator.IndexOnlyScan,
3205
+ "Bitmap Heap Scan": ScanOperator.BitmapScan,
3206
+ }
3207
+ """A mapping from Postgres EXPLAIN node names to the corresponding scan operators."""
3208
+
3209
+ PostgresExplainIntermediateNodes = {
3210
+ "Materialize": IntermediateOperator.Materialize,
3211
+ "Memoize": IntermediateOperator.Memoize,
3212
+ "Sort": IntermediateOperator.Sort,
3213
+ }
3214
+ """A mapping from Postgres EXPLAIN node names to the corresponding intermediate operators."""
3215
+
3216
+
3217
+ class PostgresExplainNode:
3218
+ """Simplified model of a plan node as provided by Postgres' *EXPLAIN* output in JSON format.
3219
+
3220
+ Generally speaking, a node stores all the information about the plan node that we currently care about. This is mostly
3221
+ focused on optimizer statistics, along with some additional data. Explain nodes form a hierarchical structure with each
3222
+ node containing an arbitrary number of child nodes. Notice that this model is very loose in the sense that no constraints
3223
+ are enforced and no sanity checking is performed. For example, this means that nodes can contain more than two children
3224
+ even though this can never happen in a real *EXPLAIN* plan. Similarly, the correspondence between filter predicates and
3225
+ the node type (e.g. join filter for a join node) is not checked.
3226
+
3227
+ All relevant data from the explain node is exposed as attributes on the objects. Even though these are mutable, they should
3228
+ be thought of as read-only data objects.
3229
+
3230
+ Parameters
3231
+ ----------
3232
+ explain_data : dict
3233
+ The JSON data of the current explain node. This is parsed and prepared as part of the *__init__* method.
3234
+
3235
+ Attributes
3236
+ ----------
3237
+ node_type : str | None, default None
3238
+ The node type. This should never be empty or *None*, even though it is technically allowed.
3239
+ cost : float, default NaN
3240
+ The optimizer's cost estimation for this node. This includes the cost of all child nodes as well. This should normally
3241
+ not be *NaN*, even though it is technically allowed.
3242
+ cardinality_estimate : float, default NaN
3243
+ The optimizer's estimation of the number of tuples that will be *produced* by this operator. This should normally not
3244
+ be *NaN*, even though it is technically allowed.
3245
+ execution_time : float, default NaN
3246
+ For *EXPLAIN ANALYZE* plans, this is the actual total execution time of the node in seconds. For pure *EXPLAIN*
3247
+ plans, this is *NaN*
3248
+ true_cardinality : float, default NaN
3249
+ For *EXPLAIN ANALYZE* plans, this is the average of the number of tuples that were actually produced for each loop of
3250
+ the node. For pure *EXPLAIN* plans, this is *NaN*
3251
+ loops : int, default 1
3252
+ For *EXPLAIN ANALYZE* plans, this is the number of times the operator was invoked. The number of invocations can mean
3253
+ a number of different things: for parallel operators, this normally matches the number of parallel workers. For scans,
3254
+ this matches the number of times a new tuple was requested (e.g. for an index nested-loop join the number of loops of
3255
+ the index scan part indicates how many times the index was probed).
3256
+ relation_name : str | None, default None
3257
+ The name of the relation/table that is processed by this node. This should be defined on scan nodes, but could also
3258
+ be present on other nodes.
3259
+ relation_alias : str | None, default None
3260
+ The alias of the relation/table under which the relation was accessed in the query plan. See `relation_name`.
3261
+ index_name : str | None, default None
3262
+ The name of the index that was probed. This should be defined on index scans and index-only scans, but could also be
3263
+ present on other nodes.
3264
+ filter_condition : str | None, default None
3265
+ A post-processing filter that is applied to all rows emitted by this operator. This is most important for scan
3266
+ operations with an attached filter predicate, but can also be present on some joins.
3267
+ index_condition : str | None, default None
3268
+ The condition that is used to locate the matching tuples in an index scan or index-only scan
3269
+ join_filter : str | None, default None
3270
+ The condition that is used to determine matching tuples in a join
3271
+ hash_condition : str | None, default None
3272
+ The condition that is used to determine matching tuples in a hash join
3273
+ recheck_condition : str | None, default None
3274
+ For lossy bitmap scans or bitmap scans based on lossy indexes, this is post-processing check for whether the produced
3275
+ tuples actually match the filter condition
3276
+ parent_relationship : str | None, default None
3277
+ Describes the role that this node plays in relation to its parent. Common values are *inner* which denotes that
3278
+ this is the inner child of a join and *outer* which denotes the opposite.
3279
+ parallel_workers : int | float, default NaN
3280
+ For parallel operators in *EXPLAIN ANALYZE* plans, this is the actual number of worker processes that were started.
3281
+ Notice that in total there is one additional worker. This process takes care of spawning the other workers and
3282
+ managing them, but can also take part in the input processing.
3283
+ sort_keys : list[str]
3284
+ The columns that are used to sort the tuples that are produced by this node. This is most important for sort nodes,
3285
+ but can also be present on other nodes.
3286
+ shared_blocks_read : float, default NaN
3287
+ For *EXPLAIN ANALYZE* plans with *BUFFERS* enabled, this is the number of blocks/pages that were retrieved from
3288
+ disk while executing this node, including the reads of all its child nodes.
3289
+ shared_blocks_cached : float, default NaN
3290
+ For *EXPLAIN ANALYZE* plans with *BUFFERS* enabled, this is the number of blocks/pages that were retrieved from
3291
+ the shared buffer while executing this node, including the hits of all its child nodes.
3292
+ temp_blocks_read : float, default NaN
3293
+ For *EXPLAIN ANALYZE* plans with *BUFFERS* enabled, this is the number of blocks of short-term data structures (e.g. hash
3294
+ tables, sorts) that were read by this node, including reads of all its child nodes.
3295
+ temp_blocks_written : float, default NaN
3296
+ For *EXPLAIN ANALYZE* plans with *BUFFERS* enabled, this is the number of blocks of short-term data structures (e.g. hash
3297
+ tables, sorts) that were written by this node, including writes of all its child nodes.
3298
+ plan_width : float, default NaN
3299
+ The average width of the tuples that are produced by this node.
3300
+ children : list[PostgresExplainNode]
3301
+ All child / input nodes for the current node
3302
+ """
3303
+
3304
+ def __init__(self, explain_data: dict) -> None:
3305
+ self.node_type = explain_data.get("Node Type", None)
3306
+
3307
+ self.cost = explain_data.get("Total Cost", math.nan)
3308
+ self.cardinality_estimate = explain_data.get("Plan Rows", math.nan)
3309
+ self.execution_time = explain_data.get("Actual Total Time", math.nan) / 1000
3310
+
3311
+ # true_cardinality is accessed as a property to add a warning for BitmapAnd/Or nodes
3312
+ self._true_card = explain_data.get("Actual Rows", math.nan)
3313
+
3314
+ self.loops = explain_data.get("Actual Loops", 1)
3315
+
3316
+ self.relation_name = explain_data.get("Relation Name", None)
3317
+ self.relation_alias = explain_data.get("Alias", None)
3318
+ self.index_name = explain_data.get("Index Name", None)
3319
+ self.subplan_name = explain_data.get("Subplan Name", None)
3320
+ self.cte_name = explain_data.get("CTE Name", None)
3321
+
3322
+ self.filter_condition = explain_data.get("Filter", None)
3323
+ self.index_condition = explain_data.get("Index Cond", None)
3324
+ self.join_filter = explain_data.get("Join Filter", None)
3325
+ self.hash_condition = explain_data.get("Hash Cond", None)
3326
+ self.recheck_condition = explain_data.get("Recheck Cond", None)
3327
+
3328
+ self.parent_relationship = explain_data.get("Parent Relationship", None)
3329
+ self.parallel_workers = explain_data.get("Workers Launched", math.nan)
3330
+ if math.isnan(self.parallel_workers):
3331
+ self.parallel_workers = explain_data.get("Workers Planned", math.nan)
3332
+ self.sort_keys = explain_data.get("Sort Key", [])
3333
+
3334
+ self.shared_blocks_read = explain_data.get("Shared Read Blocks", math.nan)
3335
+ self.shared_blocks_cached = explain_data.get("Shared Hit Blocks", math.nan)
3336
+ self.temp_blocks_read = explain_data.get("Temp Read Blocks", math.nan)
3337
+ self.temp_blocks_written = explain_data.get("Temp Written Blocks", math.nan)
3338
+ self.plan_width = explain_data.get("Plan Width", math.nan)
3339
+
3340
+ self.children = [
3341
+ PostgresExplainNode(child) for child in explain_data.get("Plans", [])
3342
+ ]
3343
+
3344
+ self.explain_data = explain_data
3345
+ self._hash_val = hash(
3346
+ (
3347
+ self.node_type,
3348
+ self.relation_name,
3349
+ self.relation_alias,
3350
+ self.index_name,
3351
+ self.subplan_name,
3352
+ self.cte_name,
3353
+ self.filter_condition,
3354
+ self.index_condition,
3355
+ self.join_filter,
3356
+ self.hash_condition,
3357
+ self.recheck_condition,
3358
+ self.parent_relationship,
3359
+ self.parallel_workers,
3360
+ tuple(self.children),
3361
+ )
3362
+ )
3363
+
3364
+ @property
3365
+ def true_cardinality(self) -> float:
3366
+ if self.node_type in {"BitmapAnd", "BitmapOr"}:
3367
+ # For BitmapAnd/BitmapOr nodes, the actual number of rows is always 0.
3368
+ # This is due to limitations in the Postgres implementation.
3369
+ warnings.warn(
3370
+ "Postgres does not report the actual number of rows for bitmap nodes correctly. Returning NaN."
3371
+ )
3372
+ return math.nan
3373
+ return self._true_card
3374
+
3375
+ def is_scan(self) -> bool:
3376
+ """Checks, whether the current node corresponds to a scan node.
3377
+
3378
+ For Bitmap index scans, which are multi-level scan operators, this is true for the heap scan part that takes care of
3379
+ actually reading the tuples according to the bitmap provided by the bitmap index scan operators.
3380
+
3381
+ Returns
3382
+ -------
3383
+ bool
3384
+ Whether the node is a scan node
3385
+ """
3386
+ return self.node_type in PostgresExplainScanNodes
3387
+
3388
+ def is_join(self) -> bool:
3389
+ """Checks, whether the current node corresponds to a join node.
3390
+
3391
+ Returns
3392
+ -------
3393
+ bool
3394
+ Whether the node is a join node
3395
+ """
3396
+ return self.node_type in PostgresExplainJoinNodes
3397
+
3398
+ def is_analyze(self) -> bool:
3399
+ """Checks, whether this *EXPLAIN* plan is an *EXPLAIN ANALYZE* plan or a pure *EXPLAIN* plan.
3400
+
3401
+ The analyze variant does not only obtain the plan, but actually executes it. This enables the comparison of the
3402
+ optimizer's estimates to the actual values. If a plan is an *EXPLAIN ANALYZE* plan, some attributes of this node
3403
+ receive actual values. These include `execution_time`, `true_cardinality`, `loops` and `parallel_workers`.
3404
+
3405
+
3406
+ Returns
3407
+ -------
3408
+ bool
3409
+ Whether the node represents part of an *EXPLAIN ANALYZE* plan
3410
+ """
3411
+ return not math.isnan(self.execution_time)
3412
+
3413
+ def filter_conditions(self) -> dict[str, str]:
3414
+ """Collects all filter conditions that are defined on this node
3415
+
3416
+ Returns
3417
+ -------
3418
+ dict[str, str]
3419
+ A dictionary mapping the type of filter condition (e.g. index condition or join filter) to the actual filter value.
3420
+ """
3421
+ conditions: dict[str, str] = {}
3422
+ if self.filter_condition is not None:
3423
+ conditions["Filter"] = self.filter_condition
3424
+ if self.index_condition is not None:
3425
+ conditions["Index Cond"] = self.index_condition
3426
+ if self.join_filter is not None:
3427
+ conditions["Join Filter"] = self.join_filter
3428
+ if self.hash_condition is not None:
3429
+ conditions["Hash Cond"] = self.hash_condition
3430
+ if self.recheck_condition is not None:
3431
+ conditions["Recheck Cond"] = self.recheck_condition
3432
+ return conditions
3433
+
3434
+ def inner_outer_children(self) -> Sequence[PostgresExplainNode]:
3435
+ """Provides the children of this node in a sequence of inner, outer if applicable.
3436
+
3437
+ For all nodes where this structure is not meaningful (e.g. intermediate nodes that operate on a single relation or
3438
+ scan nodes), the child nodes are returned as-is (e.g. as a list of a single child or an empty list).
3439
+
3440
+ Returns
3441
+ -------
3442
+ Sequence[PostgresExplainNode]
3443
+ The children of the current node in a unified format
3444
+ """
3445
+ if len(self.children) < 2:
3446
+ return self.children
3447
+ assert len(self.children) == 2
3448
+
3449
+ first_child, second_child = self.children
3450
+ inner_child = (
3451
+ first_child if first_child.parent_relationship == "Inner" else second_child
3452
+ )
3453
+ outer_child = first_child if second_child == inner_child else second_child
3454
+ return (inner_child, outer_child)
3455
+
3456
+ def parse_table(self) -> Optional[TableReference]:
3457
+ """Provides the table that is processed by this node.
3458
+
3459
+ Returns
3460
+ -------
3461
+ Optional[TableReference]
3462
+ The table being scanned. For non-scan nodes, or nodes where no table can be inferred, *None* will be returned.
3463
+ """
3464
+ if not self.relation_name:
3465
+ return None
3466
+ alias = (
3467
+ self.relation_alias
3468
+ if self.relation_alias is not None
3469
+ and self.relation_alias != self.relation_name
3470
+ else ""
3471
+ )
3472
+ return TableReference(self.relation_name, alias)
3473
+
3474
+ def as_qep(self) -> QueryPlan:
3475
+ """Transforms the postgres-specific plan to a standardized `QueryPlan` instance.
3476
+
3477
+ Notice that this transformation is lossy since not all information from the Postgres plan can be represented in query
3478
+ execution plan instances. Furthermore, this transformation can be problematic for complicated queries that use
3479
+ special Postgres features. Most importantly, for queries involving subqueries, special node types and parent
3480
+ relationships can be contained in the plan that cannot be represented by other parts of PostBOUND. If this method
3481
+ and the resulting query execution plans should be used on complex workloads, it is advisable to check the plans twice
3482
+ before continuing.
3483
+
3484
+ Returns
3485
+ -------
3486
+ QueryPlan
3487
+ The equivalent query execution plan for this node
3488
+
3489
+ Raises
3490
+ ------
3491
+ ValueError
3492
+ If the node contains more than two children.
3493
+ """
3494
+ child_nodes = []
3495
+ inner_child, outer_child, subplan_child = None, None, None
3496
+ for child in self.children:
3497
+ parent_rel = child.parent_relationship
3498
+ qep_child = child.as_qep()
3499
+
3500
+ match parent_rel:
3501
+ case "Inner":
3502
+ inner_child = qep_child
3503
+ case "Outer":
3504
+ outer_child = qep_child
3505
+ case "SubPlan" | "InitPlan" | "Subquery":
3506
+ subplan_child = qep_child
3507
+ case "Member":
3508
+ child_nodes.append(qep_child)
3509
+ case _:
3510
+ raise ValueError(
3511
+ f"Unknown parent relationship '{parent_rel}' for child {child}"
3512
+ )
3513
+
3514
+ if inner_child and outer_child:
3515
+ child_nodes = [outer_child, inner_child] + child_nodes
3516
+ elif outer_child:
3517
+ child_nodes.insert(0, outer_child)
3518
+ elif inner_child:
3519
+ child_nodes.insert(0, inner_child)
3520
+
3521
+ table = self.parse_table()
3522
+ subplan_name = self.subplan_name or self.cte_name
3523
+ true_card = self.true_cardinality * self.loops
3524
+
3525
+ if self.is_scan():
3526
+ operator = PostgresExplainScanNodes.get(self.node_type, None)
3527
+ elif self.is_join():
3528
+ operator = PostgresExplainJoinNodes.get(self.node_type, None)
3529
+ else:
3530
+ operator = PostgresExplainIntermediateNodes.get(self.node_type, None)
3531
+
3532
+ sort_keys = (
3533
+ self._parse_sort_keys()
3534
+ if self.sort_keys
3535
+ else self._infer_sorting_from_children()
3536
+ )
3537
+ shared_hits = (
3538
+ None if math.isnan(self.shared_blocks_cached) else self.shared_blocks_cached
3539
+ )
3540
+ shared_misses = (
3541
+ None if math.isnan(self.shared_blocks_read) else self.shared_blocks_read
3542
+ )
3543
+ par_workers = (
3544
+ None if math.isnan(self.parallel_workers) else self.parallel_workers
3545
+ )
3546
+
3547
+ return QueryPlan(
3548
+ self.node_type,
3549
+ base_table=table,
3550
+ operator=operator,
3551
+ children=child_nodes,
3552
+ parallel_workers=par_workers,
3553
+ index=self.index_name,
3554
+ sort_keys=sort_keys,
3555
+ estimated_cost=self.cost,
3556
+ estimated_cardinality=Cardinality(self.cardinality_estimate),
3557
+ actual_cardinality=Cardinality(true_card),
3558
+ execution_time=self.execution_time,
3559
+ cache_hits=shared_hits,
3560
+ cache_misses=shared_misses,
3561
+ subplan_root=subplan_child,
3562
+ subplan_name=subplan_name,
3563
+ )
3564
+
3565
+ def inspect(self, *, _indentation: int = 0) -> str:
3566
+ """Provides a pretty string representation of the *EXPLAIN* sub-plan that can be printed.
3567
+
3568
+ Parameters
3569
+ ----------
3570
+ _indentation : int, optional
3571
+ This parameter is internal to the method and ensures that the correct indentation is used for the child nodes
3572
+ of the plan. When inspecting the root node, this value is set to its default value of `0`.
3573
+
3574
+ Returns
3575
+ -------
3576
+ str
3577
+ A string representation of the *EXPLAIN* sub-plan.
3578
+ """
3579
+ if self.parent_relationship in ("InitPlan", "SubPlan"):
3580
+ padding = " " * (max(_indentation - 2, 0))
3581
+ cte_name = self.subplan_name if self.subplan_name else ""
3582
+ own_inspection = [f"{padding}{self.parent_relationship}: {cte_name}"]
3583
+ else:
3584
+ own_inspection = []
3585
+ padding = " " * _indentation
3586
+ prefix = f"{padding}<- " if padding else ""
3587
+ own_inspection += [prefix + str(self)]
3588
+ child_inspections = [
3589
+ child.inspect(_indentation=_indentation + 2) for child in self.children
3590
+ ]
3591
+ return "\n".join(own_inspection + child_inspections)
3592
+
3593
+ def _infer_sorting_from_children(self) -> list[SortKey]:
3594
+ # TODO: Postgres is a cruel mistress. Even if output is sorted, it might not be marked as such.
3595
+ # For example, in index scans, this is implicitly encoded in the index condition, sometimes even nested in other
3596
+ # expressions. We first need a reliable way to parse the expressions into a PostBOUND-compatible format.
3597
+ # See _parse_sort_keys for a start.
3598
+ return None
3599
+
3600
+ def _parse_sort_keys(self) -> list[SortKey]:
3601
+ # TODO implementation
3602
+ return None
3603
+
3604
+ def __hash__(self) -> int:
3605
+ return self._hash_val
3606
+
3607
+ def __eq__(self, other: object) -> bool:
3608
+ return (
3609
+ isinstance(other, type(self))
3610
+ and self.node_type == other.node_type
3611
+ and self.relation_name == other.relation_name
3612
+ and self.relation_alias == other.relation_alias
3613
+ and self.children == other.children
3614
+ )
3615
+
3616
+ def __repr__(self) -> str:
3617
+ return str(self)
3618
+
3619
+ def __str__(self) -> str:
3620
+ analyze_content = (
3621
+ f" (actual time={self.execution_time}s rows={self.true_cardinality} loops={self.loops})"
3622
+ if self.is_analyze()
3623
+ else ""
3624
+ )
3625
+ explain_content = f"(cost={self.cost} rows={self.cardinality_estimate})"
3626
+ conditions = " ".join(
3627
+ f"{condition}: {value}"
3628
+ for condition, value in self.filter_conditions().items()
3629
+ )
3630
+ conditions = " " + conditions if conditions else ""
3631
+ if self.is_scan():
3632
+ scan_info = f" on {self.parse_table().identifier()}"
3633
+ elif self.cte_name:
3634
+ scan_info = f" on {self.cte_name}"
3635
+ else:
3636
+ scan_info = ""
3637
+ return (
3638
+ self.node_type + scan_info + explain_content + analyze_content + conditions
3639
+ )
3640
+
3641
+
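To make the attribute mapping concrete, here is a hand-written miniature of the JSON data a node consumes; real
*EXPLAIN (FORMAT JSON)* output contains many more keys:

    raw_node = {
        "Node Type": "Seq Scan",
        "Relation Name": "title",
        "Alias": "t",
        "Total Cost": 4321.0,
        "Plan Rows": 250000,
        "Filter": "(production_year > 2000)",
    }

    node = PostgresExplainNode(raw_node)
    node.is_scan()            # True -- "Seq Scan" is listed in PostgresExplainScanNodes
    node.is_analyze()         # False -- no "Actual Total Time" key, so execution_time stays NaN
    node.parse_table()        # TableReference for "title" with alias "t"
    node.filter_conditions()  # {"Filter": "(production_year > 2000)"}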
3642
+ class PostgresExplainPlan:
3643
+ """Models an entire *EXPLAIN* plan produced by Postgres
3644
+
3645
+ In contrast to `PostgresExplainNode`, this includes additional parameters (planning time and execution time) for the entire
3646
+ plan, rather than just portions of it.
3647
+
3648
+ This class supports all methods that are specified on the general `QueryPlan` and returns the correct data for its actual
3649
+ plan.
3650
+
3651
+ Parameters
3652
+ ----------
3653
+ explain_data : dict
3654
+ The JSON data of the entire explain plan. This is parsed and prepared as part of the *__init__* method.
3655
+
3656
+
3657
+ Attributes
3658
+ ----------
3659
+ planning_time : float
3660
+ The time in seconds that the optimizer spent to build the plan
3661
+ execution_time : float
3662
+ The time in seconds the query execution engine needed to calculate the result set of the query. This does not account
3663
+ for network time to transmit the result set.
3664
+ query_plan : PostgresExplainNode
3665
+ The actual plan
3666
+ """
3667
+
3668
+ def __init__(self, explain_data: dict) -> None:
3669
+ self.explain_data = (
3670
+ explain_data[0] if isinstance(explain_data, list) else explain_data
3671
+ )
3672
+ self.planning_time: float = (
3673
+ self.explain_data.get("Planning Time", math.nan) / 1000
3674
+ )
3675
+ self.execution_time: float = (
3676
+ self.explain_data.get("Execution Time", math.nan) / 1000
3677
+ )
3678
+ self.query_plan = PostgresExplainNode(self.explain_data["Plan"])
3679
+ self._normalized_plan = self.query_plan.as_qep()
3680
+
3681
+ @property
3682
+ def root(self) -> PostgresExplainNode:
3683
+ """Gets the root node of the actual query plan."""
3684
+ return self.query_plan
3685
+
3686
+ def is_analyze(self) -> bool:
3687
+ """Checks, whether this *EXPLAIN* plan is an *EXPLAIN ANALYZE* plan or a pure *EXPLAIN* plan.
3688
+
3689
+ The analyze variant does not only obtain the plan, but actually executes it. This enables the comparison of the
3690
+ optimizer's estimates to the actual values. If a plan is an *EXPLAIN ANALYZE* plan, some attributes of this node
3691
+ receive actual values. These include `execution_time`, `true_cardinality`, `loops` and `parallel_workers`.
3692
+
3693
+
3694
+ Returns
3695
+ -------
3696
+ bool
3697
+ Whether the plan represents an *EXPLAIN ANALYZE* plan
3698
+ """
3699
+ return self.query_plan.is_analyze()
3700
+
3701
+ def as_qep(self) -> QueryPlan:
3702
+ """Provides the actual explain plan as a normalized query execution plan instance
3703
+
3704
+ For notes on peculiarities of this method, take a look at the *See Also* section
3705
+
3706
+ Returns
3707
+ -------
3708
+ QueryPlan
3709
+ The query execution plan
3710
+
3711
+ See Also
3712
+ --------
3713
+ PostgresExplainNode.as_qep
3714
+ """
3715
+ return self._normalized_plan
3716
+
3717
+ def inspect(self) -> str:
3718
+ """Provides a pretty string representation of the actual plan.
3719
+
3720
+ Returns
3721
+ -------
3722
+ str
3723
+ A string representation of the plan
3724
+
3725
+ See Also
3726
+ --------
3727
+ PostgresExplainNode.inspect
3728
+ """
3729
+ return self.query_plan.inspect()
3730
+
3731
+ def __json__(self) -> Any:
3732
+ return self.explain_data
3733
+
3734
+ def __getattribute__(self, name: str) -> Any:
3735
+ # All methods that are not defined on the Postgres plan delegate to the default DB plan
3736
+ try:
3737
+ return object.__getattribute__(self, name)
3738
+ except AttributeError:
3739
+ root_plan_node = object.__getattribute__(self, "query_plan")
3740
+ try:
3741
+ return root_plan_node.__getattribute__(name)
3742
+ except AttributeError:
3743
+ normalized_plan = object.__getattribute__(self, "_normalized_plan")
3744
+ return normalized_plan.__getattribute__(name)
3745
+
3746
+ def __hash__(self) -> int:
3747
+ return hash(self.query_plan)
3748
+
3749
+ def __eq__(self, other: object) -> bool:
3750
+ return isinstance(other, type(self)) and self.query_plan == other.query_plan
3751
+
3752
+ def __repr__(self) -> str:
3753
+ return str(self)
3754
+
3755
+ def __str__(self) -> str:
3756
+ if self.is_analyze():
3757
+ prefix = f"EXPLAIN ANALYZE (plan time={self.planning_time}, exec time={self.execution_time})"
3758
+ else:
3759
+ prefix = "EXPLAIN"
3760
+
3761
+ return f"{prefix} root: {self.query_plan}"
3762
+
3763
+
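Putting both classes together, a raw *EXPLAIN (FORMAT JSON)* result can be wrapped roughly as follows. The connect string
and query are placeholders, and the sketch assumes that Psycopg already decodes the JSON column into Python objects:

    pg = PostgresInterface("dbname=imdb user=postbound")  # placeholder connect string
    cursor = pg.cursor()
    cursor.execute("EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) SELECT count(*) FROM title")
    raw_explain = cursor.fetchone()[0]  # a list containing a single plan dictionary

    plan = PostgresExplainPlan(raw_explain)
    print(plan.planning_time, plan.execution_time)  # both converted to seconds
    print(plan.inspect())                           # pretty-printed plan tree
    qep = plan.as_qep()                             # normalized QueryPlan for the rest of PostBOUND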
3764
+ class WorkloadShifter:
3765
+ """The shifter provides simple means to manipulate the current contents of a database.
3766
+
3767
+ Currently, such means only include the deletion of specific rows, but other tools could be added in the future.
3768
+
3769
+ Parameters
3770
+ ----------
3771
+ pg_instance : PostgresInterface
3772
+ The database to manipulate
3773
+ """
3774
+
3775
+ def __init__(self, pg_instance: PostgresInterface) -> None:
3776
+ self.pg_instance = pg_instance
3777
+
3778
+ def remove_random(
3779
+ self,
3780
+ table: TableReference | str,
3781
+ *,
3782
+ n_rows: Optional[int] = None,
3783
+ row_pct: Optional[float] = None,
3784
+ vacuum: bool = False,
3785
+ ) -> None:
3786
+ """Deletes tuples from a specific tables at random.
3787
+
3788
+ Parameters
3789
+ ----------
3790
+ table : TableReference | str
3791
+ The table from which to delete
3792
+ n_rows : Optional[int], optional
3793
+ The absolute number of rows to delete. Defaults to *None* in which case the `row_pct` is used.
3794
+ row_pct : Optional[float], optional
3795
+ The share of rows to delete. Value should be in range (0, 1). Defaults to *None* in which case the `n_rows` is
3796
+ used.
3797
+ vacuum : bool, optional
3798
+ Whether the database should be vacuumed after deletion. This optimizes the page layout by compacting the pages and
3799
+ forces a refresh of all statistics.
3800
+
3801
+ Raises
3802
+ ------
3803
+ ValueError
3804
+ If no correct `n_rows` or `row_pct` values have been given.
3805
+
3806
+ Warnings
3807
+ --------
3808
+ Notice that deletions in the given table can trigger further deletions in other tables through cascades in the schema.
3809
+ """
3810
+ table_name = table.full_name if isinstance(table, TableReference) else table
3811
+ n_rows = self._determine_row_cnt(table_name, n_rows, row_pct)
3812
+ pk_column = self.pg_instance.schema().primary_key_column(table_name)
3813
+ removal_template = textwrap.dedent("""
3814
+ WITH delete_samples AS (
3815
+ SELECT {col} AS sample_id, RANDOM() AS _pb_rand_val
3816
+ FROM {table}
3817
+ ORDER BY _pb_rand_val
3818
+ LIMIT {cnt}
3819
+ )
3820
+ DELETE FROM {table}
3821
+ WHERE EXISTS (SELECT 1 FROM delete_samples WHERE sample_id = {col})
3822
+ """)
3823
+ removal_query = removal_template.format(
3824
+ table=table_name, col=pk_column.name, cnt=n_rows
3825
+ )
3826
+ self._perform_removal(removal_query, vacuum)
3827
+
3828
+ def remove_ordered(
3829
+ self,
3830
+ column: ColumnReference | str,
3831
+ *,
3832
+ n_rows: Optional[int] = None,
3833
+ row_pct: Optional[float] = None,
3834
+ ascending: bool = True,
3835
+ null_placement: Optional[Literal["first", "last"]] = None,
3836
+ vacuum: bool = False,
3837
+ ) -> None:
3838
+ """Deletes the smallest/largest tuples from a specific table.
3839
+
3840
+ Parameters
3841
+ ----------
3842
+ column : ColumnReference | str
3843
+ The column to infer the deletion order. Can be either a proper column reference including the containing table, or
3844
+ a fully-qualified column string such as _table.column_ .
3845
+ n_rows : Optional[int], optional
3846
+ The absolute number of rows to delete. Defaults to *None* in which case the `row_pct` is used.
3847
+ row_pct : Optional[float], optional
3848
+ The share of rows to delete. Value should be in range (0, 1). Defaults to *None* in which case the `n_rows` is
3849
+ used.
3850
+ ascending : bool, optional
3851
+ Whether the first or the last rows should be deleted. *NULL* values are placed according to `null_placement`.
3852
+ null_placement : Optional[Literal["first", "last"]], optional
3853
+ Where to put *NULL* values in the order. Using the default value of *None* treats *NULL* values as being the
3854
+ largest values possible.
3855
+ vacuum : bool, optional
3856
+ Whether the database should be vacuumed after deletion. This optimizes the page layout by compacting the pages and
3857
+ forces a refresh of all statistics.
3858
+
3859
+ Raises
3860
+ ------
3861
+ ValueError
3862
+ If no correct `n_rows` or `row_pct` values have been given.
3863
+
3864
+ Warnings
3865
+ --------
3866
+ Notice that deletions in the given table can trigger further deletions in other tables through cascades in the schema.
3867
+ """
3868
+
3869
+ if isinstance(column, str):
3870
+ table_name, col_name = column.split(".")
3871
+ elif isinstance(column, ColumnReference):
3872
+ table_name, col_name = column.table.full_name, column.name
3873
+ else:
3874
+ raise TypeError("Unknown column type: " + str(column))
3875
+ n_rows = self._determine_row_cnt(table_name, n_rows, row_pct)
3876
+ pk_column = self.pg_instance.schema().primary_key_column(table_name)
3877
+ order_direction = "ASC" if ascending else "DESC"
3878
+ null_vals = "" if null_placement is None else f"NULLS {null_placement.upper()}"
3879
+ removal_template = textwrap.dedent("""
3880
+ WITH delete_entries AS (
3881
+ SELECT {pk_col}
3882
+ FROM {table}
3883
+ ORDER BY {order_col} {order_dir} {nulls}, {pk_col} ASC
3884
+ LIMIT {cnt}
3885
+ )
3886
+ DELETE FROM {table} t
3887
+ WHERE EXISTS (SELECT 1 FROM delete_entries
3888
+ WHERE delete_entries.{pk_col} = t.{pk_col})
3889
+ """)
3890
+ removal_query = removal_template.format(
3891
+ table=table_name,
3892
+ pk_col=pk_column.name,
3893
+ order_col=col_name,
3894
+ order_dir=order_direction,
3895
+ nulls=null_vals,
3896
+ cnt=n_rows,
3897
+ )
3898
+ self._perform_removal(removal_query, vacuum)
3899
+
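As a usage sketch (the table and column names below are merely examples and assume an IMDB-like schema):

    shifter = WorkloadShifter(pg)  # pg is an existing PostgresInterface

    # Delete 1% of the title rows at random and compact the table afterwards.
    shifter.remove_random("title", row_pct=0.01, vacuum=True)

    # Delete the 10,000 oldest movies; NULL production years count as the largest values.
    shifter.remove_ordered("title.production_year", n_rows=10_000, ascending=True)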
3900
+ def generate_marker_table(
3901
+ self,
3902
+ target_table: str,
3903
+ marker_pct: float = 0.5,
3904
+ *,
3905
+ target_column: str = "id",
3906
+ marker_table: Optional[str] = None,
3907
+ marker_column: Optional[str] = None,
3908
+ ) -> None:
3909
+ """Generates a new table that can be used to store rows that should be deleted at a later point in time.
3910
+
3911
+ The marker table will be created if it does not exist already. It contains exactly two columns: one column for the
3912
+ marker index (an ascending integer value) and another column that stores the primary keys of rows that should be
3913
+ deleted from the target table. If the marker table exists already, all current markings (but not the marked rows
3914
+ themselves) are removed. Afterwards, the new rows to delete are selected at random.
3915
+
3916
+ By default, only the target table is a required parameter. All other parameters have default values or can be inferred
3917
+ from the target table. The marker index column is *marker_idx*.
3918
+
3919
+ Parameters
3920
+ ----------
3921
+ target_table : str
3922
+ The table from which rows should be removed
3923
+ marker_pct : float
3924
+ The share of rows that should be included in the marker table, given as a fraction in the range *[0, 1]*.
3925
+ target_column : str, optional
3926
+ The column that contains the values used to identify the rows to be deleted in the target table. Defaults to *id*.
3927
+ marker_table : Optional[str], optional
3928
+ The name of the marker table that should store the row identifiers. Defaults to
3929
+ *<target table name>_delete_marker*.
3930
+ marker_column : Optional[str], optional
3931
+ The name of the column in the marker table that should contain the target column values. Defaults to
3932
+ *<target table name>_<target column name>*.
3933
+
3934
+ See Also
3935
+ --------
3936
+ remove_marked
3937
+ export_marker_table
3938
+ """
3939
+ marker_table = (
3940
+ f"{target_table}_delete_marker" if marker_table is None else marker_table
3941
+ )
3942
+ marker_column = (
3943
+ f"{target_table}_{target_column}"
3944
+ if marker_column is None
3945
+ else marker_column
3946
+ )
3947
+ target_col_ref = ColumnReference(target_column, TableReference(target_table))
3948
+ target_column_type = self.pg_instance.schema().datatype(target_col_ref)
3949
+ marker_create_query = textwrap.dedent(f"""
3950
+ CREATE TABLE IF NOT EXISTS {marker_table} (
3951
+ marker_idx BIGSERIAL PRIMARY KEY,
3952
+ {marker_column} {target_column_type}
3953
+ );
3954
+ """)
3955
+ marker_pct = round(marker_pct * 100)
3956
+ marker_inflate_query = textwrap.dedent(f"""
3957
+ INSERT INTO {marker_table}({marker_column})
3958
+ SELECT {target_column}
3959
+ FROM {target_table} TABLESAMPLE BERNOULLI ({marker_pct});
3960
+ """)
3961
+ with self.pg_instance.obtain_new_local_connection() as conn:
3962
+ cursor = conn.cursor()
3963
+ cursor.execute(marker_create_query)
3964
+ cursor.execute(f"DELETE FROM {marker_table};")
3965
+ cursor.execute(marker_inflate_query)
3966
+
3967
+ def export_marker_table(
3968
+ self,
3969
+ *,
3970
+ target_table: Optional[str] = None,
3971
+ marker_table: Optional[str] = None,
3972
+ out_file: Optional[str] = None,
3973
+ ) -> None:
3974
+ """Stores a marker table in a CSV file on disk.
3975
+
3976
+ This allows the marker table to be re-imported later on.
3977
+
3978
+ Parameters
3979
+ ----------
3980
+ target_table : Optional[str], optional
3981
+ The name of the target table for which the marker has been created. This can be used to infer the name of the
3982
+ marker table if the defaults have been used.
3983
+ marker_table : Optional[str], optional
3984
+ The name of the marker table. Can be omitted if the default name has been used and `target_table` is specified.
3985
+ out_file : Optional[str], optional
3986
+ The name and path of the output CSV file to create. If omitted, the name will be `<marker table name>.csv` and the
3987
+ file will be placed in the current working directory. If specified, an absolute path must be used.
3988
+
3989
+ Raises
3990
+ ------
3991
+ ValueError
3992
+ If neither `target_table` nor `marker_table` are given.
3993
+
3994
+ See Also
3995
+ --------
3996
+ import_marker_table
3997
+ remove_marked
3998
+ """
3999
+ if target_table is None and marker_table is None:
4000
+ raise ValueError("Either marker table or target table are required!")
4001
+ marker_table = (
4002
+ f"{target_table}_delete_marker" if marker_table is None else marker_table
4003
+ )
4004
+ out_file = (
4005
+ pathlib.Path(f"{marker_table}.csv").absolute()
4006
+ if out_file is None
4007
+ else out_file
4008
+ )
4009
+ self.pg_instance.cursor().execute(
4010
+ f"COPY {marker_table} TO '{out_file}' DELIMITER ',' CSV HEADER;"
4011
+ )
4012
+
4013
+ def import_marker_table(
4014
+ self,
4015
+ *,
4016
+ target_table: Optional[str] = None,
4017
+ marker_table: Optional[str] = None,
4018
+ target_column: str = "id",
4019
+ marker_column: Optional[str] = None,
4020
+ target_column_type: Optional[str] = None,
4021
+ in_file: Optional[str] = None,
4022
+ ) -> None:
4023
+ """Loads the contents of a marker table from a CSV file from disk.
4024
+
4025
+ The table will be created if it does not exist already. If the marker table exists already, all current markings (but
4026
+ not the marked rows themselves) are removed. Afterwards, the new markings are imported.
4027
+
4028
+ Parameters
4029
+ ----------
4030
+ target_table : Optional[str], optional
4031
+ The name of the target table for which the marker has been created. This can be used to infer the name of the
4032
+ marker table if the defaults have been used.
4033
+ marker_table : Optional[str], optional
4034
+ The name of the marker table. Can be omitted if the default name has been used and `target_table` is specified.
4035
+ target_column : str, optional
4036
+ The column that contains the values used to identify the rows to be deleted in the target table. Defaults to *id*.
4037
+ marker_column : Optional[str], optional
4038
+ The name of the column in the marker table that contains the target column values. Defaults to
4039
+ *<target table name>_<target column name>*.
4040
+ target_column_type : Optional[str], optional
4041
+ The datatype of the target column. If this parameter is not given, `target_table` has to be specified to infer the
4042
+ proper datatype from the schema metadata.
4043
+ in_file : Optional[str], optional
4044
+ The name and path of the CSV file to read. If omitted, the name will be `<marker table name>.csv` and the
4045
+ file will be loaded from the current working directory. If specified, an absolute path must be used.
4046
+
4047
+ Raises
4048
+ ------
4049
+ ValueError
4050
+ If neither `target_table` nor `marker_table` are given.
4051
+
4052
+ See Also
4053
+ --------
4054
+ export_marker_table
4055
+ remove_marked
4056
+ """
4057
+ if not target_table and not marker_table:
4058
+ raise ValueError("Either marker table or target table are required!")
4059
+ marker_table = (
4060
+ f"{target_table}_delete_marker" if marker_table is None else marker_table
4061
+ )
4062
+ marker_column = (
4063
+ f"{target_table}_{target_column}"
4064
+ if marker_column is None
4065
+ else marker_column
4066
+ )
4067
+ in_file = (
4068
+ pathlib.Path(f"{marker_table}.csv").absolute()
4069
+ if in_file is None
4070
+ else in_file
4071
+ )
4072
+
4073
+ if target_column_type is None:
4074
+ target_col_ref = ColumnReference(
4075
+ target_column, TableReference(target_table)
4076
+ )
4077
+ target_column_type = self.pg_instance.schema().datatype(target_col_ref)
4078
+
4079
+ marker_create_query = textwrap.dedent(f"""
4080
+ CREATE TABLE IF NOT EXISTS {marker_table} (
4081
+ marker_idx BIGSERIAL PRIMARY KEY,
4082
+ {marker_column} {target_column_type}
4083
+ );
4084
+ """)
4085
+ marker_import_query = textwrap.dedent(f"""
4086
+ COPY {marker_table}(marker_idx, {marker_column})
4087
+ FROM '{in_file}'
4088
+ DELIMITER ','
4089
+ CSV HEADER;
4090
+ """)
4091
+ with self.pg_instance.obtain_new_local_connection() as conn:
4092
+ cursor = conn.cursor()
4093
+ cursor.execute(marker_create_query)
4094
+ cursor.execute(f"DELETE FROM {marker_table}")
4095
+ cursor.execute(marker_import_query)
4096
+
4097
+ def remove_marked(
4098
+ self,
4099
+ target_table: str,
4100
+ *,
4101
+ target_column: str = "id",
4102
+ marker_table: Optional[str] = None,
4103
+ marker_column: Optional[str] = None,
4104
+ vacuum: bool = False,
4105
+ ) -> None:
4106
+ """Deletes rows according to their primary keys stored in a marker table.
4107
+
4108
+ Parameters
4109
+ ----------
4110
+ target_table : str
4111
+ The table from which the rows should be removed.
4112
+ target_column : str, optional
4113
+ A column of the target table that is used to identify rows matching the marked rows to remove. Defaults to *id*.
4114
+ marker_table : Optional[str], optional
4115
+ A table containing markers for the rows to delete. Defaults to *<target table>_delete_marker*.
4116
+ marker_column : Optional[str], optional
4117
+ A column of the marker table that contains the target column values of the rows to remove. Defaults to
4118
+ *<target table>_<target column>*.
4119
+ vacuum : bool, optional
4120
+ Whether the database should be vacuumed after deletion. This optimizes the page layout by compacting the pages and
4121
+ forces a refresh of all statistics.
4122
+
4123
+ See Also
4124
+ --------
4125
+ generate_marker_table
4126
+ """
4127
+ # TODO: align parameter types with TableReference and ColumnReference
4128
+ marker_table = (
4129
+ f"{target_table}_delete_marker" if marker_table is None else marker_table
4130
+ )
4131
+ marker_column = (
4132
+ f"{target_table}_{target_column}"
4133
+ if marker_column is None
4134
+ else marker_column
4135
+ )
4136
+ removal_query = textwrap.dedent(f"""
4137
+ DELETE FROM {target_table}
4138
+ WHERE EXISTS (SELECT 1 FROM {marker_table}
4139
+ WHERE {marker_table}.{marker_column} = {target_table}.{target_column})""")
4140
+ self._perform_removal(removal_query, vacuum)
4141
+
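The marker-based methods are designed to be combined; a minimal sketch of the full workflow (with example table names)
could look like this:

    shifter = WorkloadShifter(pg)

    # Mark roughly 20% of the cast_info rows for deletion and persist the markers on disk.
    shifter.generate_marker_table("cast_info", marker_pct=0.2)
    shifter.export_marker_table(target_table="cast_info")  # writes cast_info_delete_marker.csv

    # Later, e.g. on a freshly restored database instance:
    shifter.import_marker_table(target_table="cast_info")
    shifter.remove_marked("cast_info", vacuum=True)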
4142
+ def _perform_removal(self, removal_query: str, vacuum: bool) -> None:
4143
+ """Executes a specific removal query and optionally cleans up the storage system.
4144
+
4145
+ Parameters
4146
+ ----------
4147
+ removal_query : str
4148
+ The query that describes the desired delete operation.
4149
+ vacuum : bool
4150
+ Whether the database should be vacuumed after deletion. This optimizes the page layout by compacting the pages and
4151
+ forces a refresh of all statistics.
4152
+ """
4153
+ with self.pg_instance.obtain_new_local_connection() as conn:
4154
+ cursor = conn.cursor()
4155
+ cursor.execute(removal_query)
4156
+ if vacuum:
4157
+ # We can't use the with-syntax here because VACUUM cannot be executed inside a transaction
4158
+ conn = self.pg_instance.obtain_new_local_connection()
4159
+ conn.autocommit = True
4160
+ cursor = conn.cursor()
4161
+ # We really need a full vacuum due to cascading deletes
4162
+ cursor.execute("VACUUM FULL ANALYZE;")
4163
+ cursor.close()
4164
+ conn.close()
4165
+
4166
+ def _determine_row_cnt(
4167
+ self, table: str, n_rows: Optional[int], row_pct: Optional[float]
4168
+ ) -> int:
4169
+ """Calculates the absolute number of rows to delete while also performing sanity checks.
4170
+
4171
+ Parameters
4172
+ ----------
4173
+ table : str
4174
+ The table from which rows should be deleted. This is necessary to determine the current row count.
4175
+ n_rows : Optional[int]
4176
+ The absolute number of rows to delete.
4177
+ row_pct : Optional[float]
4178
+ The fraction in (0, 1) of rows to delete.
4179
+
4180
+ Returns
4181
+ -------
4182
+ int
4183
+ The absolute number of rows to delete. This is equal to `n_rows` if that parameter was given. Otherwise, the number is
4184
+ inferred from the `row_pct` and the current number of tuples in the table.
4185
+
4186
+ Raises
4187
+ ------
4188
+ ValueError
4189
+ If both or neither of `n_rows` and `row_pct` were given, or if any of the parameters is outside of the allowed
4190
+ range.
4191
+ """
4192
+ if n_rows is None and row_pct is None:
4193
+ raise ValueError(
4194
+ "Either absolute number of rows or row percentage must be given"
4195
+ )
4196
+ if n_rows is not None and row_pct is not None:
4197
+ raise ValueError(
4198
+ "Cannot use both absolute number of rows and row percentage"
4199
+ )
4200
+
4201
+ if n_rows is not None and not n_rows > 0:
4202
+ raise ValueError("Not a valid row count: " + str(n_rows))
4203
+ elif n_rows is not None and n_rows > 0:
4204
+ return n_rows
4205
+
4206
+ if not 0.0 < row_pct < 1.0:
4207
+ raise ValueError("Not a valid row percentage: " + str(row_pct))
4208
+
4209
+ total_n_rows = self.pg_instance.statistics().total_rows(
4210
+ TableReference(table), cache_enabled=False, emulated=True
4211
+ )
4212
+ if total_n_rows is None:
4213
+ raise StateError(
4214
+ "Could not determine total number of rows for table " + table
4215
+ )
4216
+ return round(row_pct * total_n_rows)
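A short worked example of the sizing logic: for ``row_pct=0.05`` on a table whose statistics report 2,000,000 rows, the
method returns ``round(0.05 * 2_000_000) = 100_000`` rows to delete, whereas passing both or neither of `n_rows` and
`row_pct` raises a *ValueError*.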