PostBOUND 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- postbound/__init__.py +211 -0
- postbound/_base.py +6 -0
- postbound/_bench.py +1012 -0
- postbound/_core.py +1153 -0
- postbound/_hints.py +1373 -0
- postbound/_jointree.py +1079 -0
- postbound/_pipelines.py +1121 -0
- postbound/_qep.py +1986 -0
- postbound/_stages.py +876 -0
- postbound/_validation.py +734 -0
- postbound/db/__init__.py +72 -0
- postbound/db/_db.py +2348 -0
- postbound/db/_duckdb.py +785 -0
- postbound/db/mysql.py +1195 -0
- postbound/db/postgres.py +4216 -0
- postbound/experiments/__init__.py +12 -0
- postbound/experiments/analysis.py +674 -0
- postbound/experiments/benchmarking.py +54 -0
- postbound/experiments/ceb.py +877 -0
- postbound/experiments/interactive.py +105 -0
- postbound/experiments/querygen.py +334 -0
- postbound/experiments/workloads.py +980 -0
- postbound/optimizer/__init__.py +92 -0
- postbound/optimizer/__init__.pyi +73 -0
- postbound/optimizer/_cardinalities.py +369 -0
- postbound/optimizer/_joingraph.py +1150 -0
- postbound/optimizer/dynprog.py +1825 -0
- postbound/optimizer/enumeration.py +432 -0
- postbound/optimizer/native.py +539 -0
- postbound/optimizer/noopt.py +54 -0
- postbound/optimizer/presets.py +147 -0
- postbound/optimizer/randomized.py +650 -0
- postbound/optimizer/tonic.py +1479 -0
- postbound/optimizer/ues.py +1607 -0
- postbound/qal/__init__.py +343 -0
- postbound/qal/_qal.py +9678 -0
- postbound/qal/formatter.py +1089 -0
- postbound/qal/parser.py +2344 -0
- postbound/qal/relalg.py +4257 -0
- postbound/qal/transform.py +2184 -0
- postbound/shortcuts.py +70 -0
- postbound/util/__init__.py +46 -0
- postbound/util/_errors.py +33 -0
- postbound/util/collections.py +490 -0
- postbound/util/dataframe.py +71 -0
- postbound/util/dicts.py +330 -0
- postbound/util/jsonize.py +68 -0
- postbound/util/logging.py +106 -0
- postbound/util/misc.py +168 -0
- postbound/util/networkx.py +401 -0
- postbound/util/numbers.py +438 -0
- postbound/util/proc.py +107 -0
- postbound/util/stats.py +37 -0
- postbound/util/system.py +48 -0
- postbound/util/typing.py +35 -0
- postbound/vis/__init__.py +5 -0
- postbound/vis/fdl.py +69 -0
- postbound/vis/graphs.py +48 -0
- postbound/vis/optimizer.py +538 -0
- postbound/vis/plots.py +84 -0
- postbound/vis/tonic.py +70 -0
- postbound/vis/trees.py +105 -0
- postbound-0.19.0.dist-info/METADATA +355 -0
- postbound-0.19.0.dist-info/RECORD +67 -0
- postbound-0.19.0.dist-info/WHEEL +5 -0
- postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
- postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/db/postgres.py
ADDED
@@ -0,0 +1,4216 @@
"""Contains the Postgres implementation of the Database interface.

In many ways the Postgres implementation can be thought of as the reference or blueprint implementation of the database
interface. This is due to two main reasons: first up, Postgres' capabilities follow a traditional architecture and its
features cover most of the general aspects of query optimization (i.e. supported operators, join orders and statistics).
Secondly, and on a more pragmatic note, Postgres was the first database system that was supported by PostBOUND and therefore
a lot of the original Postgres interfaces eventually evolved into the more abstract database-independent interfaces.
"""

from __future__ import annotations

import collections
import concurrent
import concurrent.futures
import math
import multiprocessing as mp
import os
import pathlib
import re
import subprocess
import sys
import textwrap
import threading
import time
import warnings
from collections import UserString
from collections.abc import Callable, Generator, Iterable, Sequence
from multiprocessing import connection as mp_conn
from pathlib import Path
from typing import Any, Literal, Optional

import psycopg
import psycopg.rows

from .. import util
from .._core import (
    Cardinality,
    IntermediateOperator,
    JoinOperator,
    PhysicalOperator,
    ScanOperator,
    UnboundColumnError,
    VirtualTableError,
)
from .._hints import (
    HintType,
    PhysicalOperatorAssignment,
    PlanParameterization,
    operators_from_plan,
)
from .._jointree import JoinTree, jointree_from_plan, parameters_from_plan
from .._qep import QueryPlan, SortKey
from ..qal import formatter, transform
from ..qal._qal import (
    AbstractPredicate,
    ArrayAccessExpression,
    BetweenPredicate,
    BinaryPredicate,
    CaseExpression,
    CastExpression,
    ColumnExpression,
    ColumnReference,
    CompoundOperator,
    CompoundPredicate,
    Explain,
    FunctionExpression,
    Hint,
    InPredicate,
    Limit,
    MathExpression,
    OrderBy,
    OrderByExpression,
    SqlExpression,
    SqlQuery,
    StarExpression,
    StaticValueExpression,
    SubqueryExpression,
    TableReference,
    UnaryPredicate,
    WindowExpression,
)
from ..util import StateError, Version, jsondict
from ._db import (
    Database,
    DatabasePool,
    DatabaseSchema,
    DatabaseServerError,
    DatabaseStatistics,
    DatabaseUserError,
    HintService,
    HintWarning,
    OptimizerInterface,
    ResultSet,
    UnsupportedDatabaseFeatureError,
    simplify_result_set,
)

_SignificantPostgresSettings = {
    # Resource consumption settings (see https://www.postgresql.org/docs/current/runtime-config-resource.html)
    # Memory
    "shared_buffers",
    "huge_pages",
    "huge_page_size",
    "temp_buffers",
    "max_prepared_transactions",
    "work_mem",
    "hash_mem_multiplier",
    "maintenance_work_mem",
    "autovacuum_work_mem",
    "vacuum_buffer_usage_limit",
    "logical_decoding_work_mem",
    "max_stack_depth",
    "shared_memory_type",
    "dynamic_shared_memory_type",
    "min_dynamic_shared_memory",
    # Disk
    "temp_file_limit",
    # Kernel Resource Usage
    "max_files_per_process",
    # Cost-based Vacuum Delay
    "vacuum_cost_delay",
    "vacuum_cost_page_hit",
    "vacuum_cost_page_miss",
    "vacuum_cost_page_dirty",
    "vacuum_cost_limit",
    # Background Writer
    "bgwriter_delay",
    "bgwriter_lru_maxpages",
    "bgwriter_lru_multiplier",
    "bgwriter_flush_after",
    # Asynchronous Behavior
    "backend_flush_after",
    "effective_io_concurrency",
    "maintenance_io_concurrency",
    "max_worker_processes",
    "max_parallel_workers_per_gather",
    "max_parallel_maintenance_workers",
    "max_parallel_workers",
    "parallel_leader_participation",
    "old_snapshot_threshold",
    # Query Planning Settings (see https://www.postgresql.org/docs/current/runtime-config-query.html)
    # Planner Method Configuration
    "enable_async_append",
    "enable_bitmapscan",
    "enable_gathermerge",
    "enable_hashagg",
    "enable_hashjoin",
    "enable_incremental_sort",
    "enable_indexscan",
    "enable_indexonlyscan",
    "enable_material",
    "enable_memoize",
    "enable_mergejoin",
    "enable_nestloop",
    "enable_parallel_append",
    "enable_parallel_hash",
    "enable_partition_pruning",
    "enable_partitionwise_join",
    "enable_partitionwise_aggregate",
    "enable_presorted_aggregate",
    "enable_seqscan",
    "enable_sort",
    "enable_tidscan",
    # Planner Cost Constants
    "seq_page_cost",
    "random_page_cost",
    "cpu_tuple_cost",
    "cpu_index_tuple_cost",
    "cpu_operator_cost",
    "parallel_setup_cost",
    "parallel_tuple_cost",
    "min_parallel_table_scan_size",
    "min_parallel_index_scan_size",
    "effective_cache_size",
    "jit_above_cost",
    "jit_inline_above_cost",
    "jit_optimize_above_cost",
    # Genetic Query Optimizer
    "geqo",
    "geqo_threshold",
    "geqo_effort",
    "geqo_pool_size",
    "geqo_generations",
    "geqo_selection_bias",
    "geqo_seed",
    # Other Planner Options
    "default_statistics_target",
    "constraint_exclusion",
    "cursor_tuple_fraction",
    "from_collapse_limit",
    "jit",
    "join_collapse_limit",
    "plan_cache_mode",
    "recursive_worktable_factor",
    # Automatic Vacuuming (https://www.postgresql.org/docs/current/runtime-config-autovacuum.html)
    "autovacuum",
    "autovacuum_max_workers",
    "autovacuum_naptime",
    "autovacuum_threshold",
    "autovacuum_insert_threshold",
    "autovacuum_analyze_threshold",
    "autovacuum_scale_factor",
    "autovacuum_analyze_scale_factor",
    "autovacuum_freeze_max_age",
    "autovacuum_multixact_freeze_max_age",
    "autovacuum_cost_delay",
    "autovacuum_cost_limit",
}
"""Postgres settings that are relevant to many PostBOUND workflows.

These settings can influence performance measurements of different benchmarks. Therefore, we want to make their values
transparent in order to assess the results.

As a rule of thumb we include settings from three major categories: resource consumption (e.g. size of shared buffers),
optimizer settings (e.g. enable operators) and auto vacuum. The final category is required because it determines how good the
statistics are once a new database dump has been loaded or a data shift has been simulated. For all of these categories we
include all settings, even if they are not important right now to the best of our knowledge. This is done to prevent tedious
debugging if a setting is later found to be indeed important: if the category to which it belongs is present in our
"significant settings", it is guaranteed to be monitored.

Most notably settings regarding replication, logging and network settings are excluded, as well as settings regarding locking.
This is done because PostBOUND's database abstraction assumes read-only workloads with a single query at a time. If data shifts
are simulated, these are supposed to happen strictly before or after a read-only workload is executed and benchmarked.

All settings are up-to-date as of Postgres version 16.
"""

_RuntimeChangeablePostgresSettings = {
    setting for setting in _SignificantPostgresSettings
} - {
    "autovacuum_max_workers",
    "autovacuum_naptime",
    "autovacuum_threshold",
    "autovacuum_insert_threshold",
    "autovacuum_analyze_threshold",
    "autovacuum_scale_factor",
    "autovacuum_analyze_scale_factor",
    "autovacuum_freeze_max_age",
    "autovacuum_multixact_freeze_max_age",
    "autovacuum_cost_delay",
    "autovacuum_cost_limit",
    "autovacuum_work_mem",
    "bgwriter_delay",
    "bgwriter_lru_maxpages",
    "bgwriter_lru_multiplier",
    "bgwriter_flush_after",
    "dynamic_shared_memory_type",
    "huge_pages",
    "huge_page_size",
    "max_files_per_process",
    "max_prepared_transactions",
    "max_worker_processes",
    "min_dynamic_shared_memory",
    "old_snapshot_threshold",
    "shared_buffers",
    "shared_memory_type",
}
"""These are exactly those settings from `_SignificantPostgresSettings` that can be changed at runtime."""


class PostgresSetting(str):
    """Model for a single Postgres configuration such as *SET enable_nestloop = 'off';*.

    This setting can be used directly as a replacement where a string is expected, or its different components can be accessed
    via the `parameter` and `value` attribute.

    Parameters
    ----------
    parameter : str
        The name of the setting
    value : object
        The setting's current or desired value
    """

    def __init__(self, parameter: str, value: object) -> None:
        self._param = parameter
        self._val = value

    def __new__(cls, parameter: str, value: object):
        value = "on" if value is True else "off" if value is False else value
        return super().__new__(cls, f"SET {parameter} = '{value}';")

    __match_args__ = ("parameter", "value")

    @property
    def parameter(self) -> str:
        """Gets the name of the setting.

        Returns
        -------
        str
            The name
        """
        return self._param

    @property
    def value(self) -> object:
        """Gets the current or desired value of the setting.

        Returns
        -------
        object
            The raw, i.e. un-escaped value of the setting.
        """
        return self._val

    def update(self, value: object) -> PostgresSetting:
        """Creates a new setting with the same name but a different value.

        Parameters
        ----------
        value : object
            The new value

        Returns
        -------
        PostgresSetting
            The new setting
        """
        return PostgresSetting(self.parameter, value)

    def __getnewargs__(self) -> tuple[str, object]:
        return (self.parameter, self.value)

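As an editorial aside (not part of the packaged file), the class above can be exercised roughly as follows; a `PostgresSetting` is simultaneously the literal *SET* statement and a structured (parameter, value) pair:

# Editorial usage sketch, not part of postgres.py: illustrates the PostgresSetting API defined above.
setting = PostgresSetting("enable_nestloop", False)
str(setting)          # "SET enable_nestloop = 'off';" (booleans are mapped to 'on'/'off' in __new__)
setting.parameter     # "enable_nestloop"
setting.value         # False (the raw, un-escaped value)
setting.update(True)  # a new PostgresSetting for the same parameter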

class PostgresConfiguration(collections.UserString):
    """Model for a collection of different postgres settings that form a complete server configuration.

    Each configuration is built of individual `PostgresSetting` objects. The configuration can be used directly as a replacement
    when a string is expected, or its different settings can be accessed individually - either through the accessor methods, or
    by using a dict-like syntax: calling ``config[setting]`` with a string setting value will provide the matching
    `PostgresSetting`. Since the configuration also subclasses string, the precise behavior of `__getitem__` depends on the
    argument type: string arguments provide settings whereas integer arguments result in specific characters. All other string
    methods are implemented such that the normal string behavior is retained. All additional behavior is part of new methods.

    Parameters
    ----------
    settings : Iterable[PostgresSetting]
        The settings that form the configuration.

    Warnings
    --------
    Notice that while the configuration is a *UserString*, psycopg currently does not support executing the configuration, i.e.
    executing ``cursor.execute(config)`` will not work. Instead, the configuration has to be manually converted into a string
    first by calling *str* as in ``cursor.execute(str(config))``. This also applies to the `execute_query()` method of the
    `PostgresInterface` class, since it uses psycopg under the hood.
    """

    @staticmethod
    def load(*args, **kwargs) -> PostgresConfiguration:
        """Generates a new configuration based on (setting name, value) pairs.

        Parameters
        ----------
        args
            Ready-to-use `PostgresSetting` objects
        kwargs
            Additional settings

        Returns
        -------
        PostgresConfiguration
            The configuration
        """
        return PostgresConfiguration(
            list(args) + [PostgresSetting(key, val) for key, val in kwargs.items()]
        )

    def __init__(self, settings: Iterable[PostgresSetting]) -> None:
        self._settings = {setting.parameter: setting for setting in settings}
        super().__init__(self._format())

    @property
    def settings(self) -> Sequence[PostgresSetting]:
        """Gets the settings that are part of the configuration.

        Returns
        -------
        Sequence[PostgresSetting]
            The settings in the order in which they were originally specified.
        """
        return list(self._settings.values())

    def parameters(self) -> Sequence[str]:
        """Provides all setting names that are specified in this configuration.

        Returns
        -------
        Sequence[str]
            The setting names in the order in which they were originally specified.
        """
        return list(self._settings.keys())

    def add(
        self, setting: PostgresSetting | str = None, value: object = None, **kwargs
    ) -> PostgresConfiguration:
        """Creates a new configuration with additional settings.

        The setting can be supplied either as a `PostgresSetting` object or as a key-value pair.
        The latter case allows both positional arguments, as well as keyword arguments.

        Parameters
        ----------
        setting : PostgresSetting | str
            The setting to add. This can either be a readily created `PostgresSetting` object or a string that will be used as
            the setting name. In the latter case, the `value` has to be supplied as well.
        value : object
            The value of the setting. This is only used if `setting` is a string.
        kwargs
            If the setting is not specified as a string, nor as a `PostgresSetting` object, it has to be specified as keyword
            arguments. The keyword argument names are used as the setting names, the values are used as the setting values.

        Returns
        -------
        PostgresConfiguration
            The updated configuration. The original config is not modified.
        """
        if isinstance(setting, str):
            setting = PostgresSetting(setting, value)

        target_settings = dict(self._settings)
        if isinstance(setting, PostgresSetting):
            target_settings[setting.parameter] = setting
        else:
            settings = {key: PostgresSetting(key, val) for key, val in kwargs.items()}
            target_settings.update(settings)

        return PostgresConfiguration(target_settings.values())

    def remove(self, setting: PostgresSetting | str) -> PostgresConfiguration:
        """Creates a new configuration without a specific setting.

        Parameters
        ----------
        setting : PostgresSetting
            The setting to remove

        Returns
        -------
        PostgresConfiguration
            The updated configuration. The original config is not modified.
        """
        parameter = (
            setting.parameter if isinstance(setting, PostgresSetting) else setting
        )
        target_settings = dict(self._settings)
        target_settings.pop(parameter, None)
        return PostgresConfiguration(target_settings.values())

    def update(
        self, setting: PostgresSetting | str, value: object
    ) -> PostgresConfiguration:
        """Creates a new configuration with an updated setting.

        Parameters
        ----------
        setting : PostgresSetting | str
            The setting to update. This can either be the raw setting name, or a `PostgresSetting` object. In either case,
            the updated value has to be supplied via the `value` parameter. (When supplying a `PostgresSetting`, only its
            name is used.)
        value : object
            The updated value of the setting.

        Returns
        -------
        PostgresConfiguration
            The updated configuration. The original config is not modified.
        """
        match setting:
            case str():
                setting = PostgresSetting(setting, value)
            case PostgresSetting(name, _):
                setting = PostgresSetting(name, value)

        target_settings = dict(self._settings)
        target_settings[setting.parameter] = setting

        return PostgresConfiguration(target_settings.values())

    def as_dict(self) -> dict[str, object]:
        """Provides all settings as setting name -> setting value mappings.

        Returns
        -------
        dict[str, object]
            The settings. Changes to this dictionary will not be reflected in the configuration object.
        """
        return dict(self._settings)

    def _format(self) -> str:
        """Provides the string representation of the configuration.

        Returns
        -------
        str
            The string representation
        """
        return "\n".join([str(setting) for setting in self.settings])

    def __getitem__(self, key: object) -> str:
        if isinstance(key, str):
            return self._settings[key]
        return super().__getitem__(key)

    def __setitem__(self, key: object, value: object) -> None:
        if isinstance(key, str):
            self._settings[key] = value
            self.data = self._format()
        else:
            super().__setitem__(key, value)

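An editorial sketch of the configuration class above (not part of the packaged file). As the *Warnings* section notes, psycopg needs the plain string, so the configuration is wrapped in *str* before execution; the `cursor` below stands for any psycopg cursor:

# Editorial usage sketch, not part of postgres.py.
config = PostgresConfiguration.load(PostgresSetting("geqo", False), work_mem="1GB")
config = config.add("jit", False).update("work_mem", "2GB")
config["work_mem"]           # string keys return the matching PostgresSetting
cursor.execute(str(config))  # cursor: a hypothetical psycopg cursor; str() is required, see the Warnings section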

class PostgresConfigInterface:
    """A thin wrapper that provides read-only access to Postgres configuration settings using __getitem__ syntax."""

    def __init__(self, pg_instance: PostgresInterface) -> None:
        self._pg = pg_instance

    def __getitem__(self, key: str) -> Any:
        return self._pg.execute_query(f"SHOW {key};", cache_enabled=False, raw=False)

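As a brief editorial note (not part of the packaged file), the wrapper above turns item access into *SHOW* queries; `pg` stands for a `PostgresInterface` instance defined later in this module:

# Editorial sketch: reads the live value of a server setting through PostgresConfigInterface.
current_work_mem = pg.config["work_mem"]  # issues "SHOW work_mem;" against the connected server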

_PGVersionPattern = re.compile(r"^PostgreSQL (?P<pg_ver>[\d]+(\.[\d]+)?).*$")
"""Regular expression to extract the Postgres server version from the *VERSION()* function.

References
----------

.. Pattern debugging: https://regex101.com/r/UTQkfa/1
"""


class PostgresInterface(Database):
    """Database implementation for PostgreSQL backends.

    The `config` attribute provides read-only access to the current GUC values of the server.

    Parameters
    ----------
    connect_string : str
        Connection string for `psycopg` to establish a connection to the Postgres server
    system_name : str, optional
        Description of the specific Postgres server, by default *Postgres*
    application_name : str, optional
        Identifier for the Postgres server. This will be the name that is shown in the server logs and process lists.
    client_encoding : str, optional
        The client encoding to use for the connection, by default *UTF8*
    cache_enabled : bool, optional
        Whether to enable caching of database queries, by default *False*
    debug : bool, optional
        Whether additional debug information should be printed during database interaction. Defaults to *False*.
    """

    def __init__(
        self,
        connect_string: str,
        system_name: str = "Postgres",
        *,
        application_name: str = "PostBOUND",
        client_encoding: str = "UTF8",
        cache_enabled: bool = False,
        debug: bool = False,
    ) -> None:
        self.connect_string = connect_string
        self.debug = debug
        self.config = PostgresConfigInterface(self)
        self._application_name = application_name or "PostBOUND"
        self._client_encoding = client_encoding
        self._init_connection()

        self._db_stats = PostgresStatisticsInterface(self)
        self._db_schema = PostgresSchemaInterface(self)
        self._hinting_backend = PostgresHintService(self)

        self._timeout_executor = TimeoutQueryExecutor(self)
        self._last_query_runtime = math.nan

        super().__init__(system_name, cache_enabled=cache_enabled)

    def schema(self) -> PostgresSchemaInterface:
        return self._db_schema

    def statistics(self) -> PostgresStatisticsInterface:
        return self._db_stats

    def hinting(self) -> PostgresHintService:
        return self._hinting_backend

    def execute_query(
        self,
        query: SqlQuery | str,
        *,
        cache_enabled: Optional[bool] = None,
        raw: bool = False,
        timeout: Optional[float] = None,
    ) -> Any:
        if timeout is not None and timeout > 0:
            return self._timeout_executor.execute_query(
                query, timeout=timeout, cache_enabled=cache_enabled, raw=raw
            )

        cache_enabled = cache_enabled or (cache_enabled is None and self._cache_enabled)
        if isinstance(query, UserString):
            query = str(query)
        elif isinstance(query, SqlQuery):
            query = self._hinting_backend.format_query(query)

        if cache_enabled and query in self._query_cache:
            query_result = self._query_cache[query]
            return query_result if raw else simplify_result_set(query_result)

        try:
            start_time = time.perf_counter_ns()
            self._cursor.execute(query)
            end_time = time.perf_counter_ns()
            self._last_query_runtime = (
                end_time - start_time
            ) / 10**9  # convert to seconds

            query_result = (
                self._cursor.fetchall() if self._cursor.rowcount >= 0 else None
            )
            if cache_enabled:
                self._inflate_query_cache()
                self._query_cache[query] = query_result
        except (psycopg.InternalError, psycopg.OperationalError) as e:
            msg = "\n".join(
                [
                    f"At {util.timestamp()}",
                    "For query:",
                    str(query),
                    "Message:",
                    str(e),
                ]
            )
            raise DatabaseServerError(msg, e)
        except psycopg.Error as e:
            msg = "\n".join(
                [
                    f"At {util.timestamp()}",
                    "For query:",
                    str(query),
                    "Message:",
                    str(e),
                ]
            )
            raise DatabaseUserError(msg, e)

        return query_result if raw else simplify_result_set(query_result)

    def execute_with_timeout(
        self, query: SqlQuery | str, timeout: float = 60.0
    ) -> Optional[ResultSet]:
        try:
            result = self.execute_query(
                query, timeout=timeout, cache_enabled=False, raw=True
            )
            return result
        except TimeoutError:
            return None

    def last_query_runtime(self) -> float:
        return self._last_query_runtime

    def time_query(self, query: SqlQuery, *, timeout: Optional[float] = None) -> float:
        self.execute_query(query, cache_enabled=False, raw=True, timeout=timeout)
        return self.last_query_runtime()

    def optimizer(self) -> PostgresOptimizer:
        return PostgresOptimizer(self)

    def database_name(self) -> str:
        self._cursor.execute("SELECT CURRENT_DATABASE();")
        db_name = self._cursor.fetchone()[0]
        return db_name

    def database_system_version(self) -> Version:
        self._cursor.execute("SELECT VERSION();")
        version_string = self._cursor.fetchone()[0]
        version_match = _PGVersionPattern.match(version_string)
        if not version_match:
            raise RuntimeError(
                f"Could not extract Postgres version from string '{version_string}'"
            )
        pg_ver = version_match.group("pg_ver")
        return Version(pg_ver)

    def backend_pid(self) -> int:
        """Provides the backend process ID of the current connection.

        Returns
        -------
        int
            The process ID
        """
        return self._connection.info.backend_pid

    def data_dir(self) -> Path:
        """Get the data directory of the Postgres server.

        Returns
        -------
        Path
            The data directory path
        """
        self._cursor.execute("SHOW data_directory;")
        data_dir = self._cursor.fetchone()[0]
        return Path(data_dir)

    def logfile(self) -> Optional[Path]:
        """Get the log file of the (local) Postgres server."""
        proc_path = Path(f"/proc/{self.backend_pid()}/fd/1")
        if not proc_path.exists() or not proc_path.is_symlink():
            return None
        return proc_path.resolve()

    def describe(self) -> jsondict:
        base_info = {
            "system_name": self.database_system_name(),
            "system_version": self.database_system_version(),
            "database": self.database_name(),
            "statistics_settings": {
                "emulated": self._db_stats.emulated,
                "cache_enabled": self._db_stats.cache_enabled,
            },
            "hinting_mode": self._hinting_backend.describe(),
            "query_cache": self.cache_enabled,
        }
        self._cursor.execute("SELECT name, setting FROM pg_settings")
        system_settings = self._cursor.fetchall()
        base_info["system_settings"] = {
            setting: value
            for setting, value in system_settings
            if setting in _SignificantPostgresSettings
        }

        schema_info: list[jsondict] = []
        for table in self._db_schema.tables():
            if table.full_name.startswith("pg_"):
                continue  # skip system tables

            column_info: list[jsondict] = []

            for column in self._db_schema.columns(table):
                column_info.append(
                    {
                        "column": str(column),
                        "indexed": self.schema().has_index(column),
                        "foreign_keys": self._db_schema.foreign_keys_on(column),
                    }
                )

            pk_col = self._db_schema.primary_key_column(table)
            schema_info.append(
                {
                    "table": str(table),
                    "n_rows": self.statistics().total_rows(table, emulated=True),
                    "columns": column_info,
                    "primary_key": pk_col.name if pk_col else None,
                }
            )

        base_info["schema_info"] = schema_info
        return base_info

    def reset_connection(self) -> int:
        try:
            self._connection.cancel()
            self._cursor.close()
            self._connection.close()
        except psycopg.Error:
            pass
        return self._init_connection()

    def cursor(self) -> psycopg.Cursor:
        return self._cursor

    def connection(self) -> psycopg.Connection:
        """Provides the current database connection.

        Returns
        -------
        psycopg.Connection
            The connection
        """
        return self._connection

    def obtain_new_local_connection(self) -> psycopg.Connection:
        """Provides a new database connection to be used exclusively by the client.

        The current connection maintained by the `PostgresInterface` is not affected by obtaining a new connection in any
        way.

        Returns
        -------
        psycopg.Connection
            The connection
        """
        return psycopg.connect(self.connect_string)

    def close(self) -> None:
        self._cursor.close()
        self._connection.close()

    def prewarm_tables(
        self,
        tables: Optional[TableReference | Iterable[TableReference]] = None,
        *more_tables: TableReference,
        exclude_table_pages: bool = False,
        include_primary_index: bool = True,
        include_secondary_indexes: bool = True,
    ) -> None:
        """Prepares the Postgres buffer pool with tuples from specific tables.

        Parameters
        ----------
        tables : Optional[TableReference | Iterable[TableReference]], optional
            The tables that should be placed into the buffer pool
        *more_tables : TableReference
            More tables that should be placed into the buffer pool, enabling a more convenient usage of this method.
            See examples for details on the usage.
        exclude_table_pages : bool, optional
            Whether the table data (i.e. pages containing the actual tuples) should *not* be prewarmed. This is off by default,
            meaning that prewarming is applied to the data pages. This can be toggled on to only prewarm index pages (see
            `include_primary_index` and `include_secondary_index`).
        include_primary_index : bool, optional
            Whether the pages of the primary key index should also be prewarmed. Enabled by default.
        include_secondary_indexes : bool, optional
            Whether the pages for secondary indexes should also be prewarmed. Enabled by default.

        Notes
        -----
        If the database should prewarm more table pages than can be contained in the shared buffer, the actual contents of the
        pool are not specified. Since all prewarming tasks happen sequentially, the first prewarmed relations will typically
        be evicted and only the last relations (tables or indexes) are retained in the shared buffer. The precise order in
        which the prewarming tasks are executed is not specified and depends on the actual relations.

        Examples
        --------
        >>> pg.prewarm_tables([table1, table2])
        >>> pg.prewarm_tables(table1, table2)
        """
        self._assert_active_extension("pg_prewarm")
        tables: Iterable[TableReference] = list(util.enlist(tables)) + list(more_tables)
        if not tables:
            return
        tables = set(
            tab.full_name for tab in tables
        )  # eliminate duplicates if tables are selected multiple times

        table_indexes = (
            [self._fetch_index_relnames(tab) for tab in tables]
            if include_primary_index or include_secondary_indexes
            else []
        )
        indexes_to_prewarm = {
            idx
            for idx, primary in util.flatten(table_indexes)
            if (primary and include_primary_index)
            or (not primary and include_secondary_indexes)
        }
        tables = (
            indexes_to_prewarm if exclude_table_pages else tables | indexes_to_prewarm
        )
        if not tables:
            return

        prewarm_invocations = [f"pg_prewarm('{tab}')" for tab in tables]
        prewarm_text = ", ".join(prewarm_invocations)
        prewarm_query = f"SELECT {prewarm_text}"

        self._cursor.execute(prewarm_query)

    def cooldown_tables(
        self,
        tables: Optional[TableReference | Iterable[TableReference]] = None,
        *more_tables: TableReference,
        exclude_table_pages: bool = False,
        include_primary_index: bool = True,
        include_secondary_indexes: bool = True,
    ) -> None:
        """Removes tuples from specific tables from the Postgres buffer pool.

        This method can be used to simulate a cold start for the next incoming query. It requires the *pg_temperature*
        extension that is part of the pg_lab project.

        Parameters
        ----------
        tables : Optional[TableReference | Iterable[TableReference]], optional
            The tables that should be removed from the buffer pool
        *more_tables : TableReference
            More tables that should be removed from the buffer pool, enabling a more convenient usage of this method.
            See examples for details on the usage.
        exclude_table_pages : bool, optional
            Whether the table data (i.e. pages containing the actual tuples) should *not* be removed. This is off by default,
            meaning that the cooldown is applied to the data pages. This can be toggled on to only cooldown index pages (see
            `include_primary_index` and `include_secondary_index`).
        include_primary_index : bool, optional
            Whether the pages of the primary key index should also be cooled down. Enabled by default.
        include_secondary_indexes : bool, optional
            Whether the pages for secondary indexes should also be cooled down. Enabled by default.

        Examples
        --------
        >>> pg.cooldown_tables([table1, table2])
        >>> pg.cooldown_tables(table1, table2)

        References
        ----------
        pg_lab : https://github.com/rbergm/pg_lab
        """
        self._assert_active_extension("pg_temperature")
        tables: Iterable[TableReference] = list(util.enlist(tables)) + list(more_tables)
        if not tables:
            return
        tables = set(
            tab.full_name for tab in tables
        )  # eliminate duplicates if tables are selected multiple times

        table_indexes = (
            [self._fetch_index_relnames(tab) for tab in tables]
            if include_primary_index or include_secondary_indexes
            else []
        )
        indexes_to_cooldown = {
            idx
            for idx, primary in util.flatten(table_indexes)
            if (primary and include_primary_index)
            or (not primary and include_secondary_indexes)
        }
        tables = (
            indexes_to_cooldown if exclude_table_pages else tables | indexes_to_cooldown
        )
        if not tables:
            return

        cooldown_invocations = [f"pg_cooldown('{tab}')" for tab in tables]
        cooldown_text = ", ".join(cooldown_invocations)
        cooldown_query = f"SELECT {cooldown_text}"

        self._cursor.execute(cooldown_query)

    def current_configuration(
        self, *, runtime_changeable_only: bool = False
    ) -> PostgresConfiguration:
        """Provides all current configuration settings in the current Postgres connection.

        Parameters
        ----------
        runtime_changeable_only : bool, optional
            Whether only such settings that can be changed at runtime should be provided. Defaults to *False*.

        Returns
        -------
        PostgresConfiguration
            The current configuration.
        """
        self._cursor.execute("SELECT name, setting FROM pg_settings")
        system_settings = self._cursor.fetchall()
        allowed_settings = (
            _RuntimeChangeablePostgresSettings
            if runtime_changeable_only
            else _SignificantPostgresSettings
        )
        configuration = {
            setting: value
            for setting, value in system_settings
            if setting in allowed_settings
        }
        return PostgresConfiguration.load(**configuration)

    def apply_configuration(
        self, configuration: PostgresConfiguration | PostgresSetting | str
    ) -> None:
        """Changes specific configuration parameters of the Postgres server or current connection.

        Parameters
        ----------
        configuration : PostgresConfiguration | PostgresSetting | str
            The desired setting values. If a string is supplied directly, it already has to be a valid setting update such as
            *SET geqo = FALSE;*.
        """
        if (
            isinstance(configuration, PostgresSetting)
            and configuration.parameter not in _RuntimeChangeablePostgresSettings
        ):
            warnings.warn(
                f"Cannot apply configuration setting '{configuration.parameter}' at runtime"
            )
            return
        elif isinstance(configuration, PostgresConfiguration):
            supported_settings: list[PostgresSetting] = []
            unsupported_settings: list[str] = []
            for setting in configuration.settings:
                if setting.parameter in _RuntimeChangeablePostgresSettings:
                    supported_settings.append(setting)
                else:
                    unsupported_settings.append(setting.parameter)
            if unsupported_settings:
                warnings.warn(
                    f"Skipping configuration settings {unsupported_settings} "
                    "because they cannot be changed at runtime"
                )
            configuration = str(PostgresConfiguration(supported_settings))

        self._cursor.execute(configuration)

    def has_extension(
        self, extension_name: str, *, is_shared_object: bool = True
    ) -> bool:
        """Checks whether the current Postgres database has a specific extension loaded and active.

        Extensions can be either created using the *CREATE EXTENSION* command, or by loading the shared object via *LOAD*.
        For the shared object-based check to work correctly, the Postgres server has to run in the same namespace as the
        PostBOUND client.

        Parameters
        ----------
        extension_name : str
            The name of the extension to be checked. In case of shared objects, this should be equivalent to the name of said
            object. In this case, the suffix is optional.
        is_shared_object : bool, optional
            Whether the extension is a shared object that is loaded into the Postgres server. By default this is set to *True*,
            which assumes that the extension is loaded as a shared object, rather than as a default extension.

        Returns
        -------
        bool
            Whether the extension is loaded and active in the current Postgres database.
        """
        match sys.platform:
            case "win32" | "cygwin":
                lib_suffix = ".dll"
            case "darwin":
                lib_suffix = ".dylib"
            case "linux":
                lib_suffix = ".so"
            case _:
                raise RuntimeError(
                    f"Platform '{sys.platform}' is not supported by extension check."
                )

        if is_shared_object or extension_name in ("pg_hint_plan", "pg_lab"):
            shared_object_name = (
                f"{extension_name}{lib_suffix}"
                if not extension_name.endswith(lib_suffix)
                else extension_name
            )
            loaded_shared_objects = util.system.open_files(
                self._connection.info.backend_pid
            )
            return any(so.endswith(shared_object_name) for so in loaded_shared_objects)
        else:
            self._cursor.execute("SELECT extname FROM pg_extension;")
            return any(ext[0] == extension_name for ext in self._cursor.fetchall())

    def _init_connection(self) -> int:
        """Sets all default connection parameters and creates the actual database cursor.

        Returns
        -------
        int
            The backend process ID of the new connection
        """
        self._connection: psycopg.Connection = psycopg.connect(
            self.connect_string,
            application_name=self._application_name,
            client_encoding=self._client_encoding,
            row_factory=psycopg.rows.tuple_row,
        )
        self._connection.autocommit = (
            True  # pg_hint_plan hinting backend currently relies on autocommit!
        )
        self._connection.prepare_threshold = None
        self._cursor: psycopg.Cursor = self._connection.cursor()
        return self.backend_pid()

    def _fetch_index_relnames(
        self, table: TableReference | str
    ) -> Iterable[tuple[str, bool]]:
        """Loads all physical index relations for a physical table.

        Parameters
        ----------
        table : TableReference
            The table for which to load the indexes

        Returns
        -------
        Iterable[tuple[str, bool]]
            All indexes as pairs *(relation name, primary)*. Relation name corresponds to the table-like object that Postgres
            created internally to store the index (e.g. for a table called *title*, this is typically called *title_pkey* for
            the primary key index). The *primary* boolean indicates whether this is the primary key index of the table.
        """
        query_template = textwrap.dedent("""
            SELECT cls.relname, idx.indisprimary
            FROM pg_index idx
            JOIN pg_class cls ON idx.indexrelid = cls.oid
            JOIN pg_class owner_cls ON idx.indrelid = owner_cls.oid
            WHERE owner_cls.relname = %s;
            """)
        table = table.full_name if isinstance(table, TableReference) else table
        self._cursor.execute(query_template, (table,))
        return list(self._cursor.fetchall())

    def _assert_active_extension(
        self, extension_name: str, *, is_shared_object: bool = False
    ) -> None:
        """Raises an error if the current postgres database does not have the desired extension.

        Extensions can be created using the *CREATE EXTENSION* command, or by loading the shared object via *LOAD*. In either
        case, this method can check whether they are indeed active.

        Parameters
        ----------
        extension_name : str
            The name of the extension to be checked.
        is_shared_object : bool, optional
            Whether the extension is activated using *LOAD*. If this is the case, the shared objects owned by the database
            process rather than the internal extension catalogs will be checked. The extension name will be automatically
            suffixed with *.so* if necessary. As a special case, for checking the *pg_hint_plan* extension this parameter does
            not need to be true. This is due to the central importance of that extension for the entire Postgres hinting
            system and saves some typing in that case.

        Raises
        ------
        StateError
            If the given extension is not active
        """
        extension_is_active = self.has_extension(
            extension_name, is_shared_object=is_shared_object
        )
        if not extension_is_active:
            raise StateError(
                f"Extension '{extension_name}' is not active in database '{self.database_name()}'"
            )

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, type(self))
            and self.connect_string == other.connect_string
        )

    def __hash__(self) -> int:
        return hash(self.connect_string)

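To tie the pieces together, a hypothetical end-to-end sketch (editorial, not part of the packaged file); the connection string is a placeholder and `query` stands for a parsed `SqlQuery`:

# Editorial usage sketch, not part of postgres.py.
pg = PostgresInterface("dbname=imdb user=postbound host=localhost")
print(pg.database_system_version())                         # version string parsed via _PGVersionPattern
rows = pg.execute_query("SELECT 1;", cache_enabled=False)
runtime = pg.time_query(query, timeout=30.0)                # runs the query and reports last_query_runtime()
pg.apply_configuration(PostgresSetting("work_mem", "2GB"))  # only runtime-changeable settings are applied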
1149
|
+
class PostgresSchemaInterface(DatabaseSchema):
|
|
1150
|
+
"""Database schema implementation for Postgres systems.
|
|
1151
|
+
|
|
1152
|
+
Parameters
|
|
1153
|
+
----------
|
|
1154
|
+
postgres_db : PostgresInterface
|
|
1155
|
+
The database for which schema information should be retrieved
|
|
1156
|
+
"""
|
|
1157
|
+
|
|
1158
|
+
def __int__(self, postgres_db: PostgresInterface) -> None:
|
|
1159
|
+
super().__init__(postgres_db)
|
|
1160
|
+
|
|
1161
|
+
def tables(self, *, schema: str = "public") -> set[TableReference]:
|
|
1162
|
+
query_template = textwrap.dedent("""
|
|
1163
|
+
SELECT table_name
|
|
1164
|
+
FROM information_schema.tables
|
|
1165
|
+
WHERE table_catalog = %s AND table_schema = %s""")
|
|
1166
|
+
self._db.cursor().execute(query_template, (self._db.database_name(), schema))
|
|
1167
|
+
result_set = self._db.cursor().fetchall()
|
|
1168
|
+
assert result_set is not None
|
|
1169
|
+
return set(TableReference(row[0]) for row in result_set)
|
|
1170
|
+
|
|
1171
|
+
def lookup_column(
|
|
1172
|
+
self,
|
|
1173
|
+
column: ColumnReference | str,
|
|
1174
|
+
candidate_tables: Iterable[TableReference],
|
|
1175
|
+
*,
|
|
1176
|
+
expect_match: bool = False,
|
|
1177
|
+
) -> Optional[TableReference]:
|
|
1178
|
+
candidate_tables = (
|
|
1179
|
+
set(candidate_tables)
|
|
1180
|
+
if len(candidate_tables) > 5
|
|
1181
|
+
else list(candidate_tables)
|
|
1182
|
+
)
|
|
1183
|
+
column = column.name if isinstance(column, ColumnReference) else column
|
|
1184
|
+
lower_col = column.lower()
|
|
1185
|
+
|
|
1186
|
+
for table in candidate_tables:
|
|
1187
|
+
table_columns = self._fetch_columns(table)
|
|
1188
|
+
if column in table_columns or lower_col in table_columns:
|
|
1189
|
+
return table
|
|
1190
|
+
|
|
1191
|
+
if not expect_match:
|
|
1192
|
+
return None
|
|
1193
|
+
candidate_tables = [table.qualified_name() for table in candidate_tables]
|
|
1194
|
+
raise ValueError(
|
|
1195
|
+
f"Column '{column}' not found in candidate tables {candidate_tables}"
|
|
1196
|
+
)
|
|
1197
|
+
|
|
1198
|
+
def is_primary_key(self, column: ColumnReference) -> bool:
|
|
1199
|
+
if not column.table:
|
|
1200
|
+
raise UnboundColumnError(column)
|
|
1201
|
+
if column.table.virtual:
|
|
1202
|
+
raise VirtualTableError(column.table)
|
|
1203
|
+
index_map = self._fetch_indexes(column.table)
|
|
1204
|
+
return index_map.get(column.name, False)
|
|
1205
|
+
|
|
1206
|
+
def has_secondary_index(self, column: ColumnReference) -> bool:
|
|
1207
|
+
if not column.table:
|
|
1208
|
+
raise UnboundColumnError(column)
|
|
1209
|
+
if column.table.virtual:
|
|
1210
|
+
raise VirtualTableError(column.table)
|
|
1211
|
+
index_map = self._fetch_indexes(column.table)
|
|
1212
|
+
|
|
1213
|
+
# The index map contains an entry for each attribute that actually has an index. The value is True, if the
|
|
1214
|
+
# attribute (which is known to be indexed), is even the Primary Key
|
|
1215
|
+
# Our method should return False in two cases: 1) the attribute is not indexed at all; and 2) the attribute
|
|
1216
|
+
# actually is the Primary key. Therefore, by assuming it is the PK in case of absence, we get the correct
|
|
1217
|
+
# value.
|
|
1218
|
+
return not index_map.get(column.name, True)
|
|
1219
|
+
|
|
1220
|
+
def indexes_on(self, column: ColumnReference) -> set[str]:
|
|
1221
|
+
if not column.table:
|
|
1222
|
+
raise UnboundColumnError(column)
|
|
1223
|
+
if column.table.virtual:
|
|
1224
|
+
raise VirtualTableError(column.table)
|
|
1225
|
+
schema = column.table.schema or "public"
|
|
1226
|
+
query_template = textwrap.dedent("""
|
|
1227
|
+
SELECT cls.relname
|
|
1228
|
+
FROM pg_index idx
|
|
1229
|
+
JOIN pg_class cls ON idx.indexrelid = cls.oid
|
|
1230
|
+
JOIN pg_class rel ON idx.indrelid = rel.oid
|
|
1231
|
+
JOIN pg_attribute att ON att.attnum = ANY(idx.indkey) AND idx.indrelid = att.attrelid
|
|
1232
|
+
JOIN pg_namespace nsp ON cls.relnamespace = nsp.oid AND rel.relnamespace = nsp.oid
|
|
1233
|
+
WHERE rel.relname = %s
|
|
1234
|
+
AND att.attname = %s
|
|
1235
|
+
AND nsp.nspname = %s
|
|
1236
|
+
""")
|
|
1237
|
+
|
|
1238
|
+
self._db.cursor().execute(
|
|
1239
|
+
query_template, (column.table.full_name, column.name, schema)
|
|
1240
|
+
)
|
|
1241
|
+
result_set = self._db.cursor().fetchall()
|
|
1242
|
+
return {row[0] for row in result_set}
|
|
1243
|
+
|
|
1244
|
+
def foreign_keys_on(self, column: ColumnReference) -> set[ColumnReference]:
|
|
1245
|
+
if not column.table:
|
|
1246
|
+
raise UnboundColumnError(column)
|
|
1247
|
+
if column.table.virtual:
|
|
1248
|
+
raise VirtualTableError(column.table)
|
|
1249
|
+
schema = column.table.schema or "public"
|
|
1250
|
+
query_template = textwrap.dedent("""
|
|
1251
|
+
SELECT target.table_name, target.column_name
|
|
1252
|
+
FROM information_schema.key_column_usage AS fk_sources
|
|
1253
|
+
JOIN information_schema.table_constraints AS all_constraints
|
|
1254
|
+
ON fk_sources.constraint_name = all_constraints.constraint_name
|
|
1255
|
+
AND fk_sources.table_schema = all_constraints.table_schema
|
|
1256
|
+
JOIN information_schema.constraint_column_usage AS target
|
|
1257
|
+
ON fk_sources.constraint_name = target.constraint_name
|
|
1258
|
+
AND fk_sources.table_schema = target.table_schema
|
|
1259
|
+
WHERE fk_sources.table_name = %s
|
|
1260
|
+
AND fk_sources.column_name = %s
|
|
1261
|
+
AND fk_sources.table_schema = %s
|
|
1262
|
+
AND all_constraints.constraint_type = 'FOREIGN KEY'
|
|
1263
|
+
""")
|
|
1264
|
+
|
|
1265
|
+
self._db.cursor().execute(
|
|
1266
|
+
query_template, (column.table.full_name, column.name, schema)
|
|
1267
|
+
)
|
|
1268
|
+
result_set = self._db.cursor().fetchall()
|
|
1269
|
+
return {ColumnReference(row[1], TableReference(row[0])) for row in result_set}
|
|
1270
|
+
|
|
1271
|
+
def datatype(self, column: ColumnReference) -> str:
|
|
1272
|
+
if not column.table:
|
|
1273
|
+
raise UnboundColumnError(column)
|
|
1274
|
+
if column.table.virtual:
|
|
1275
|
+
raise VirtualTableError(column.table)
|
|
1276
|
+
schema = column.table.schema or "public"
|
|
1277
|
+
query_template = textwrap.dedent("""
|
|
1278
|
+
SELECT data_type FROM information_schema.columns
|
|
1279
|
+
WHERE table_name = %s AND column_name = %s AND table_schema = %s""")
|
|
1280
|
+
self._db.cursor().execute(
|
|
1281
|
+
query_template, (column.table.full_name, column.name, schema)
|
|
1282
|
+
)
|
|
1283
|
+
result_set = self._db.cursor().fetchone()
|
|
1284
|
+
return result_set[0]
|
|
1285
|
+
|
|
1286
|
+
def is_nullable(self, column: ColumnReference) -> bool:
|
|
1287
|
+
if not column.table:
|
|
1288
|
+
raise UnboundColumnError(column)
|
|
1289
|
+
if column.table.virtual:
|
|
1290
|
+
raise VirtualTableError(column.table)
|
|
1291
|
+
schema = column.table.schema or "public"
|
|
1292
|
+
query_template = textwrap.dedent("""
|
|
1293
|
+
SELECT is_nullable = 'YES' FROM information_schema.columns
|
|
1294
|
+
WHERE table_name = %s AND column_name = %s AND table_schema = %s""")
|
|
1295
|
+
self._db.cursor().execute(
|
|
1296
|
+
query_template, (column.table.full_name, column.name, schema)
|
|
1297
|
+
)
|
|
1298
|
+
result_set = self._db.cursor().fetchone()
|
|
1299
|
+
return result_set[0]
|
|
1300
|
+
|
|
1301
|
+
def _fetch_columns(self, table: TableReference) -> list[str]:
|
|
1302
|
+
"""Retrieves all physical columns for a given table from the PG metadata catalogs.
|
|
1303
|
+
|
|
1304
|
+
Parameters
|
|
1305
|
+
----------
|
|
1306
|
+
table : TableReference
|
|
1307
|
+
The table whose columns should be loaded
|
|
1308
|
+
|
|
1309
|
+
Returns
|
|
1310
|
+
-------
|
|
1311
|
+
list[str]
|
|
1312
|
+
The names of all columns
|
|
1313
|
+
|
|
1314
|
+
Raises
|
|
1315
|
+
------
|
|
1316
|
+
VirtualTableError
|
|
1317
|
+
If the table is a virtual table (e.g. subquery or CTE)
|
|
1318
|
+
"""
|
|
1319
|
+
if table.virtual:
|
|
1320
|
+
raise VirtualTableError(table)
|
|
1321
|
+
schema = table.schema or "public"
|
|
1322
|
+
query_template = "SELECT column_name FROM information_schema.columns WHERE table_name = %s AND table_schema = %s"
|
|
1323
|
+
self._db.cursor().execute(query_template, (table.full_name, schema))
|
|
1324
|
+
result_set = self._db.cursor().fetchall()
|
|
1325
|
+
return [col[0] for col in result_set]
|
|
1326
|
+
|
|
1327
|
+
def _fetch_indexes(self, table: TableReference) -> dict[str, bool]:
|
|
1328
|
+
"""Retrieves all index structures for a given table based on the PG metadata catalogs.
|
|
1329
|
+
|
|
1330
|
+
Parameters
|
|
1331
|
+
----------
|
|
1332
|
+
table : TableReference
|
|
1333
|
+
The table whose indexes should be loaded
|
|
1334
|
+
|
|
1335
|
+
Returns
|
|
1336
|
+
-------
|
|
1337
|
+
dict
|
|
1338
|
+
Contains a key for each column that has an index. The column keys map to booleans that indicate whether
|
|
1339
|
+
the corresponding index is a primary key index. Columns without any index do not appear in the dictionary.
|
|
1340
|
+
|
|
1341
|
+
Raises
|
|
1342
|
+
------
|
|
1343
|
+
VirtualTableError
|
|
1344
|
+
If the table is a virtual table (e.g. subquery or CTE)
|
|
1345
|
+
"""
|
|
1346
|
+
if table.virtual:
|
|
1347
|
+
raise VirtualTableError(table)
|
|
1348
|
+
# query adapted from https://wiki.postgresql.org/wiki/Retrieve_primary_key_columns
|
|
1349
|
+
table_name = table.full_name
|
|
1350
|
+
schema = table.schema or "public"
|
|
1351
|
+
index_query = textwrap.dedent("""
|
|
1352
|
+
SELECT attr.attname, idx.indisprimary
|
|
1353
|
+
FROM pg_index idx
|
|
1354
|
+
JOIN pg_attribute attr ON idx.indrelid = attr.attrelid AND attr.attnum = ANY(idx.indkey)
|
|
1355
|
+
JOIN pg_class cls ON idx.indrelid = cls.oid
|
|
1356
|
+
JOIN pg_namespace nsp ON cls.relnamespace = nsp.oid
|
|
1357
|
+
WHERE cls.relname = %s
|
|
1358
|
+
AND nsp.nspname = %s
|
|
1359
|
+
""")
|
|
1360
|
+
self._db.cursor().execute(index_query, (table_name, schema))
|
|
1361
|
+
result_set = self._db.cursor().fetchall()
|
|
1362
|
+
index_map = dict(result_set)
|
|
1363
|
+
return index_map
|
|
1364
|
+
|
|
1365
|
+
def __eq__(self, other: object) -> bool:
|
|
1366
|
+
return isinstance(other, type(self)) and self._db == other._db
|
|
1367
|
+
|
|
1368
|
+
def __hash__(self):
|
|
1369
|
+
return hash(self._db)
|
|
1370
|
+
|
|
1371
|
+
|
|
1372
|
+
# Postgres stores its array datatypes in a more general array-type structure (anyarray).
|
|
1373
|
+
# However, to extract the individual entries from such an array, they need to be cast to a typed array structure.
|
|
1374
|
+
# This dictionary contains the necessary casts for the actual column types.
|
|
1375
|
+
# For example, suppose a column contains integer values. If this column is aggregated into an anyarray entry, the
|
|
1376
|
+
# appropriate converter for this array is int[]. In other words, _DTypeArrayConverters["integer"] = "int[]"
|
|
1377
|
+
_DTypeArrayConverters = {
|
|
1378
|
+
"integer": "int[]",
|
|
1379
|
+
"text": "text[]",
|
|
1380
|
+
"character varying": "text[]",
|
|
1381
|
+
}
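# Illustrative example (hypothetical table and column names): for an integer-typed column, the MCV anyarray
# must be cast via most_common_vals::text::int[], e.g.
#   SELECT UNNEST(most_common_vals::text::int[]) FROM pg_stats WHERE tablename = 't' AND attname = 'id';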
|
|
1382
|
+
|
|
1383
|
+
|
|
1384
|
+
class PostgresStatisticsInterface(DatabaseStatistics):
|
|
1385
|
+
"""Statistics implementation for Postgres systems.
|
|
1386
|
+
|
|
1387
|
+
Parameters
|
|
1388
|
+
----------
|
|
1389
|
+
postgres_db : PostgresInterface
|
|
1390
|
+
The database instance for which the statistics should be retrieved
|
|
1391
|
+
emulated : bool, optional
|
|
1392
|
+
Whether the statistics interface should operate in emulation mode. To enable reproducibility, this is *True*
|
|
1393
|
+
by default
|
|
1394
|
+
enable_emulation_fallback : bool, optional
|
|
1395
|
+
Whether emulation should be used for unsupported statistics when running in native mode, by default True
|
|
1396
|
+
cache_enabled : Optional[bool], optional
|
|
1397
|
+
Whether emulated statistics queries should be subject to caching, by default True. Set to *None* to use the
|
|
1398
|
+
caching behavior of the `db`
|
|
1399
|
+
"""
|
|
1400
|
+
|
|
1401
|
+
def __init__(
|
|
1402
|
+
self,
|
|
1403
|
+
postgres_db: PostgresInterface,
|
|
1404
|
+
*,
|
|
1405
|
+
emulated: bool = True,
|
|
1406
|
+
enable_emulation_fallback: bool = True,
|
|
1407
|
+
cache_enabled: Optional[bool] = True,
|
|
1408
|
+
) -> None:
|
|
1409
|
+
super().__init__(
|
|
1410
|
+
postgres_db,
|
|
1411
|
+
emulated=emulated,
|
|
1412
|
+
enable_emulation_fallback=enable_emulation_fallback,
|
|
1413
|
+
cache_enabled=cache_enabled,
|
|
1414
|
+
)
|
|
1415
|
+
|
|
1416
|
+
def n_pages(self, table: TableReference | str) -> int:
|
|
1417
|
+
query_template = "SELECT relpages FROM pg_class WHERE oid = %s::regclass"
|
|
1418
|
+
regclass = table.full_name if isinstance(table, TableReference) else table
|
|
1419
|
+
self._db.cursor().execute(query_template, (regclass,))
|
|
1420
|
+
result_set = self._db.cursor().fetchone()
|
|
1421
|
+
if not result_set:
|
|
1422
|
+
raise ValueError(f"Could not retrieve page count for table '{table}'")
|
|
1423
|
+
return result_set[0]
|
|
1424
|
+
|
|
1425
|
+
def update_statistics(
|
|
1426
|
+
self,
|
|
1427
|
+
columns: Optional[ColumnReference | Iterable[ColumnReference]] = None,
|
|
1428
|
+
*,
|
|
1429
|
+
tables: Optional[TableReference | Iterable[TableReference]] = None,
|
|
1430
|
+
perfect_mcv: bool = False,
|
|
1431
|
+
perfect_n_distinct: bool = False,
|
|
1432
|
+
verbose: bool = False,
|
|
1433
|
+
) -> None:
|
|
1434
|
+
"""Instructs the Postgres server to update statistics for specific columns.
|
|
1435
|
+
|
|
1436
|
+
Notice that this is one of the methods of the database interface that explicitly mutates the state of the database system.
|
|
1437
|
+
|
|
1438
|
+
Parameters
|
|
1439
|
+
----------
|
|
1440
|
+
columns : Optional[ColumnReference | Iterable[ColumnReference]], optional
|
|
1441
|
+
The columns for which statistics should be updated. If no columns are given, columns are inferred based on the
|
|
1442
|
+
`tables` and all detected columns are used.
|
|
1443
|
+
tables : Optional[TableReference | Iterable[TableReference]], optional
|
|
1444
|
+
The tables for which statistics should be updated. If `columns` are given, this parameter is completely ignored. If
|
|
1445
|
+
no columns and no tables are given, all tables in the current database are used.
|
|
1446
|
+
perfect_mcv : bool, optional
|
|
1447
|
+
Whether the database system should attempt to create perfect statistics. Perfect statistics means that for each of
|
|
1448
|
+
the columns MCV lists are created such that each distinct value is contained within the list. For large and diverse
|
|
1449
|
+
columns, this might require lots of compute time as well as storage space. Notice that the database system still has the
|
|
1450
|
+
ultimate decision on whether to generate MCV lists in the first place. Postgres also imposes a hard limit on the
|
|
1451
|
+
maximum allowed length of MCV lists and histogram widths.
|
|
1452
|
+
perfect_n_distinct : bool, optional
|
|
1453
|
+
Whether to set the number of distinct values to its true value.
|
|
1454
|
+
verbose : bool, optional
|
|
1455
|
+
Whether to print some progress information to standard error.
|
|
1456
|
+
"""
|
|
1457
|
+
if not columns and not tables:
|
|
1458
|
+
tables = [
|
|
1459
|
+
tab
|
|
1460
|
+
for tab in self._db.schema().tables()
|
|
1461
|
+
if not self._db.schema().is_view(tab)
|
|
1462
|
+
]
|
|
1463
|
+
if not columns and tables:
|
|
1464
|
+
tables = util.enlist(tables)
|
|
1465
|
+
columns = util.set_union(self._db.schema().columns(tab) for tab in tables)
|
|
1466
|
+
|
|
1467
|
+
assert columns is not None
|
|
1468
|
+
columns: Iterable[ColumnReference] = util.enlist(columns)
|
|
1469
|
+
columns_map: dict[TableReference, list[str]] = util.dicts.generate_multi(
|
|
1470
|
+
(col.table, col.name) for col in columns
|
|
1471
|
+
)
|
|
1472
|
+
distinct_values: dict[ColumnReference, int] = {}
|
|
1473
|
+
|
|
1474
|
+
if perfect_mcv or perfect_n_distinct:
|
|
1475
|
+
for column in columns:
|
|
1476
|
+
util.logging.print_if(
|
|
1477
|
+
verbose,
|
|
1478
|
+
util.timestamp(),
|
|
1479
|
+
":: Now preparing column",
|
|
1480
|
+
column,
|
|
1481
|
+
use_stderr=True,
|
|
1482
|
+
)
|
|
1483
|
+
n_distinct = round(
|
|
1484
|
+
self.distinct_values(column, emulated=True, cache_enabled=True)
|
|
1485
|
+
)
|
|
1486
|
+
if perfect_n_distinct:
|
|
1487
|
+
distinct_values[column] = n_distinct
|
|
1488
|
+
if not perfect_mcv:
|
|
1489
|
+
continue
|
|
1490
|
+
|
|
1491
|
+
stats_target_query = textwrap.dedent(f"""
|
|
1492
|
+
ALTER TABLE {column.table.full_name}
|
|
1493
|
+
ALTER COLUMN {column.name}
|
|
1494
|
+
SET STATISTICS {n_distinct};
|
|
1495
|
+
""")
|
|
1496
|
+
# This query might issue a warning if the requested stats target is larger than the allowed maximum value
|
|
1497
|
+
# However, Postgres simply uses the maximum value in this case. To permit different maximum values in different
|
|
1498
|
+
# Postgres versions, we accept the warning and do not use a hard-coded maximum value with snapping logic
|
|
1499
|
+
# ourselves.
|
|
1500
|
+
self._db.cursor().execute(stats_target_query)
|
|
1501
|
+
|
|
1502
|
+
columns_str = {
|
|
1503
|
+
table: ", ".join(col for col in columns)
|
|
1504
|
+
for table, columns in columns_map.items()
|
|
1505
|
+
}
|
|
1506
|
+
tables_and_columns = ", ".join(
|
|
1507
|
+
f"{table.full_name}({cols})" for table, cols in columns_str.items()
|
|
1508
|
+
)
|
|
1509
|
+
|
|
1510
|
+
util.logging.print_if(
|
|
1511
|
+
verbose,
|
|
1512
|
+
util.timestamp(),
|
|
1513
|
+
":: Now analyzing columns",
|
|
1514
|
+
tables_and_columns,
|
|
1515
|
+
use_stderr=True,
|
|
1516
|
+
)
|
|
1517
|
+
query_template = f"ANALYZE {tables_and_columns}"
|
|
1518
|
+
self._db.cursor().execute(query_template)
|
|
1519
|
+
|
|
1520
|
+
for column, n_distinct in distinct_values.items():
|
|
1521
|
+
distinct_update_query = textwrap.dedent(f"""
|
|
1522
|
+
ALTER TABLE {column.table.full_name}
|
|
1523
|
+
ALTER COLUMN {column.name}
|
|
1524
|
+
SET (n_distinct = {n_distinct});
|
|
1525
|
+
""")
|
|
1526
|
+
self._db.cursor().execute(distinct_update_query)
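# Illustrative trace (hypothetical table and column names): for a column title.production_year with
# 150 distinct values and both perfect_mcv and perfect_n_distinct enabled, the statements issued above
# amount to
#   ALTER TABLE title ALTER COLUMN production_year SET STATISTICS 150;
#   ANALYZE title(production_year);
#   ALTER TABLE title ALTER COLUMN production_year SET (n_distinct = 150);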
|
|
1527
|
+
|
|
1528
|
+
def _retrieve_total_rows_from_stats(self, table: TableReference) -> Optional[int]:
|
|
1529
|
+
count_query = (
|
|
1530
|
+
f"SELECT reltuples FROM pg_class WHERE oid = '{table.full_name}'::regclass"
|
|
1531
|
+
)
|
|
1532
|
+
self._db.cursor().execute(count_query)
|
|
1533
|
+
result_set = self._db.cursor().fetchone()
|
|
1534
|
+
if not result_set:
|
|
1535
|
+
return None
|
|
1536
|
+
count = result_set[0]
|
|
1537
|
+
return count
|
|
1538
|
+
|
|
1539
|
+
def _retrieve_distinct_values_from_stats(
|
|
1540
|
+
self, column: ColumnReference
|
|
1541
|
+
) -> Optional[int]:
|
|
1542
|
+
dist_query = (
|
|
1543
|
+
"SELECT n_distinct FROM pg_stats WHERE tablename = %s and attname = %s"
|
|
1544
|
+
)
|
|
1545
|
+
self._db.cursor().execute(dist_query, (column.table.full_name, column.name))
|
|
1546
|
+
result_set = self._db.cursor().fetchone()
|
|
1547
|
+
if not result_set:
|
|
1548
|
+
return None
|
|
1549
|
+
dist_values = result_set[0]
|
|
1550
|
+
|
|
1551
|
+
# interpreting the n_distinct column is difficult, since different value ranges indicate different things
|
|
1552
|
+
# (see https://www.postgresql.org/docs/current/view-pg-stats.html)
|
|
1553
|
+
# If the value is >= 0, it represents the actual (approximated) number of distinct non-null values in the
|
|
1554
|
+
# column.
|
|
1555
|
+
# If the value is < 0, it represents 'the negative of the number of distinct values divided by the number of
|
|
1556
|
+
# rows'. Therefore, we have to correct the number of distinct values manually in this case.
|
|
1557
|
+
if dist_values >= 0:
|
|
1558
|
+
return dist_values
|
|
1559
|
+
|
|
1560
|
+
# correct negative values
|
|
1561
|
+
n_rows = self._retrieve_total_rows_from_stats(column.table)
|
|
1562
|
+
return -1 * n_rows * dist_values
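# Worked example (hypothetical numbers): n_distinct = -0.5 with reltuples = 10000 means that roughly
# half of the rows are distinct, so the corrected estimate is -1 * 10000 * -0.5 = 5000 distinct values.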
|
|
1563
|
+
|
|
1564
|
+
def _retrieve_min_max_values_from_stats(
|
|
1565
|
+
self, column: ColumnReference
|
|
1566
|
+
) -> Optional[tuple[Any, Any]]:
|
|
1567
|
+
# Postgres does not keep track of min/max values, so we need to determine them manually
|
|
1568
|
+
if not self.enable_emulation_fallback:
|
|
1569
|
+
raise UnsupportedDatabaseFeatureError(self._db, "min/max value statistics")
|
|
1570
|
+
return self._calculate_min_max_values(column, cache_enabled=True)
|
|
1571
|
+
|
|
1572
|
+
def _retrieve_most_common_values_from_stats(
|
|
1573
|
+
self, column: ColumnReference, k: int
|
|
1574
|
+
) -> Sequence[tuple[Any, int]]:
|
|
1575
|
+
# Postgres stores the Most common values in a column of type anyarray (since in this column, many MCVs from
|
|
1576
|
+
# many different tables and data types are present). However, this type is not very convenient to work on.
|
|
1577
|
+
# Therefore, we first need to convert the anyarray to an array of the actual attribute type.
|
|
1578
|
+
|
|
1579
|
+
# determine the attribute's data type to figure out how it should be converted
|
|
1580
|
+
attribute_query = "SELECT data_type FROM information_schema.columns WHERE table_name = %s AND column_name = %s"
|
|
1581
|
+
self._db.cursor().execute(
|
|
1582
|
+
attribute_query, (column.table.full_name, column.name)
|
|
1583
|
+
)
|
|
1584
|
+
attribute_dtype = self._db.cursor().fetchone()[0]
|
|
1585
|
+
attribute_converter = _DTypeArrayConverters[attribute_dtype]
|
|
1586
|
+
|
|
1587
|
+
# now, load the most frequent values. Since the frequencies are expressed as a fraction of the total number of
|
|
1588
|
+
# rows, we need to multiply this number again to obtain the true number of occurrences
|
|
1589
|
+
mcv_query = textwrap.dedent(
|
|
1590
|
+
"""
|
|
1591
|
+
SELECT UNNEST(most_common_vals::text::{conv}),
|
|
1592
|
+
UNNEST(most_common_freqs) * (SELECT reltuples FROM pg_class WHERE oid = '{tab}'::regclass)
|
|
1593
|
+
FROM pg_stats
|
|
1594
|
+
WHERE tablename = %s AND attname = %s""".format(
|
|
1595
|
+
conv=attribute_converter, tab=column.table.full_name
|
|
1596
|
+
)
|
|
1597
|
+
)
|
|
1598
|
+
self._db.cursor().execute(mcv_query, (column.table.full_name, column.name))
|
|
1599
|
+
return self._db.cursor().fetchall()[:k]
|
|
1600
|
+
|
|
1601
|
+
|
|
1602
|
+
PostgresOptimizerSettings = {
|
|
1603
|
+
JoinOperator.NestedLoopJoin: "enable_nestloop",
|
|
1604
|
+
JoinOperator.HashJoin: "enable_hashjoin",
|
|
1605
|
+
JoinOperator.SortMergeJoin: "enable_mergejoin",
|
|
1606
|
+
ScanOperator.SequentialScan: "enable_seqscan",
|
|
1607
|
+
ScanOperator.IndexScan: "enable_indexscan",
|
|
1608
|
+
ScanOperator.IndexOnlyScan: "enable_indexonlyscan",
|
|
1609
|
+
ScanOperator.BitmapScan: "enable_bitmapscan",
|
|
1610
|
+
IntermediateOperator.Memoize: "enable_memoize",
|
|
1611
|
+
IntermediateOperator.Materialize: "enable_material",
|
|
1612
|
+
IntermediateOperator.Sort: "enable_sort",
|
|
1613
|
+
}
|
|
1614
|
+
"""All (session-global) optimizer settings that modify the allowed physical operators."""
|
|
1615
|
+
|
|
1616
|
+
PGHintPlanOptimizerHints: dict[PhysicalOperator, str] = {
|
|
1617
|
+
JoinOperator.NestedLoopJoin: "NestLoop",
|
|
1618
|
+
JoinOperator.HashJoin: "HashJoin",
|
|
1619
|
+
JoinOperator.SortMergeJoin: "MergeJoin",
|
|
1620
|
+
ScanOperator.SequentialScan: "SeqScan",
|
|
1621
|
+
ScanOperator.IndexScan: "IndexOnlyScan",
|
|
1622
|
+
ScanOperator.IndexOnlyScan: "IndexOnlyScan",
|
|
1623
|
+
ScanOperator.BitmapScan: "BitmapScan",
|
|
1624
|
+
IntermediateOperator.Memoize: "Memoize",
|
|
1625
|
+
}
|
|
1626
|
+
"""All physical operators that can be enforced by pg_hint_plan.
|
|
1627
|
+
|
|
1628
|
+
These settings operate on a per-relation basis and overwrite the session-global optimizer settings.
|
|
1629
|
+
|
|
1630
|
+
References
|
|
1631
|
+
----------
|
|
1632
|
+
|
|
1633
|
+
.. pg_hint_plan hints: https://github.com/ossc-db/pg_hint_plan/blob/master/docs/hint_list.md
|
|
1634
|
+
"""
|
|
1635
|
+
|
|
1636
|
+
PGLabOptimizerHints: dict[PhysicalOperator, str] = {
|
|
1637
|
+
JoinOperator.NestedLoopJoin: "NestLoop",
|
|
1638
|
+
JoinOperator.HashJoin: "HashJoin",
|
|
1639
|
+
JoinOperator.SortMergeJoin: "MergeJoin",
|
|
1640
|
+
ScanOperator.SequentialScan: "SeqScan",
|
|
1641
|
+
ScanOperator.IndexScan: "IdxScan",
|
|
1642
|
+
ScanOperator.IndexOnlyScan: "IdxScan",
|
|
1643
|
+
ScanOperator.BitmapScan: "BitmapScan",
|
|
1644
|
+
IntermediateOperator.Materialize: "Material",
|
|
1645
|
+
IntermediateOperator.Memoize: "Memo",
|
|
1646
|
+
}
|
|
1647
|
+
"""All physical operators that can be enforced by pg_lab.
|
|
1648
|
+
|
|
1649
|
+
These settings operate on a per-relation basis and overwrite the session-global optimizer settings.
|
|
1650
|
+
|
|
1651
|
+
References
|
|
1652
|
+
----------
|
|
1653
|
+
|
|
1654
|
+
.. pg_lab extension: https://github.com/rbergm/pg_lab/blob/main/docs/hinting.md
|
|
1655
|
+
|
|
1656
|
+
"""
|
|
1657
|
+
|
|
1658
|
+
|
|
1659
|
+
PostgresJoinHints = {
|
|
1660
|
+
JoinOperator.NestedLoopJoin,
|
|
1661
|
+
JoinOperator.HashJoin,
|
|
1662
|
+
JoinOperator.SortMergeJoin,
|
|
1663
|
+
}
|
|
1664
|
+
"""All join operators that are supported by Postgres."""
|
|
1665
|
+
|
|
1666
|
+
PostgresScanHints = {
|
|
1667
|
+
ScanOperator.SequentialScan,
|
|
1668
|
+
ScanOperator.IndexScan,
|
|
1669
|
+
ScanOperator.IndexOnlyScan,
|
|
1670
|
+
ScanOperator.BitmapScan,
|
|
1671
|
+
}
|
|
1672
|
+
"""All scan operators that are supported by Postgres."""
|
|
1673
|
+
|
|
1674
|
+
PostgresPlanHints = {
|
|
1675
|
+
HintType.Cardinality,
|
|
1676
|
+
HintType.Parallelization,
|
|
1677
|
+
HintType.LinearJoinOrder,
|
|
1678
|
+
HintType.BushyJoinOrder,
|
|
1679
|
+
HintType.JoinDirection,
|
|
1680
|
+
HintType.Operator,
|
|
1681
|
+
}
|
|
1682
|
+
"""All non-operator hints supported by Postgres, that can be used to enforce additional optimizer behaviour."""
|
|
1683
|
+
|
|
1684
|
+
|
|
1685
|
+
class PostgresExplainClause(Explain):
|
|
1686
|
+
"""A specialized *EXPLAIN* clause implementation to handle Postgres custom syntax for query plans.
|
|
1687
|
+
|
|
1688
|
+
If *ANALYZE* is enabled, this also retrieves information about shared buffer usage (page hits and disk reads).
|
|
1689
|
+
|
|
1690
|
+
Parameters
|
|
1691
|
+
----------
|
|
1692
|
+
original_clause : Explain
|
|
1693
|
+
The actual *EXPLAIN* clause. The new explain clause acts as a decorator around the original clause.
|
|
1694
|
+
"""
|
|
1695
|
+
|
|
1696
|
+
def __init__(self, original_clause: Explain) -> None:
|
|
1697
|
+
super().__init__(original_clause.analyze, original_clause.target_format)
|
|
1698
|
+
|
|
1699
|
+
def __str__(self) -> str:
|
|
1700
|
+
explain_args = "(SETTINGS, "
|
|
1701
|
+
if self.analyze:
|
|
1702
|
+
explain_args += "ANALYZE, BUFFERS, "
|
|
1703
|
+
explain_args += f"FORMAT {self.target_format})"
|
|
1704
|
+
return f"EXPLAIN {explain_args}"
|
|
1705
|
+
|
|
1706
|
+
|
|
1707
|
+
class PostgresLimitClause(Limit):
|
|
1708
|
+
"""A specialized *LIMIT* clause implementation to handle Postgres custom syntax for limits / offsets
|
|
1709
|
+
|
|
1710
|
+
Parameters
|
|
1711
|
+
----------
|
|
1712
|
+
original_clause : Limit
|
|
1713
|
+
The actual *LIMIT* clause. The new limit clause acts as a decorator around the original clause.
|
|
1714
|
+
"""
|
|
1715
|
+
|
|
1716
|
+
def __init__(self, original_clause: Limit) -> None:
|
|
1717
|
+
super().__init__(
|
|
1718
|
+
limit=original_clause.limit,
|
|
1719
|
+
offset=original_clause.offset,
|
|
1720
|
+
fetch_direction=original_clause.fetch_direction,
|
|
1721
|
+
)
|
|
1722
|
+
|
|
1723
|
+
def __str__(self) -> str:
|
|
1724
|
+
if self.fetch_direction != "first":
|
|
1725
|
+
return super().__str__()
|
|
1726
|
+
|
|
1727
|
+
if self.limit and self.offset:
|
|
1728
|
+
return f"LIMIT {self.limit} OFFSET {self.offset}"
|
|
1729
|
+
elif self.limit:
|
|
1730
|
+
return f"LIMIT {self.limit}"
|
|
1731
|
+
elif self.offset:
|
|
1732
|
+
return f"OFFSET {self.offset}"
|
|
1733
|
+
else:
|
|
1734
|
+
return ""
|
|
1735
|
+
|
|
1736
|
+
|
|
1737
|
+
def _replace_postgres_cast_expressions(expression: SqlExpression) -> SqlExpression:
|
|
1738
|
+
"""Wraps a given expression by a `_PostgresCastExpression` if necessary.
|
|
1739
|
+
|
|
1740
|
+
This is the replacement method required by the `replace_expressions` transformation. It wraps all `CastExpression`
|
|
1741
|
+
instances by a `_PostgresCastExpression` and leaves all other expressions intact.
|
|
1742
|
+
|
|
1743
|
+
Parameters
|
|
1744
|
+
----------
|
|
1745
|
+
expression : SqlExpression
|
|
1746
|
+
The expression to check
|
|
1747
|
+
|
|
1748
|
+
Returns
|
|
1749
|
+
-------
|
|
1750
|
+
SqlExpression
|
|
1751
|
+
A potentially wrapped version of the original expression
|
|
1752
|
+
|
|
1753
|
+
See Also
|
|
1754
|
+
--------
|
|
1755
|
+
transform.replace_expressions
|
|
1756
|
+
"""
|
|
1757
|
+
target = type(expression)
|
|
1758
|
+
match expression:
|
|
1759
|
+
case StaticValueExpression() | ColumnExpression() | StarExpression():
|
|
1760
|
+
return expression
|
|
1761
|
+
case SubqueryExpression(query):
|
|
1762
|
+
replaced_subquery = transform.replace_expressions(
|
|
1763
|
+
query, _replace_postgres_cast_expressions
|
|
1764
|
+
)
|
|
1765
|
+
return target(replaced_subquery)
|
|
1766
|
+
case CaseExpression(cases, else_expr):
|
|
1767
|
+
replaced_cases: list[tuple[AbstractPredicate, SqlExpression]] = []
|
|
1768
|
+
for condition, result in cases:
|
|
1769
|
+
replaced_condition = _replace_postgres_cast_expressions(condition)
|
|
1770
|
+
replaced_result = _replace_postgres_cast_expressions(result)
|
|
1771
|
+
replaced_cases.append((replaced_condition, replaced_result))
|
|
1772
|
+
replaced_else = (
|
|
1773
|
+
_replace_postgres_cast_expressions(else_expr) if else_expr else None
|
|
1774
|
+
)
|
|
1775
|
+
return target(replaced_cases, else_expr=replaced_else)
|
|
1776
|
+
case CastExpression(cast, typ, params):
|
|
1777
|
+
replaced_cast = _replace_postgres_cast_expressions(cast)
|
|
1778
|
+
# return _PostgresCastExpression(replaced_cast, typ, type_params=params)
|
|
1779
|
+
return CastExpression(replaced_cast, typ, params)
|
|
1780
|
+
case MathExpression(op, lhs, rhs):
|
|
1781
|
+
replaced_lhs = _replace_postgres_cast_expressions(lhs)
|
|
1782
|
+
rhs = util.enlist(rhs) if rhs else []
|
|
1783
|
+
replaced_rhs = [_replace_postgres_cast_expressions(expr) for expr in rhs]
|
|
1784
|
+
return target(op, replaced_lhs, replaced_rhs)
|
|
1785
|
+
case ArrayAccessExpression(array, ind, lo, hi):
|
|
1786
|
+
replaced_arr = _replace_postgres_cast_expressions(array)
|
|
1787
|
+
replaced_ind = (
|
|
1788
|
+
_replace_postgres_cast_expressions(ind) if ind is not None else None
|
|
1789
|
+
)
|
|
1790
|
+
replaced_lo = (
|
|
1791
|
+
_replace_postgres_cast_expressions(lo) if lo is not None else None
|
|
1792
|
+
)
|
|
1793
|
+
replaced_hi = (
|
|
1794
|
+
_replace_postgres_cast_expressions(hi) if hi is not None else None
|
|
1795
|
+
)
|
|
1796
|
+
return target(
|
|
1797
|
+
replaced_arr,
|
|
1798
|
+
idx=replaced_ind,
|
|
1799
|
+
lower_idx=replaced_lo,
|
|
1800
|
+
upper_idx=replaced_hi,
|
|
1801
|
+
)
|
|
1802
|
+
case FunctionExpression(fn, args, distinct, cond):
|
|
1803
|
+
replaced_args = [_replace_postgres_cast_expressions(arg) for arg in args]
|
|
1804
|
+
replaced_cond = _replace_postgres_cast_expressions(cond) if cond else None
|
|
1805
|
+
return FunctionExpression(
|
|
1806
|
+
fn, replaced_args, distinct=distinct, filter_where=replaced_cond
|
|
1807
|
+
)
|
|
1808
|
+
case WindowExpression(fn, parts, ordering, cond):
|
|
1809
|
+
replaced_fn = _replace_postgres_cast_expressions(fn)
|
|
1810
|
+
replaced_parts = [
|
|
1811
|
+
_replace_postgres_cast_expressions(part) for part in parts
|
|
1812
|
+
]
|
|
1813
|
+
replaced_cond = _replace_postgres_cast_expressions(cond) if cond else None
|
|
1814
|
+
|
|
1815
|
+
replaced_order_exprs: list[OrderByExpression] = []
|
|
1816
|
+
for order in ordering or []:
|
|
1817
|
+
replaced_expr = _replace_postgres_cast_expressions(order.column)
|
|
1818
|
+
replaced_order_exprs.append(
|
|
1819
|
+
OrderByExpression(replaced_expr, order.ascending, order.nulls_first)
|
|
1820
|
+
)
|
|
1821
|
+
replaced_ordering = (
|
|
1822
|
+
OrderBy(replaced_order_exprs) if replaced_order_exprs else None
|
|
1823
|
+
)
|
|
1824
|
+
|
|
1825
|
+
return target(
|
|
1826
|
+
replaced_fn,
|
|
1827
|
+
partitioning=replaced_parts,
|
|
1828
|
+
ordering=replaced_ordering,
|
|
1829
|
+
filter_condition=replaced_cond,
|
|
1830
|
+
)
|
|
1831
|
+
case BinaryPredicate(op, lhs, rhs):
|
|
1832
|
+
replaced_lhs = _replace_postgres_cast_expressions(lhs)
|
|
1833
|
+
replaced_rhs = _replace_postgres_cast_expressions(rhs)
|
|
1834
|
+
return target(op, replaced_lhs, replaced_rhs)
|
|
1835
|
+
case BetweenPredicate(col, lo, hi):
|
|
1836
|
+
replaced_col = _replace_postgres_cast_expressions(col)
|
|
1837
|
+
replaced_lo = _replace_postgres_cast_expressions(lo)
|
|
1838
|
+
replaced_hi = _replace_postgres_cast_expressions(hi)
|
|
1839
|
+
return BetweenPredicate(replaced_col, (replaced_lo, replaced_hi))
|
|
1840
|
+
case InPredicate(col, vals):
|
|
1841
|
+
replaced_col = _replace_postgres_cast_expressions(col)
|
|
1842
|
+
replaced_vals = [_replace_postgres_cast_expressions(val) for val in vals]
|
|
1843
|
+
return target(replaced_col, replaced_vals)
|
|
1844
|
+
case UnaryPredicate(col, op):
|
|
1845
|
+
replaced_col = _replace_postgres_cast_expressions(col)
|
|
1846
|
+
return target(replaced_col, op)
|
|
1847
|
+
case CompoundPredicate(op, children) if op in {
|
|
1848
|
+
CompoundOperator.And,
|
|
1849
|
+
CompoundOperator.Or,
|
|
1850
|
+
}:
|
|
1851
|
+
replaced_children = [
|
|
1852
|
+
_replace_postgres_cast_expressions(child) for child in children
|
|
1853
|
+
]
|
|
1854
|
+
return target(op, replaced_children)
|
|
1855
|
+
case CompoundPredicate(op, child) if op == CompoundOperator.Not:
|
|
1856
|
+
replaced_child = _replace_postgres_cast_expressions(child)
|
|
1857
|
+
return target(op, replaced_child)
|
|
1858
|
+
case _:
|
|
1859
|
+
raise ValueError(
|
|
1860
|
+
f"Unsupported expression type {type(expression)}: {expression}"
|
|
1861
|
+
)
|
|
1862
|
+
|
|
1863
|
+
|
|
1864
|
+
PostgresHintingBackend = Literal["pg_hint_plan", "pg_lab", "none"]
|
|
1865
|
+
"""The hinting backend being used.
|
|
1866
|
+
|
|
1867
|
+
If pg_lab is available, this is the preferred extension. Otherwise, pg_hint_plan is used as a fallback.
|
|
1868
|
+
If the hint service is inactive, the backend is set to _none_.
|
|
1869
|
+
"""
|
|
1870
|
+
|
|
1871
|
+
|
|
1872
|
+
def _walk_join_order(node: JoinTree) -> str:
|
|
1873
|
+
if node.is_scan():
|
|
1874
|
+
return node.base_table.identifier()
|
|
1875
|
+
|
|
1876
|
+
outer = _walk_join_order(node.outer_child)
|
|
1877
|
+
inner = _walk_join_order(node.inner_child)
|
|
1878
|
+
return f"({outer} {inner})"
|
|
1879
|
+
|
|
1880
|
+
|
|
1881
|
+
def _generate_pghintplan_hints(
|
|
1882
|
+
query: SqlQuery,
|
|
1883
|
+
join_order: Optional[JoinTree],
|
|
1884
|
+
phys_ops: Optional[PhysicalOperatorAssignment],
|
|
1885
|
+
plan_params: Optional[PlanParameterization],
|
|
1886
|
+
*,
|
|
1887
|
+
pg_instance: PostgresInterface,
|
|
1888
|
+
) -> Hint:
|
|
1889
|
+
hints: list[str] = []
|
|
1890
|
+
prep_statements: list[str] = []
|
|
1891
|
+
used_parallel: bool = False
|
|
1892
|
+
|
|
1893
|
+
geqo_thresh: str = pg_instance.config["geqo_threshold"]
|
|
1894
|
+
if len(query.tables()) > int(geqo_thresh):
|
|
1895
|
+
warnings.warn(
|
|
1896
|
+
"Temporarily disabling GEQO. pg_hint_plan only works with the DP optimizer.",
|
|
1897
|
+
category=HintWarning,
|
|
1898
|
+
)
|
|
1899
|
+
hints.append("Set(geqo off)")
|
|
1900
|
+
|
|
1901
|
+
if join_order and len(join_order) > 1:
|
|
1902
|
+
join_str = _walk_join_order(join_order)
|
|
1903
|
+
hints.append(f"Leading({join_str})")
|
|
1904
|
+
|
|
1905
|
+
if phys_ops:
|
|
1906
|
+
for scan in phys_ops.scan_operators.values():
|
|
1907
|
+
op = PGHintPlanOptimizerHints[scan.operator]
|
|
1908
|
+
tab = scan.table.identifier()
|
|
1909
|
+
hints.append(f"{op}({tab})")
|
|
1910
|
+
if scan.parallel_workers > 1 and not used_parallel:
|
|
1911
|
+
hints.append(f"Parallel({tab} {scan.parallel_workers} hard)")
|
|
1912
|
+
used_parallel = True
|
|
1913
|
+
elif used_parallel:
|
|
1914
|
+
warnings.warn(
|
|
1915
|
+
"Cannot set multiple parallel hints for Postgres. Ignoring additional hints.",
|
|
1916
|
+
category=HintWarning,
|
|
1917
|
+
)
|
|
1918
|
+
|
|
1919
|
+
for join in phys_ops.join_operators.values():
|
|
1920
|
+
op = PGHintPlanOptimizerHints[join.operator]
|
|
1921
|
+
intermediate = " ".join(tab.identifier() for tab in join.intermediate)
|
|
1922
|
+
hints.append(f"{op}({intermediate})")
|
|
1923
|
+
if join.parallel_workers > 1 and not used_parallel:
|
|
1924
|
+
warnings.warn(
|
|
1925
|
+
"Cannot directly set parallel workers on a join with pg_hint_plan. "
|
|
1926
|
+
"Setting on all base tables instead.",
|
|
1927
|
+
category=HintWarning,
|
|
1928
|
+
)
|
|
1929
|
+
for tab in join.intermediate:
|
|
1930
|
+
hints.append(
|
|
1931
|
+
f"Parallel({tab.identifier()} {join.parallel_workers} hard)"
|
|
1932
|
+
)
|
|
1933
|
+
elif used_parallel:
|
|
1934
|
+
warnings.warn(
|
|
1935
|
+
"Cannot set multiple parallel hints for Postgres. Ignoring additional hints.",
|
|
1936
|
+
category=HintWarning,
|
|
1937
|
+
)
|
|
1938
|
+
|
|
1939
|
+
for tabs, intermediate_op in phys_ops.intermediate_operators.items():
|
|
1940
|
+
op = PGHintPlanOptimizerHints.get(intermediate_op)
|
|
1941
|
+
if not op:
|
|
1942
|
+
warnings.warn(
|
|
1943
|
+
f"Cannot enforce operator {intermediate_op} with pg_hint_plan. Ignoring this hint",
|
|
1944
|
+
category=HintWarning,
|
|
1945
|
+
)
|
|
1946
|
+
continue
|
|
1947
|
+
intermediate = " ".join(tab.identifier() for tab in tabs)
|
|
1948
|
+
hints.append(f"{op}({intermediate})")
|
|
1949
|
+
|
|
1950
|
+
for op, val in phys_ops.global_settings.items():
|
|
1951
|
+
setting = PostgresOptimizerSettings[op]
|
|
1952
|
+
hints.append(f"Set({setting} {val})")
|
|
1953
|
+
|
|
1954
|
+
if plan_params:
|
|
1955
|
+
for tabs, card in plan_params.cardinalities.items():
|
|
1956
|
+
if card.isnan():
|
|
1957
|
+
continue
|
|
1958
|
+
|
|
1959
|
+
intermediate = " ".join(tab.identifier() for tab in tabs)
|
|
1960
|
+
if card.isinf():
|
|
1961
|
+
warnings.warn(
|
|
1962
|
+
f"Ignoring infinite cardinality for intermediate {intermediate}",
|
|
1963
|
+
category=HintWarning,
|
|
1964
|
+
)
|
|
1965
|
+
continue
|
|
1966
|
+
|
|
1967
|
+
hints.append(f"Rows({intermediate} #{card.value})")
|
|
1968
|
+
|
|
1969
|
+
for tabs, workers in plan_params.parallel_workers.items():
|
|
1970
|
+
if workers == 1:
|
|
1971
|
+
continue
|
|
1972
|
+
elif used_parallel:
|
|
1973
|
+
warnings.warn(
|
|
1974
|
+
"Cannot set multiple parallel hints for Postgres. Ignoring additional hints.",
|
|
1975
|
+
category=HintWarning,
|
|
1976
|
+
)
|
|
1977
|
+
continue
|
|
1978
|
+
|
|
1979
|
+
intermediate = " ".join(tab.identifier() for tab in tabs)
|
|
1980
|
+
hints.append(f"Parallel({intermediate} {workers} hard)")
|
|
1981
|
+
used_parallel = True
|
|
1982
|
+
|
|
1983
|
+
for setting, val in plan_params.system_settings.items():
|
|
1984
|
+
# TODO: we could be smart here and differentiate between settings that only affect the optimizer and settings
|
|
1985
|
+
# that also affect the execution engine. The former can be set in pg_hint_plan via Set(...), while the latter
|
|
1986
|
+
# must be set via a preparatory SET statement. We should avoid this second case if at all possible since it
|
|
1987
|
+
# affects the entire session and not just the current query.
|
|
1988
|
+
# For now, we mitigate this issue in a different way: we emit SET LOCAL statements which only modify the
|
|
1989
|
+
# current transaction. Since the Postgres interface runs in autocommit mode, each query is executed within
|
|
1990
|
+
# its own transaction. Therefore, all changes are reverted immediately after the query has finished.
|
|
1991
|
+
prep_statements.append(f"SET LOCAL {setting} TO '{val}';")
|
|
1992
|
+
|
|
1993
|
+
if plan_params.execution_mode is not None:
|
|
1994
|
+
warnings.warn(
|
|
1995
|
+
"pg_hint_plan does not support execution mode hints",
|
|
1996
|
+
category=HintWarning,
|
|
1997
|
+
)
|
|
1998
|
+
|
|
1999
|
+
hints = [f" {line}" for line in hints]
|
|
2000
|
+
hints.insert(0, "/*+")
|
|
2001
|
+
hints.append(" */")
|
|
2002
|
+
|
|
2003
|
+
return Hint("\n".join(prep_statements), "\n".join(hints))
|
|
2004
|
+
|
|
2005
|
+
|
|
2006
|
+
def _generate_pglab_hints(
|
|
2007
|
+
join_order: Optional[JoinTree],
|
|
2008
|
+
phys_ops: Optional[PhysicalOperatorAssignment],
|
|
2009
|
+
plan_params: Optional[PlanParameterization],
|
|
2010
|
+
) -> Hint:
|
|
2011
|
+
hints: list[str] = []
|
|
2012
|
+
prep_statements: list[str] = []
|
|
2013
|
+
|
|
2014
|
+
has_worker_params = plan_params and plan_params.parallel_workers
|
|
2015
|
+
used_parallel = False
|
|
2016
|
+
|
|
2017
|
+
if has_worker_params and not phys_ops:
|
|
2018
|
+
warnings.warn(
|
|
2019
|
+
"pg_lab can only force parallel execution of nodes with known operators. Ignoring worker hints.",
|
|
2020
|
+
category=HintWarning,
|
|
2021
|
+
)
|
|
2022
|
+
elif has_worker_params:
|
|
2023
|
+
has_dangling_worker_hints = any(
|
|
2024
|
+
intermediate not in phys_ops
|
|
2025
|
+
for intermediate in plan_params.parallel_workers
|
|
2026
|
+
)
|
|
2027
|
+
if has_dangling_worker_hints:
|
|
2028
|
+
warnings.warn(
|
|
2029
|
+
"pg_lab can only force parallel execution of nodes with known operators. Ignoring additional hints.",
|
|
2030
|
+
category=HintWarning,
|
|
2031
|
+
)
|
|
2032
|
+
phys_ops = phys_ops.integrate_workers_from(plan_params)
|
|
2033
|
+
|
|
2034
|
+
hints.append("Config(plan_mode=anchored)")
|
|
2035
|
+
|
|
2036
|
+
if join_order and len(join_order) > 1:
|
|
2037
|
+
join_str = _walk_join_order(join_order)
|
|
2038
|
+
hints.append(f"JoinOrder({join_str})")
|
|
2039
|
+
|
|
2040
|
+
if phys_ops:
|
|
2041
|
+
for scan in phys_ops.scan_operators.values():
|
|
2042
|
+
op = PGLabOptimizerHints[scan.operator]
|
|
2043
|
+
table = scan.table.identifier()
|
|
2044
|
+
|
|
2045
|
+
if scan.parallel_workers > 1 and not used_parallel:
|
|
2046
|
+
# TODO: check for off-by-one errors!!!
|
|
2047
|
+
hint = f"{op}({table} (workers={scan.parallel_workers}))"
|
|
2048
|
+
used_parallel = True
|
|
2049
|
+
elif scan.parallel_workers > 1 and used_parallel:
|
|
2050
|
+
warnings.warn(
|
|
2051
|
+
"Cannot set multiple parallel hints for Postgres. Ignoring additional hints.",
|
|
2052
|
+
category=HintWarning,
|
|
2053
|
+
)
|
|
2054
|
+
else:
|
|
2055
|
+
hint = f"{op}({table})"
|
|
2056
|
+
hints.append(hint)
|
|
2057
|
+
|
|
2058
|
+
for join in phys_ops.join_operators.values():
|
|
2059
|
+
op = PGLabOptimizerHints[join.operator]
|
|
2060
|
+
intermediate = " ".join(tab.identifier() for tab in join.intermediate)
|
|
2061
|
+
|
|
2062
|
+
if join.parallel_workers > 1 and not used_parallel:
|
|
2063
|
+
hint = f"{op}({intermediate} (workers={join.parallel_workers}))"
|
|
2064
|
+
used_parallel = True
|
|
2065
|
+
elif join.parallel_workers > 1 and used_parallel:
|
|
2066
|
+
warnings.warn(
|
|
2067
|
+
"Cannot set multiple parallel hints for Postgres. Ignoring additional hints.",
|
|
2068
|
+
category=HintWarning,
|
|
2069
|
+
)
|
|
2070
|
+
else:
|
|
2071
|
+
hint = f"{op}({intermediate})"
|
|
2072
|
+
hints.append(hint)
|
|
2073
|
+
|
|
2074
|
+
for tabs, intermediate_op in phys_ops.intermediate_operators.items():
|
|
2075
|
+
op = PGLabOptimizerHints[intermediate_op]
|
|
2076
|
+
intermediate = " ".join(tab.identifier() for tab in tabs)
|
|
2077
|
+
hints.append(f"{op}({intermediate})")
|
|
2078
|
+
|
|
2079
|
+
for op, enabled in phys_ops.global_settings.items():
|
|
2080
|
+
setting = PostgresOptimizerSettings[op]
|
|
2081
|
+
value = "on" if enabled else "off"
|
|
2082
|
+
hints.append(f"Set({setting} = '{value}')")
|
|
2083
|
+
|
|
2084
|
+
if plan_params:
|
|
2085
|
+
for tabs, card in plan_params.cardinalities.items():
|
|
2086
|
+
if card.isnan():
|
|
2087
|
+
continue
|
|
2088
|
+
|
|
2089
|
+
intermediate = " ".join(tab.identifier() for tab in tabs)
|
|
2090
|
+
if card.isinf():
|
|
2091
|
+
warnings.warn(
|
|
2092
|
+
f"Ignoring infinite cardinality for intermediate {intermediate}",
|
|
2093
|
+
category=HintWarning,
|
|
2094
|
+
)
|
|
2095
|
+
continue
|
|
2096
|
+
|
|
2097
|
+
hints.append(f"Card({intermediate} #{card})")
|
|
2098
|
+
|
|
2099
|
+
for setting, val in plan_params.system_settings.items():
|
|
2100
|
+
hints.append(f"Set({setting} = '{val}')")
|
|
2101
|
+
|
|
2102
|
+
if plan_params.execution_mode is not None:
|
|
2103
|
+
mode = (
|
|
2104
|
+
"sequential"
|
|
2105
|
+
if plan_params.execution_mode == "sequential"
|
|
2106
|
+
else "parallel"
|
|
2107
|
+
)
|
|
2108
|
+
hints.append(f"Config(exec_mode={mode})")
|
|
2109
|
+
|
|
2110
|
+
hints = [f" {line}" for line in hints]
|
|
2111
|
+
hints.insert(0, "/*=pg_lab=")
|
|
2112
|
+
hints.append(" */")
|
|
2113
|
+
|
|
2114
|
+
return Hint("\n".join(prep_statements), "\n".join(hints))
|
|
2115
|
+
|
|
2116
|
+
|
|
2117
|
+
def _extract_plan_join_order(plan: QueryPlan) -> str:
|
|
2118
|
+
if plan.is_scan():
|
|
2119
|
+
return plan.base_table.identifier()
|
|
2120
|
+
elif plan.input_node:
|
|
2121
|
+
return _extract_plan_join_order(plan.input_node)
|
|
2122
|
+
|
|
2123
|
+
outer = _extract_plan_join_order(plan.outer_child)
|
|
2124
|
+
inner = _extract_plan_join_order(plan.inner_child)
|
|
2125
|
+
return f"({outer} {inner})"
|
|
2126
|
+
|
|
2127
|
+
|
|
2128
|
+
def _iter_plan_bfs(plan: QueryPlan) -> Generator[QueryPlan, None, None]:
|
|
2129
|
+
queue = collections.deque([plan])
|
|
2130
|
+
while queue:
|
|
2131
|
+
node = queue.popleft()
|
|
2132
|
+
queue.extend(node.children)
|
|
2133
|
+
yield node
|
|
2134
|
+
|
|
2135
|
+
|
|
2136
|
+
def _generate_pglab_plan(
|
|
2137
|
+
plan: QueryPlan,
|
|
2138
|
+
) -> Hint:
|
|
2139
|
+
hints: list[str] = ["Config(plan_mode=full)"]
|
|
2140
|
+
join_order = _extract_plan_join_order(plan)
|
|
2141
|
+
hints.append(f"JoinOrder({join_order})")
|
|
2142
|
+
|
|
2143
|
+
used_parallel = False
|
|
2144
|
+
in_upperrel = True
|
|
2145
|
+
par_workers: Optional[int] = None
|
|
2146
|
+
for node in _iter_plan_bfs(plan):
|
|
2147
|
+
if node.is_scan() or node.is_join():
|
|
2148
|
+
in_upperrel = False
|
|
2149
|
+
|
|
2150
|
+
par_workers = (
|
|
2151
|
+
node.parallel_workers if node.parallel_workers > 0 else par_workers
|
|
2152
|
+
)
|
|
2153
|
+
if in_upperrel and par_workers and not used_parallel:
|
|
2154
|
+
hints.append(f"Result(workers={par_workers})")
|
|
2155
|
+
used_parallel = True
|
|
2156
|
+
par_workers = None
|
|
2157
|
+
elif in_upperrel and par_workers and used_parallel:
|
|
2158
|
+
warnings.warn(
|
|
2159
|
+
"Cannot set multiple parallel hints for Postgres. Ignoring additional hints.",
|
|
2160
|
+
category=HintWarning,
|
|
2161
|
+
)
|
|
2162
|
+
|
|
2163
|
+
operator = PGLabOptimizerHints.get(node.operator)
|
|
2164
|
+
intermediate = " ".join(tab.identifier() for tab in node.tables())
|
|
2165
|
+
|
|
2166
|
+
if operator:
|
|
2167
|
+
if par_workers and not used_parallel:
|
|
2168
|
+
metadata = f" (workers={par_workers})"
|
|
2169
|
+
par_workers = None
|
|
2170
|
+
used_parallel = True
|
|
2171
|
+
elif par_workers and used_parallel:
|
|
2172
|
+
metadata = ""
|
|
2173
|
+
warnings.warn(
|
|
2174
|
+
"Cannot set multiple parallel hints for Postgres. Ignoring additional hints.",
|
|
2175
|
+
category=HintWarning,
|
|
2176
|
+
)
|
|
2177
|
+
else:
|
|
2178
|
+
metadata = ""
|
|
2179
|
+
|
|
2180
|
+
hints.append(f"{operator}({intermediate}{metadata})")
|
|
2181
|
+
|
|
2182
|
+
card = node.actual_cardinality or node.estimated_cardinality
|
|
2183
|
+
if operator and card.is_valid():
|
|
2184
|
+
hints.append(f"Card({intermediate} #{card})")
|
|
2185
|
+
|
|
2186
|
+
hints = [f" {line}" for line in hints]
|
|
2187
|
+
hints.insert(0, "/*=pg_lab=")
|
|
2188
|
+
hints.append(" */")
|
|
2189
|
+
return Hint("", "\n".join(hints))
|
|
2190
|
+
|
|
2191
|
+
|
|
2192
|
+
class PostgresHintService(HintService):
|
|
2193
|
+
"""Postgres-specific implementation of the hinting capabilities.
|
|
2194
|
+
|
|
2195
|
+
Most importantly, this service implements a mapping from the abstract optimization decisions (join order + operators) to
|
|
2196
|
+
their counterparts in the hinting backend and integrates Postgres' few deviations from standard SQL syntax (*CAST*
|
|
2197
|
+
expressions and *LIMIT* clauses).
|
|
2198
|
+
|
|
2199
|
+
The hinting service supports two different kinds of backends: pg_lab or pg_hint_plan. The former is the preferred option
|
|
2200
|
+
since it provides cardinality hints for base relations as well as joins and does not require management of the GeQO optimizer.
|
|
2201
|
+
|
|
2202
|
+
Notice that by delegating the adaptation of Postgres' native optimizer to the pg_hint_plan extension, a couple of
|
|
2203
|
+
undesired side-effects have to be accepted:
|
|
2204
|
+
|
|
2205
|
+
1. forcing a join order also involves forcing a specific join direction. Our implementation applies a couple of heuristics
|
|
2206
|
+
to mitigate a bad impact on performance
|
|
2207
|
+
2. the extension only instruments the dynamic programming-based optimizer. If the *geqo_threshold* is reached and the
|
|
2208
|
+
genetic optimizer takes over, no modifications are applied. Therefore, it is best to disable GeQO while working with
|
|
2209
|
+
Postgres. At the same time, this means that certain scenarios like custom cardinality estimation for the genetic
|
|
2210
|
+
optimizer cannot currently be tested
|
|
2211
|
+
|
|
2212
|
+
Parameters
|
|
2213
|
+
----------
|
|
2214
|
+
postgres_db : PostgresInterface
|
|
2215
|
+
A postgres database with an active hinting backend (pg_hint_plan or pg_lab)
|
|
2216
|
+
|
|
2217
|
+
Raises
|
|
2218
|
+
------
|
|
2219
|
+
ValueError
|
|
2220
|
+
If the supplied `postgres_db` does not have a supported hinting backend enabled.
|
|
2221
|
+
|
|
2222
|
+
See Also
|
|
2223
|
+
--------
|
|
2224
|
+
_generate_pg_join_order_hint
|
|
2225
|
+
|
|
2226
|
+
References
|
|
2227
|
+
----------
|
|
2228
|
+
|
|
2229
|
+
.. pg_hint_plan extension: https://github.com/ossc-db/pg_hint_plan
|
|
2230
|
+
.. Postgres query planning configuration: https://www.postgresql.org/docs/current/runtime-config-query.html
|
|
2231
|
+
"""
|
|
2232
|
+
|
|
2233
|
+
def __init__(self, postgres_db: PostgresInterface) -> None:
|
|
2234
|
+
self._postgres_db = postgres_db
|
|
2235
|
+
self._inactive = True
|
|
2236
|
+
self._backend = "none"
|
|
2237
|
+
self._infer_pg_backend()
|
|
2238
|
+
|
|
2239
|
+
def _get_backend(self) -> PostgresHintingBackend:
|
|
2240
|
+
return self._backend
|
|
2241
|
+
|
|
2242
|
+
def _set_backend(self, backend_name: PostgresHintingBackend) -> None:
|
|
2243
|
+
self._inactive = backend_name == "none"
|
|
2244
|
+
self._backend = backend_name
|
|
2245
|
+
|
|
2246
|
+
backend = property(_get_backend, _set_backend, doc="The hinting backend in use.")
|
|
2247
|
+
|
|
2248
|
+
def generate_hints(
|
|
2249
|
+
self,
|
|
2250
|
+
query: SqlQuery,
|
|
2251
|
+
plan: Optional[QueryPlan] = None,
|
|
2252
|
+
*,
|
|
2253
|
+
join_order: Optional[JoinTree] = None,
|
|
2254
|
+
physical_operators: Optional[PhysicalOperatorAssignment] = None,
|
|
2255
|
+
plan_parameters: Optional[PlanParameterization] = None,
|
|
2256
|
+
) -> SqlQuery:
|
|
2257
|
+
self._assert_active_backend()
|
|
2258
|
+
|
|
2259
|
+
adapted_query = query
|
|
2260
|
+
if adapted_query.explain and not isinstance(
|
|
2261
|
+
adapted_query.explain, PostgresExplainClause
|
|
2262
|
+
):
|
|
2263
|
+
adapted_query = transform.replace_clause(
|
|
2264
|
+
adapted_query, PostgresExplainClause(adapted_query.explain)
|
|
2265
|
+
)
|
|
2266
|
+
if adapted_query.limit_clause and not isinstance(
|
|
2267
|
+
adapted_query.limit_clause, PostgresLimitClause
|
|
2268
|
+
):
|
|
2269
|
+
adapted_query = transform.replace_clause(
|
|
2270
|
+
adapted_query, PostgresLimitClause(adapted_query.limit_clause)
|
|
2271
|
+
)
|
|
2272
|
+
|
|
2273
|
+
has_param = any(
|
|
2274
|
+
param is not None
|
|
2275
|
+
for param in (join_order, physical_operators, plan_parameters)
|
|
2276
|
+
)
|
|
2277
|
+
if plan is not None and has_param:
|
|
2278
|
+
raise ValueError(
|
|
2279
|
+
"Can only hint an entire query plan, or individual parts, not both."
|
|
2280
|
+
)
|
|
2281
|
+
|
|
2282
|
+
match self._backend:
|
|
2283
|
+
case "pg_hint_plan":
|
|
2284
|
+
if plan is not None:
|
|
2285
|
+
join_order = jointree_from_plan(plan)
|
|
2286
|
+
physical_operators = operators_from_plan(
|
|
2287
|
+
plan, include_workers=False
|
|
2288
|
+
)
|
|
2289
|
+
plan_parameters = parameters_from_plan(
|
|
2290
|
+
plan, target_cardinality="actual", fallback_estimated=True
|
|
2291
|
+
)
|
|
2292
|
+
|
|
2293
|
+
hints = _generate_pghintplan_hints(
|
|
2294
|
+
query,
|
|
2295
|
+
join_order,
|
|
2296
|
+
physical_operators,
|
|
2297
|
+
plan_parameters,
|
|
2298
|
+
pg_instance=self._postgres_db,
|
|
2299
|
+
)
|
|
2300
|
+
case "pg_lab" if plan is not None:
|
|
2301
|
+
hints = _generate_pglab_plan(plan)
|
|
2302
|
+
case "pg_lab":
|
|
2303
|
+
hints = _generate_pglab_hints(
|
|
2304
|
+
join_order,
|
|
2305
|
+
physical_operators,
|
|
2306
|
+
plan_parameters,
|
|
2307
|
+
)
|
|
2308
|
+
|
|
2309
|
+
query = transform.add_clause(adapted_query, hints)
|
|
2310
|
+
return query
|
|
2311
|
+
|
|
2312
|
+
def format_query(self, query: SqlQuery) -> str:
|
|
2313
|
+
if query.explain:
|
|
2314
|
+
query = transform.replace_clause(
|
|
2315
|
+
query, PostgresExplainClause(query.explain)
|
|
2316
|
+
)
|
|
2317
|
+
return formatter.format_quick(query, flavor="postgres")
|
|
2318
|
+
|
|
2319
|
+
def supports_hint(self, hint: PhysicalOperator | HintType) -> bool:
|
|
2320
|
+
self._assert_active_backend()
|
|
2321
|
+
return hint in PostgresJoinHints | PostgresScanHints | PostgresPlanHints
|
|
2322
|
+
|
|
2323
|
+
def describe(self) -> dict[str, str]:
|
|
2324
|
+
"""Provides a JSON-serializable description of the hint service.
|
|
2325
|
+
|
|
2326
|
+
Returns
|
|
2327
|
+
-------
|
|
2328
|
+
dict[str, str]
|
|
2329
|
+
Information about the hinting backend
|
|
2330
|
+
"""
|
|
2331
|
+
return {"backend": self._backend}
|
|
2332
|
+
|
|
2333
|
+
def _infer_pg_backend(self) -> None:
|
|
2334
|
+
"""Determines the hinting backend that is provided by the current Postgres instance."""
|
|
2335
|
+
|
|
2336
|
+
# We first try the easy route: checking whether any of the settings related to the hinting backends are available and
|
|
2337
|
+
# activated. If this is the case, we are already done.
|
|
2338
|
+
# Otherwise, we need to become more creative and rely on more advanced heuristics.
|
|
2339
|
+
# Note that on recent installations of Postgres/pg_hint_plan or pg_lab, we can expect that the easy route does indeed
|
|
2340
|
+
# work. It is just on older versions that the settings were not available.
|
|
2341
|
+
|
|
2342
|
+
cur = self._postgres_db.cursor()
|
|
2343
|
+
try:
|
|
2344
|
+
cur.execute("SHOW pg_hint_plan.enable_hint;")
|
|
2345
|
+
res = cur.fetchone()
|
|
2346
|
+
if res and res[0] == "on":
|
|
2347
|
+
util.logging.print_if(
|
|
2348
|
+
self._postgres_db.debug,
|
|
2349
|
+
"Using pg_hint_plan hinting backend",
|
|
2350
|
+
file=sys.stderr,
|
|
2351
|
+
)
|
|
2352
|
+
self._inactive = False
|
|
2353
|
+
self._backend = "pg_hint_plan"
|
|
2354
|
+
return
|
|
2355
|
+
except psycopg.errors.UndefinedObject:
|
|
2356
|
+
pass
|
|
2357
|
+
|
|
2358
|
+
try:
|
|
2359
|
+
cur.execute("SHOW enable_pglab;")
|
|
2360
|
+
res = cur.fetchone()
|
|
2361
|
+
if res and res[0] == "on":
|
|
2362
|
+
util.logging.print_if(
|
|
2363
|
+
self._postgres_db.debug,
|
|
2364
|
+
"Using pg_lab hinting backend",
|
|
2365
|
+
file=sys.stderr,
|
|
2366
|
+
)
|
|
2367
|
+
self._inactive = False
|
|
2368
|
+
self._backend = "pg_lab"
|
|
2369
|
+
return
|
|
2370
|
+
except psycopg.errors.UndefinedObject:
|
|
2371
|
+
pass
|
|
2372
|
+
|
|
2373
|
+
# At this point the easy route failed and we need to rely on more advanced heuristics.
|
|
2374
|
+
# Specifically, we try to check whether a shared library related to one of the backends is currently loaded
|
|
2375
|
+
# in the backend process. See the later comment for the reasoning.
|
|
2376
|
+
#
|
|
2377
|
+
# All code below should be considered legacy and we might in fact remove it entirely in future versions of PostBOUND.
|
|
2378
|
+
|
|
2379
|
+
if os.name != "posix":
|
|
2380
|
+
warnings.warn(
|
|
2381
|
+
"It seems you are running PostBOUND on a non-POSIX system. "
|
|
2382
|
+
"Please beware that PostBOUND is currently not intended to run on different systems and "
|
|
2383
|
+
"there might be (many) dragons. "
|
|
2384
|
+
"Proceed at your own risk. "
|
|
2385
|
+
"We assume that the Postgres server has pg_hint_plan enabled. "
|
|
2386
|
+
"Please set the backend property to pg_lab manually if you are using pg_lab."
|
|
2387
|
+
)
|
|
2388
|
+
self._backend = "pg_hint_plan"
|
|
2389
|
+
self._inactive = False
|
|
2390
|
+
return
|
|
2391
|
+
|
|
2392
|
+
connection = self._postgres_db.connection()
|
|
2393
|
+
backend_pid = connection.info.backend_pid
|
|
2394
|
+
hostname = connection.info.host
|
|
2395
|
+
|
|
2396
|
+
# Postgres does not provide a direct method to determine which extensions are currently active if they have only
|
|
2397
|
+
# been loaded as a shared library (as is the case for both pg_hint_plan and pg_lab). Therefore, we have to rely on
|
|
2398
|
+
# the assumption that the Postgres server is running on the same (virtual) machine as our PostBOUND process and can
|
|
2399
|
+
# rely on the operating system to determine open files of the backend process (which will include the shared libraries)
|
|
2400
|
+
|
|
2401
|
+
if sys.platform == "darwin":
|
|
2402
|
+
pg_candidates = subprocess.run(
|
|
2403
|
+
["lsof -p " + str(backend_pid) + " | awk '/postgres/{print $1}'"],
|
|
2404
|
+
capture_output=True,
|
|
2405
|
+
shell=True,
|
|
2406
|
+
text=True,
|
|
2407
|
+
)
|
|
2408
|
+
else:
|
|
2409
|
+
pg_candidates = subprocess.run(
|
|
2410
|
+
["ps -aux | awk '/" + str(backend_pid) + "/{print $11}'"],
|
|
2411
|
+
capture_output=True,
|
|
2412
|
+
shell=True,
|
|
2413
|
+
text=True,
|
|
2414
|
+
)
|
|
2415
|
+
found_pg = any(
|
|
2416
|
+
candidate.lower().startswith("postgres")
|
|
2417
|
+
for candidate in pg_candidates.stdout.split()
|
|
2418
|
+
)
|
|
2419
|
+
|
|
2420
|
+
# There are some rare edge cases where our heuristics fail. We have to accept them for now, but should improve the
|
|
2421
|
+
# backend detection in the future. Most importantly, the heuristic will pass if we are connected to a remote server
|
|
2422
|
+
# on localhost (e.g. via SSH tunneling or WSL instances) and there is a different Postgres server running on the same
|
|
2423
|
+
# machine as the PostBOUND process. In this case, our heuristics assume that these are the same servers.
|
|
2424
|
+
# In the future, we might want to check the ports as well, but this probably requires superuser privileges
|
|
2425
|
+
# (for netstat).
|
|
2426
|
+
|
|
2427
|
+
if hostname not in ["localhost", "127.0.0.1", "::1"] or not found_pg:
|
|
2428
|
+
warnings.warn(
|
|
2429
|
+
"It seems you are connecting to a remote Postgres instance. "
|
|
2430
|
+
"PostBOUND cannot infer the hinting backend for such connections. "
|
|
2431
|
+
"We assume that the this server has pg_hint_plan enabled. "
|
|
2432
|
+
"Please set the backend property to pg_lab manually if you are using pg_lab."
|
|
2433
|
+
)
|
|
2434
|
+
self._backend = "pg_hint_plan"
|
|
2435
|
+
self._inactive = False
|
|
2436
|
+
return
|
|
2437
|
+
|
|
2438
|
+
lib_ext = "dylib" if sys.platform == "darwin" else "so"
|
|
2439
|
+
active_extensions = util.system.open_files(backend_pid)
|
|
2440
|
+
if any(ext.endswith(f"pg_lab.{lib_ext}") for ext in active_extensions):
|
|
2441
|
+
util.logging.print_if(
|
|
2442
|
+
self._postgres_db.debug, "Using pg_lab hinting backend", file=sys.stderr
|
|
2443
|
+
)
|
|
2444
|
+
self._inactive = False
|
|
2445
|
+
self._backend = "pg_lab"
|
|
2446
|
+
elif any(ext.endswith(f"pg_hint_plan.{lib_ext}") for ext in active_extensions):
|
|
2447
|
+
util.logging.print_if(
|
|
2448
|
+
self._postgres_db.debug,
|
|
2449
|
+
"Using pg_hint_plan hinting backend",
|
|
2450
|
+
file=sys.stderr,
|
|
2451
|
+
)
|
|
2452
|
+
self._inactive = False
|
|
2453
|
+
self._backend = "pg_hint_plan"
|
|
2454
|
+
else:
|
|
2455
|
+
warnings.warn(
|
|
2456
|
+
"No supported hinting backend found. "
|
|
2457
|
+
"Please ensure that either pg_hint_plan or pg_lab is available in your Postgres instance."
|
|
2458
|
+
)
|
|
2459
|
+
self._inactive = True
|
|
2460
|
+
self._backend = "none"
|
|
2461
|
+
|
|
2462
|
+
def _assert_active_backend(self) -> None:
|
|
2463
|
+
"""Ensures that a proper hinting backend is available.
|
|
2464
|
+
|
|
2465
|
+
Raises
|
|
2466
|
+
------
|
|
2467
|
+
ValueError
|
|
2468
|
+
If no backend is available.
|
|
2469
|
+
"""
|
|
2470
|
+
if self._inactive:
|
|
2471
|
+
connection_pid = self._postgres_db._connection.info.backend_pid
|
|
2472
|
+
raise ValueError(
|
|
2473
|
+
f"No supported hinting backend found for backend with PID {connection_pid}"
|
|
2474
|
+
)
|
|
2475
|
+
|
|
2476
|
+
def __repr__(self) -> str:
|
|
2477
|
+
return f"PostgresHintService(db={self._postgres_db} backend={self._backend})"
|
|
2478
|
+
|
|
2479
|
+
def __str__(self) -> str:
|
|
2480
|
+
return repr(self)
|
|
2481
|
+
|
|
2482
|
+
|
|
2483
|
+
class PostgresOptimizer(OptimizerInterface):
|
|
2484
|
+
"""Optimizer introspection for Postgres.
|
|
2485
|
+
|
|
2486
|
+
Parameters
|
|
2487
|
+
----------
|
|
2488
|
+
postgres_instance : PostgresInterface
|
|
2489
|
+
The database whose optimizer should be introspected
|
|
2490
|
+
"""
|
|
2491
|
+
|
|
2492
|
+
def __init__(self, postgres_instance: PostgresInterface) -> None:
|
|
2493
|
+
self._pg_instance = postgres_instance
|
|
2494
|
+
|
|
2495
|
+
def query_plan(self, query: SqlQuery | str) -> QueryPlan:
|
|
2496
|
+
if isinstance(query, SqlQuery):
|
|
2497
|
+
query = transform.as_explain(query)
|
|
2498
|
+
query = self._pg_instance._hinting_backend.format_query(query)
|
|
2499
|
+
else:
|
|
2500
|
+
query = self._explainify(query)
|
|
2501
|
+
raw_query_plan: list = self._pg_instance.execute_query(
|
|
2502
|
+
query, cache_enabled=False
|
|
2503
|
+
)
|
|
2504
|
+
query_plan = PostgresExplainPlan(raw_query_plan[0])
|
|
2505
|
+
return query_plan.as_qep()
|
|
2506
|
+
|
|
2507
|
+
def analyze_plan(
|
|
2508
|
+
self, query: SqlQuery, *, timeout: Optional[float] = None
|
|
2509
|
+
) -> Optional[QueryPlan]:
|
|
2510
|
+
query = transform.as_explain_analyze(query)
|
|
2511
|
+
|
|
2512
|
+
try:
|
|
2513
|
+
raw_query_plan: dict = self._pg_instance.execute_query(
|
|
2514
|
+
query, cache_enabled=False, raw=True, timeout=timeout
|
|
2515
|
+
)[0]
|
|
2516
|
+
except TimeoutError:
|
|
2517
|
+
return None
|
|
2518
|
+
|
|
2519
|
+
query_plan = PostgresExplainPlan(raw_query_plan)
|
|
2520
|
+
return query_plan.as_qep()
|
|
2521
|
+
|
|
2522
|
+
def cardinality_estimate(self, query: SqlQuery | str) -> Cardinality:
|
|
2523
|
+
if isinstance(query, SqlQuery):
|
|
2524
|
+
query = transform.as_explain(query)
|
|
2525
|
+
query = self._pg_instance._hinting_backend.format_query(query)
|
|
2526
|
+
else:
|
|
2527
|
+
query = self._explainify(query)
|
|
2528
|
+
query_plan = self._pg_instance.execute_query(query, cache_enabled=False)
|
|
2529
|
+
estimate: int = query_plan[0]["Plan"]["Plan Rows"]
|
|
2530
|
+
return Cardinality(estimate)
|
|
2531
|
+
|
|
2532
|
+
def cost_estimate(self, query: SqlQuery | str) -> float:
|
|
2533
|
+
if isinstance(query, SqlQuery):
|
|
2534
|
+
query = transform.as_explain(query)
|
|
2535
|
+
query = self._pg_instance._hinting_backend.format_query(query)
|
|
2536
|
+
else:
|
|
2537
|
+
query = self._explainify(query)
|
|
2538
|
+
query_plan = self._pg_instance.execute_query(query, cache_enabled=False)
|
|
2539
|
+
estimate: float = query_plan[0]["Plan"]["Total Cost"]
|
|
2540
|
+
return estimate
|
|
2541
|
+
|
|
2542
|
+
def configure_operator(self, operator: PhysicalOperator, *, enabled: bool) -> None:
|
|
2543
|
+
"""Enables or disables a specific physical operator for the current Postgres connection.
|
|
2544
|
+
|
|
2545
|
+
Parameters
|
|
2546
|
+
----------
|
|
2547
|
+
operator : PhysicalOperator
|
|
2548
|
+
The operator to configure.
|
|
2549
|
+
enabled : bool
|
|
2550
|
+
Whether the operator should be allowed or not.
|
|
2551
|
+
|
|
2552
|
+
References
|
|
2553
|
+
----------
|
|
2554
|
+
https://www.postgresql.org/docs/current/runtime-config-query.html
|
|
2555
|
+
"""
|
|
2556
|
+
setting_name = PostgresOptimizerSettings.get(operator)
|
|
2557
|
+
if not setting_name:
|
|
2558
|
+
raise ValueError(
|
|
2559
|
+
f"Cannot configure operator {operator} as it is not supported by Postgres"
|
|
2560
|
+
)
|
|
2561
|
+
status = "on" if enabled else "off"
|
|
2562
|
+
self._pg_instance.cursor.execute(f"SET {setting_name} TO {status}")
|
|
2563
|
+
|
|
2564
|
+
def _explainify(self, query: str) -> str:
|
|
2565
|
+
if not query.upper().startswith("EXPLAIN (FORMAT JSON)"):
|
|
2566
|
+
query = f"EXPLAIN (FORMAT JSON) {query}"
|
|
2567
|
+
return query
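# Usage sketch for the optimizer introspection above (a minimal, hedged example, not part of the
# public API): it assumes a reachable Postgres instance configured via a .psycopg_connection file
# and an example table "title". Both are placeholders for illustration only.
def _demo_optimizer_introspection() -> None:
    pg = connect()  # connect() is defined further below in this module
    introspection = PostgresOptimizer(pg)

    example_query = "SELECT * FROM title"
    plan = introspection.query_plan(example_query)  # plain EXPLAIN plan as a QueryPlan
    rows = introspection.cardinality_estimate(example_query)  # planner's row estimate for the result
    cost = introspection.cost_estimate(example_query)  # planner's total cost of the chosen plan
    print(plan, rows, cost)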
|
|
2568
|
+
|
|
2569
|
+
|
|
2570
|
+
def _reconnect(name: str, *, pool: DatabasePool) -> PostgresInterface:
|
|
2571
|
+
"""Fetches a connection from the database pool.
|
|
2572
|
+
|
|
2573
|
+
If the connection is in a bad state (e.g. because the user called close() before), it is re-established.
|
|
2574
|
+
|
|
2575
|
+
Parameters
|
|
2576
|
+
----------
|
|
2577
|
+
name : str
|
|
2578
|
+
The name of the database connection in the pool.
|
|
2579
|
+
pool : DatabasePool
|
|
2580
|
+
The current pool.
|
|
2581
|
+
"""
|
|
2582
|
+
current_instance: PostgresInterface = pool.retrieve_database(name)
|
|
2583
|
+
|
|
2584
|
+
status = current_instance._connection.info.status
|
|
2585
|
+
if status != psycopg.pq.ConnStatus.OK:
|
|
2586
|
+
# Actually there are a lot of other ConnStatus values beyond OK and Bad
|
|
2587
|
+
# We could handle them explicitly here, or we might just define anything that is not OK as Bad.
|
|
2588
|
+
# The latter seems much simpler so let's just do this for now.
|
|
2589
|
+
current_instance.reset_connection()
|
|
2590
|
+
|
|
2591
|
+
return current_instance
|
|
2592
|
+
|
|
2593
|
+
|
|
2594
|
+
def connect(
|
|
2595
|
+
*,
|
|
2596
|
+
name: str = "postgres",
|
|
2597
|
+
application_name: str = "",
|
|
2598
|
+
connect_string: str | None = None,
|
|
2599
|
+
config_file: str | Path | None = ".psycopg_connection",
|
|
2600
|
+
encoding: str = "UTF8",
|
|
2601
|
+
cache_enabled: bool = False,
|
|
2602
|
+
refresh: bool = False,
|
|
2603
|
+
private: bool = False,
|
|
2604
|
+
debug: bool = False,
|
|
2605
|
+
) -> PostgresInterface:
|
|
2606
|
+
"""Convenience function to seamlessly connect to a Postgres instance.
|
|
2607
|
+
|
|
2608
|
+
This function obtains a connect-string to the database according to the following rules:
|
|
2609
|
+
|
|
2610
|
+
1. if the connect-string is supplied directly via the `connect_string` parameter, this is used
|
|
2611
|
+
2. if the connect-string is not supplied, it is read from the file indicated by `config_file`. This file has to be located
|
|
2612
|
+
in the current working directory, or the file name has to describe the path to that file.
|
|
2613
|
+
3. if the `config_file` does not exist, an error is raised
|
|
2614
|
+
|
|
2615
|
+
After a connection to the Postgres instance has been obtained, it is registered automatically on the current
|
|
2616
|
+
`DatabasePool` instance. This can be changed via the `private` parameter.
|
|
2617
|
+
|
|
2618
|
+
Parameters
|
|
2619
|
+
----------
|
|
2620
|
+
name : str, optional
|
|
2621
|
+
A name to identify the current connection if multiple connections to different Postgres instances should be maintained.
|
|
2622
|
+
This is used to register the instance on the `DatabasePool`. Defaults to *postgres*.
|
|
2623
|
+
application_name : str, optional
|
|
2624
|
+
Identifier for the Postgres server. This will be the name that is shown in the server logs and process lists.
|
|
2625
|
+
connect_string : str | None, optional
|
|
2626
|
+
A Psycopg-compatible connect string for the database. Supplying this parameter overwrites any other connection
|
|
2627
|
+
data
|
|
2628
|
+
config_file : str | Path | None, optional
|
|
2629
|
+
A file containing a Psycopg-compatible connect string for the database. This is the default and preferred method of
|
|
2630
|
+
connecting to a Postgres database. Defaults to *.psycopg_connection*
|
|
2631
|
+
encoding : str, optional
|
|
2632
|
+
The client encoding of the connection. Defaults to *UTF8*.
|
|
2633
|
+
cache_enabled : bool, optional
|
|
2634
|
+
Controls the default caching behaviour of the Postgres instance. Caching of general queries is disabled by default,
|
|
2635
|
+
whereas queries from the statistics interface are cached by default.
|
|
2636
|
+
refresh : bool, optional
|
|
2637
|
+
If true, a new connection to the database will always be established, even if a connection to the same database is
|
|
2638
|
+
already pooled. The registration key will be suffixed to prevent collisions. By default, the current connection is
|
|
2639
|
+
re-used. If that is the case, no further information (e.g. config strings) is read and only the `name` is accessed.
|
|
2640
|
+
private : bool, optional
|
|
2641
|
+
If true, skips registration of the new instance on the `DatabasePool`. Registration is performed by default.
|
|
2642
|
+
|
|
2643
|
+
Returns
|
|
2644
|
+
-------
|
|
2645
|
+
PostgresInterface
|
|
2646
|
+
The Postgres database object
|
|
2647
|
+
|
|
2648
|
+
Raises
|
|
2649
|
+
------
|
|
2650
|
+
ValueError
|
|
2651
|
+
If neither a config file nor a connect string was given, or if the config file should be used but does not exist
|
|
2652
|
+
|
|
2653
|
+
References
|
|
2654
|
+
----------
|
|
2655
|
+
|
|
2656
|
+
.. Psycopg v3: https://www.psycopg.org/psycopg3/ This is used internally by the Postgres interface to interact with the
|
|
2657
|
+
database
|
|
2658
|
+
"""
|
|
2659
|
+
db_pool = DatabasePool.get_instance()
|
|
2660
|
+
if name in db_pool and not refresh:
|
|
2661
|
+
return _reconnect(name, pool=db_pool)
|
|
2662
|
+
|
|
2663
|
+
if config_file and not connect_string:
|
|
2664
|
+
config_file = Path(config_file)
|
|
2665
|
+
if not config_file.is_file():
|
|
2666
|
+
wdir = os.getcwd()
|
|
2667
|
+
raise ValueError(
|
|
2668
|
+
f"Failed to obtain a database connection. Tried to read the config file '{config_file}' from "
|
|
2669
|
+
f"your current working directory, but the file was not found. Your working directory is {wdir}. "
|
|
2670
|
+
"Please either supply the connect string directly to the connect() method, or ensure that the "
|
|
2671
|
+
"config file exists."
|
|
2672
|
+
)
|
|
2673
|
+
with open(config_file, "r") as f:
|
|
2674
|
+
connect_string = f.readline().strip()
|
|
2675
|
+
elif not connect_string:
|
|
2676
|
+
raise ValueError(
|
|
2677
|
+
"Failed to obtain a database connection. Please either supply the connect string directly to the "
|
|
2678
|
+
"connect() method, or put a configuration file in your working directory. See the documentation of "
|
|
2679
|
+
"the connect() method for more details."
|
|
2680
|
+
)
|
|
2681
|
+
|
|
2682
|
+
postgres_db = PostgresInterface(
|
|
2683
|
+
connect_string,
|
|
2684
|
+
system_name=name,
|
|
2685
|
+
client_encoding=encoding,
|
|
2686
|
+
cache_enabled=cache_enabled,
|
|
2687
|
+
debug=debug,
|
|
2688
|
+
)
|
|
2689
|
+
if not private:
|
|
2690
|
+
orig_name = name
|
|
2691
|
+
instance_idx = 2
|
|
2692
|
+
while name in db_pool:
|
|
2693
|
+
name = f"{orig_name} - {instance_idx}"
|
|
2694
|
+
instance_idx += 1
|
|
2695
|
+
db_pool.register_database(name, postgres_db)
|
|
2696
|
+
return postgres_db
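# Usage sketch for connect() (a hedged example, not part of the public API): the connection names,
# the config file and the literal connect string below are placeholders for illustration only.
def _demo_connect() -> None:
    # Preferred way: read the Psycopg connect string from a config file (default: .psycopg_connection)
    pg = connect(name="imdb", config_file=".psycopg_connection")

    # Alternative: supply the connect string directly; this overrides any config file.
    # private=True skips registration on the DatabasePool.
    pg_private = connect(name="imdb-private", connect_string="dbname=imdb user=postgres", private=True)

    print(pg, pg_private)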
|
|
2697
|
+
|
|
2698
|
+
|
|
2699
|
+
def start(pgdata: str | Path = "", *, logfile: str | Path = "") -> None:
|
|
2700
|
+
"""Starts a local Postgres server.
|
|
2701
|
+
|
|
2702
|
+
This function assumes that *pg_ctl* is available on the system PATH and either the server's data directory is specified
|
|
2703
|
+
explicitly, or set via the *PGDATA* environment variable.
|
|
2704
|
+
"""
|
|
2705
|
+
if os.system("which pg_ctl") != 0:
|
|
2706
|
+
raise ValueError("Cannot start Postgres server: pg_ctl is not on PATH")
|
|
2707
|
+
|
|
2708
|
+
pgdata = pgdata or os.environ.get("PGDATA", "")
|
|
2709
|
+
if not pgdata:
|
|
2710
|
+
raise ValueError(
|
|
2711
|
+
"Cannot start Postgres server: Must either supply pgdata argument or set PGDATA environment variable"
|
|
2712
|
+
)
|
|
2713
|
+
pgdata = Path(pgdata).expanduser()
|
|
2714
|
+
|
|
2715
|
+
args = ["pg_ctl", "-D", pgdata]
|
|
2716
|
+
if logfile:
|
|
2717
|
+
args.extend(["-l", logfile])
|
|
2718
|
+
args.append("start")
|
|
2719
|
+
|
|
2720
|
+
subprocess.run(args, check=True)
|
|
2721
|
+
|
|
2722
|
+
|
|
2723
|
+
def stop(pgdata: str | Path = "", *, raise_on_error: bool = False) -> None:
|
|
2724
|
+
"""Stops a running (local) Postgres server.
|
|
2725
|
+
|
|
2726
|
+
This function assumes that *pg_ctl* is available on the system PATH and either the server's data directory is specified
|
|
2727
|
+
explicitly, or set via the *PGDATA* environment variable.
|
|
2728
|
+
|
|
2729
|
+
If the server cannot be stopped for whatever reason, an error can be raised by setting the corresponding parameter.
|
|
2730
|
+
Otherwise, it is silently ignored.
|
|
2731
|
+
"""
|
|
2732
|
+
if os.system("which pg_ctl") != 0:
|
|
2733
|
+
raise ValueError("Cannot stop Postgres server: pg_ctl is not on PATH")
|
|
2734
|
+
|
|
2735
|
+
pgdata = pgdata or os.environ.get("PGDATA", "")
|
|
2736
|
+
if not pgdata:
|
|
2737
|
+
raise ValueError(
|
|
2738
|
+
"Cannot stop Postgres server: Must either supply pgdata argument or set PGDATA environment variable"
|
|
2739
|
+
)
|
|
2740
|
+
pgdata = Path(pgdata).expanduser()
|
|
2741
|
+
|
|
2742
|
+
subprocess.run(["pg_ctl", "-D", pgdata, "stop"], check=raise_on_error)
|
|
2743
|
+
|
|
2744
|
+
|
|
2745
|
+
def is_running(pgdata: str | Path = "") -> bool:
|
|
2746
|
+
"""Checks, whether a local Postgres server is currently running.
|
|
2747
|
+
|
|
2748
|
+
This function assumes that *pg_ctl* is available on the system PATH. A data directory can be supplied to check whether
|
|
2749
|
+
a server is running for the specific database. If *pgdata* is not supplied, the *PGDATA* environment variable is used as
|
|
2750
|
+
a fallback.
|
|
2751
|
+
"""
|
|
2752
|
+
if os.system("which pg_ctl") != 0:
|
|
2753
|
+
raise ValueError("Cannot start Postgres server: pg_ctl is not on PATH")
|
|
2754
|
+
|
|
2755
|
+
cmd = ["pg_ctl"]
|
|
2756
|
+
pgdata = pgdata or os.environ.get("PGDATA", "")
|
|
2757
|
+
if pgdata:
|
|
2758
|
+
cmd.extend(["-D", pgdata])
|
|
2759
|
+
cmd.append("status")
|
|
2760
|
+
|
|
2761
|
+
res = subprocess.run(cmd)
|
|
2762
|
+
return res.returncode == 0
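# Usage sketch that ties start(), is_running() and stop() together (a hedged example, not part of
# the public API): it assumes pg_ctl on the PATH and uses a placeholder data directory ~/pgdata.
def _demo_server_lifecycle() -> None:
    data_dir = "~/pgdata"  # placeholder; could also rely on the PGDATA environment variable
    if not is_running(data_dir):
        start(data_dir, logfile="postgres-server.log")
    try:
        pg = connect()
        print(pg)
    finally:
        stop(data_dir)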
|
|
2763
|
+
|
|
2764
|
+
|
|
2765
|
+
def _parallel_query_initializer(
|
|
2766
|
+
connect_string: str, local_data: threading.local, verbose: bool = False
|
|
2767
|
+
) -> None:
|
|
2768
|
+
"""Internal function for the `ParallelQueryExecutor` to setup worker connections.
|
|
2769
|
+
|
|
2770
|
+
Parameters
|
|
2771
|
+
----------
|
|
2772
|
+
connect_string : str
|
|
2773
|
+
Connection info to establish a network connection to the Postgres instance. Delegates to Psycopg
|
|
2774
|
+
local_data : threading.local
|
|
2775
|
+
Data object to store the opened connection
|
|
2776
|
+
verbose : bool, optional
|
|
2777
|
+
Whether to print logging information, by default *False*
|
|
2778
|
+
|
|
2779
|
+
References
|
|
2780
|
+
----------
|
|
2781
|
+
|
|
2782
|
+
.. Psycopg v3: https://www.psycopg.org/psycopg3/ This is used internally by the Postgres interface to interact with the
|
|
2783
|
+
database
|
|
2784
|
+
"""
|
|
2785
|
+
log = util.make_logger(verbose)
|
|
2786
|
+
tid = threading.get_ident()
|
|
2787
|
+
connection = psycopg.connect(
|
|
2788
|
+
connect_string, application_name=f"PostBOUND parallel worker ID {tid}"
|
|
2789
|
+
)
|
|
2790
|
+
connection.autocommit = True
|
|
2791
|
+
local_data.connection = connection
|
|
2792
|
+
log(f"[worker id={tid}, ts={util.timestamp()}] Connected")
|
|
2793
|
+
|
|
2794
|
+
|
|
2795
|
+
def _parallel_query_worker(
|
|
2796
|
+
query: str | SqlQuery,
|
|
2797
|
+
local_data: threading.local,
|
|
2798
|
+
timeout: Optional[int] = None,
|
|
2799
|
+
verbose: bool = False,
|
|
2800
|
+
) -> tuple[SqlQuery | str, Any]:
|
|
2801
|
+
"""Internal function for the `ParallelQueryExecutor` to run individual queries.
|
|
2802
|
+
|
|
2803
|
+
Parameters
|
|
2804
|
+
----------
|
|
2805
|
+
query : str | SqlQuery
|
|
2806
|
+
The query to execute. The parallel executor does not make use of caching whatsoever, so no additional parameters are
|
|
2807
|
+
required.
|
|
2808
|
+
local_data : threading.local
|
|
2809
|
+
Data object that contains the database connection to use. This should have been initialized by
|
|
2810
|
+
`_parallel_query_initializer`
|
|
2811
|
+
timeout : Optional[int], optional
|
|
2812
|
+
The number of seconds to wait until the calculation is aborted. Defaults to *None*, which indicates no timeout. In
|
|
2813
|
+
case of timeout, *None* is returned.
|
|
2814
|
+
verbose : bool, optional
|
|
2815
|
+
Whether to print logging information, by default *False*
|
|
2816
|
+
|
|
2817
|
+
Returns
|
|
2818
|
+
-------
|
|
2819
|
+
tuple[SqlQuery | str, Any]
|
|
2820
|
+
A tuple of the original query and the (simplified) result set. See `Database.execute_query` for an outline of the
|
|
2821
|
+
simplification process. This method applies the same rules. The query is also provided to distinguish the different
|
|
2822
|
+
result sets that arrive in parallel.
|
|
2823
|
+
"""
|
|
2824
|
+
log = util.make_logger(verbose)
|
|
2825
|
+
connection: psycopg.connection.Connection = local_data.connection
|
|
2826
|
+
connection.rollback()
|
|
2827
|
+
cursor = connection.cursor()
|
|
2828
|
+
if timeout:
|
|
2829
|
+
cursor.execute(f"SET statement_timeout = '{timeout}s';")
|
|
2830
|
+
|
|
2831
|
+
log(
|
|
2832
|
+
f"[worker id={threading.get_ident()}, ts={util.timestamp()}] Now executing query {query}"
|
|
2833
|
+
)
|
|
2834
|
+
try:
|
|
2835
|
+
cursor.execute(str(query))
|
|
2836
|
+
log(
|
|
2837
|
+
f"[worker id={threading.get_ident()}, ts={util.timestamp()}] Executed query {query}"
|
|
2838
|
+
)
|
|
2839
|
+
except psycopg.errors.QueryCanceled as e:
|
|
2840
|
+
if "canceling statement due to statement timeout" in e.args:
|
|
2841
|
+
log(
|
|
2842
|
+
f"[worker id={threading.get_ident()}, ts={util.timestamp()}] Query {query} timed out"
|
|
2843
|
+
)
|
|
2844
|
+
return query, None
|
|
2845
|
+
else:
|
|
2846
|
+
raise e
|
|
2847
|
+
|
|
2848
|
+
result_set = cursor.fetchall()
|
|
2849
|
+
cursor.close()
|
|
2850
|
+
|
|
2851
|
+
return query, result_set
|
|
2852
|
+
|
|
2853
|
+
|
|
2854
|
+
class ParallelQueryExecutor:
|
|
2855
|
+
"""The ParallelQueryExecutor provides mechanisms to conveniently execute queries in parallel.
|
|
2856
|
+
|
|
2857
|
+
The parallel execution happens by maintaining a number of worker threads that execute the incoming queries.
|
|
2858
|
+
The number of input queries can exceed the worker pool size, potentially by a large margin. If that is the case,
|
|
2859
|
+
input queries will be buffered until a worker is available.
|
|
2860
|
+
|
|
2861
|
+
This parallel executor is independent of the Database interface: it acts entirely on its own and
|
|
2862
|
+
is specific to Postgres.
|
|
2863
|
+
|
|
2864
|
+
Parameters
|
|
2865
|
+
----------
|
|
2866
|
+
connect_string : str
|
|
2867
|
+
Connection info to establish a network connection to the Postgres instance. Delegates to Psycopg
|
|
2868
|
+
n_threads : Optional[int], optional
|
|
2869
|
+
The maximum number of parallel workers to use. If this is not specified, ``os.cpu_count()`` workers are used.
|
|
2870
|
+
timeout : Optional[int], optional
|
|
2871
|
+
The number of seconds to wait until an individual query is aborted. Timeouts do not affect other queries (both those
|
|
2872
|
+
running in parallel or those running afterwards on the same worker). In case of a timeout, the query's entry in the
|
|
2873
|
+
result set will be *None*.
|
|
2874
|
+
verbose : bool, optional
|
|
2875
|
+
Whether to print logging information during the query execution. This is off by default.
|
|
2876
|
+
|
|
2877
|
+
See Also
|
|
2878
|
+
--------
|
|
2879
|
+
Database
|
|
2880
|
+
PostgresInterface
|
|
2881
|
+
|
|
2882
|
+
References
|
|
2883
|
+
----------
|
|
2884
|
+
|
|
2885
|
+
.. Psycopg v3: https://www.psycopg.org/psycopg3/ This is used internally by the Postgres interface to interact with the
|
|
2886
|
+
database
|
|
2887
|
+
"""
|
|
2888
|
+
|
|
2889
|
+
def __init__(
|
|
2890
|
+
self,
|
|
2891
|
+
connect_string: str,
|
|
2892
|
+
n_threads: Optional[int] = None,
|
|
2893
|
+
*,
|
|
2894
|
+
timeout: Optional[int] = None,
|
|
2895
|
+
verbose: bool = False,
|
|
2896
|
+
) -> None:
|
|
2897
|
+
self._n_threads = (
|
|
2898
|
+
n_threads if n_threads is not None and n_threads > 0 else os.cpu_count()
|
|
2899
|
+
)
|
|
2900
|
+
self._connect_string = connect_string
|
|
2901
|
+
self._timeout = timeout
|
|
2902
|
+
self._verbose = verbose
|
|
2903
|
+
|
|
2904
|
+
self._thread_data = threading.local()
|
|
2905
|
+
self._thread_pool = concurrent.futures.ThreadPoolExecutor(
|
|
2906
|
+
max_workers=self._n_threads,
|
|
2907
|
+
initializer=_parallel_query_initializer,
|
|
2908
|
+
initargs=(
|
|
2909
|
+
self._connect_string,
|
|
2910
|
+
self._thread_data,
|
|
2911
|
+
),
|
|
2912
|
+
)
|
|
2913
|
+
self._tasks: list[concurrent.futures.Future] = []
|
|
2914
|
+
self._results: list[Any] = []
|
|
2915
|
+
self._queries: dict[concurrent.futures.Future, SqlQuery | str] = {}
|
|
2916
|
+
|
|
2917
|
+
def queue_query(self, query: SqlQuery | str) -> None:
|
|
2918
|
+
"""Adds a new query to the queue, to be executed as soon as possible.
|
|
2919
|
+
|
|
2920
|
+
If a timeout was specified when creating the executor, this timeout will be applied to the query.
|
|
2921
|
+
|
|
2922
|
+
Parameters
|
|
2923
|
+
----------
|
|
2924
|
+
query : SqlQuery | str
|
|
2925
|
+
The query to execute
|
|
2926
|
+
"""
|
|
2927
|
+
future = self._thread_pool.submit(
|
|
2928
|
+
_parallel_query_worker,
|
|
2929
|
+
query,
|
|
2930
|
+
self._thread_data,
|
|
2931
|
+
self._timeout,
|
|
2932
|
+
self._verbose,
|
|
2933
|
+
)
|
|
2934
|
+
self._tasks.append(future)
|
|
2935
|
+
self._queries[future] = query
|
|
2936
|
+
|
|
2937
|
+
def drain_queue(
|
|
2938
|
+
self,
|
|
2939
|
+
timeout: Optional[float] = None,
|
|
2940
|
+
*,
|
|
2941
|
+
callback: Optional[Callable[[SqlQuery | str, ResultSet | None], None]] = None,
|
|
2942
|
+
) -> None:
|
|
2943
|
+
"""Blocks, until all queries currently queued have terminated.
|
|
2944
|
+
|
|
2945
|
+
Parameters
|
|
2946
|
+
----------
|
|
2947
|
+
timeout : Optional[float], optional
|
|
2948
|
+
The number of seconds to wait until the calculation is aborted. Defaults to *None*, which indicates no timeout,
|
|
2949
|
+
i.e. wait forever. Note that in contrast to the timeout specified when creating the executor, this timeout
|
|
2950
|
+
applies to the entire queue and not to individual queries. For example, one can set the per-query timeout to 1s
|
|
2951
|
+
which means that each query can be executed for at most 1 second. If an additional timeout of 10s is specified
|
|
2952
|
+
on the queue, the entire queue will be aborted if it takes longer than 10 seconds to complete.
|
|
2953
|
+
callback : Optional[Callable[[SqlQuery | str, ResultSet | None], None]], optional
|
|
2954
|
+
A callback to be executed with each query that completes. The callback receives the query that was executed and
|
|
2955
|
+
the corresponding (raw) result set as arguments. If the query ran into a timeout, the result set is *None*.
|
|
2956
|
+
|
|
2957
|
+
Raises
|
|
2958
|
+
------
|
|
2959
|
+
TimeoutError or concurrent.futures.TimeoutError
|
|
2960
|
+
If some queries have not completed after the given `timeout`.
|
|
2961
|
+
"""
|
|
2962
|
+
for future in concurrent.futures.as_completed(self._tasks, timeout=timeout):
|
|
2963
|
+
query, result_set = future.result()  # each worker returns a (query, result set) pair
|
|
2964
|
+
self._results.append((query, result_set))
|
|
2965
|
+
|
|
2966
|
+
if not callback:
|
|
2967
|
+
continue
|
|
2968
|
+
|
|
2969
|
+
query = self._queries[future]
|
|
2970
|
+
callback(query, result_set)
|
|
2971
|
+
|
|
2972
|
+
def result_set(self) -> dict[str | SqlQuery, ResultSet | None]:
|
|
2973
|
+
"""Provides the results of all queries that have terminated already, mapping query -> result set
|
|
2974
|
+
|
|
2975
|
+
Returns
|
|
2976
|
+
-------
|
|
2977
|
+
dict[str | SqlQuery, ResultSet | None]
|
|
2978
|
+
The query results. The raw result sets are provided without any simplification. If the query timed out, the result
|
|
2979
|
+
set is *None* (in contrast to empty result sets like `[]`).
|
|
2980
|
+
"""
|
|
2981
|
+
return dict(self._results)
|
|
2982
|
+
|
|
2983
|
+
def close(self) -> None:
|
|
2984
|
+
"""Terminates all worker threads. The executor is essentially useless afterwards."""
|
|
2985
|
+
self._thread_pool.shutdown()
|
|
2986
|
+
|
|
2987
|
+
def __repr__(self) -> str:
|
|
2988
|
+
return str(self)
|
|
2989
|
+
|
|
2990
|
+
def __str__(self) -> str:
|
|
2991
|
+
running_workers = [future for future in self._tasks if future.running()]
|
|
2992
|
+
completed_workers = [future for future in self._tasks if future.done()]
|
|
2993
|
+
|
|
2994
|
+
return (
|
|
2995
|
+
f"Concurrent query pool of {self._n_threads} workers, {len(self._tasks)} tasks "
|
|
2996
|
+
f"(run={len(running_workers)} fin={len(completed_workers)})"
|
|
2997
|
+
)
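# Usage sketch for the ParallelQueryExecutor above (a hedged example, not part of the public API):
# the connect string file and the example queries are placeholders for illustration only.
def _demo_parallel_execution() -> None:
    with open(".psycopg_connection", "r") as config_file:
        connect_string = config_file.readline().strip()

    executor = ParallelQueryExecutor(connect_string, n_threads=4, timeout=60)
    for query in ("SELECT COUNT(*) FROM title", "SELECT COUNT(*) FROM name"):
        executor.queue_query(query)

    # Blocks until all queued queries have finished; timed-out queries report a None result set.
    executor.drain_queue(callback=lambda query, result: print(query, "->", result))
    print(executor.result_set())
    executor.close()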
|
|
2998
|
+
|
|
2999
|
+
|
|
3000
|
+
def _timeout_query_worker(
|
|
3001
|
+
query: SqlQuery | str,
|
|
3002
|
+
*,
|
|
3003
|
+
pg_config: dict,
|
|
3004
|
+
result_send: mp_conn.Connection,
|
|
3005
|
+
err_send: mp_conn.Connection,
|
|
3006
|
+
backend_send: mp_conn.Connection,
|
|
3007
|
+
**kwargs,
|
|
3008
|
+
) -> None:
|
|
3009
|
+
"""Internal function to the `TimeoutQueryExecutor` to run individual queries.
|
|
3010
|
+
|
|
3011
|
+
Query results are sent via the `result_send` pipe, not as a return value. In case of any errors, these are sent via the
|
|
3012
|
+
`err_send` pipe. Therefore, it is best to check the `err_send` pipe first, before reading from the `result_send` pipe.
|
|
3013
|
+
|
|
3014
|
+
Parameters
|
|
3015
|
+
----------
|
|
3016
|
+
query : SqlQuery | str
|
|
3017
|
+
Query to execute
|
|
3018
|
+
pg_config : dict
|
|
3019
|
+
Picklable representation of the current Postgres connection. This is used to re-establish the connection in the parallel
|
|
3020
|
+
worker.
|
|
3021
|
+
result_send : mp_conn.Connection
|
|
3022
|
+
Pipe connection to send the query result
|
|
3023
|
+
err_send : mp_conn.Connection
|
|
3024
|
+
Pipe connection to send any errors that occurred during the query execution
|
|
3025
|
+
backend_send : mp_conn.Connection
|
|
3026
|
+
Pipe connection to send the backend PID
|
|
3027
|
+
kwargs : Any
|
|
3028
|
+
Additional parameters to pass to the `PostgresInterface.execute_query` method.
|
|
3029
|
+
"""
|
|
3030
|
+
try:
|
|
3031
|
+
connect_string = pg_config["connect_string"]
|
|
3032
|
+
cache_enabled = pg_config.get("cache_enabled", False)
|
|
3033
|
+
pg_instance = PostgresInterface(
|
|
3034
|
+
connect_string,
|
|
3035
|
+
application_name="PostBOUND Timeout Worker",
|
|
3036
|
+
cache_enabled=cache_enabled,
|
|
3037
|
+
)
|
|
3038
|
+
backend_send.send(pg_instance.backend_pid())
|
|
3039
|
+
pg_instance.apply_configuration(pg_config["config"])
|
|
3040
|
+
|
|
3041
|
+
result = pg_instance.execute_query(query, **kwargs)
|
|
3042
|
+
runtime = pg_instance.last_query_runtime()
|
|
3043
|
+
|
|
3044
|
+
result_send.send({"query_result": result, "runtime": runtime})
|
|
3045
|
+
except Exception as e:
|
|
3046
|
+
err_send.send(e)
|
|
3047
|
+
finally:
|
|
3048
|
+
pg_instance.close()
|
|
3049
|
+
|
|
3050
|
+
|
|
3051
|
+
class TimeoutQueryExecutor:
|
|
3052
|
+
"""The TimeoutQueryExecutor provides a mechanism to execute queries with a timeout attached.
|
|
3053
|
+
|
|
3054
|
+
If the query takes longer than the designated timeout, its execution is cancelled. The query execution itself is delegated
|
|
3055
|
+
to the `PostgresInterface`, so all its rules still apply. At the same time, using the timeout executor service can
|
|
3056
|
+
invalidate some of the state that is exposed by the database interface (see *Warnings* below). Therefore, the relevant
|
|
3057
|
+
variables should be refreshed once the timeout executor was used.
|
|
3058
|
+
|
|
3059
|
+
In addition to calling the `execute_query` method directly, the executor also implements *__call__* for more convenient
|
|
3060
|
+
access. Both methods accept the same parameters.
|
|
3061
|
+
|
|
3062
|
+
Parameters
|
|
3063
|
+
----------
|
|
3064
|
+
postgres_instance : Optional[PostgresInterface], optional
|
|
3065
|
+
Database to execute the queries. If omitted, this is inferred from the `DatabasePool`.
|
|
3066
|
+
|
|
3067
|
+
Warnings
|
|
3068
|
+
--------
|
|
3069
|
+
When a query gets cancelled due to the timeout being reached, the current cursor as well as database connection might be
|
|
3070
|
+
refreshed. Any direct references to these instances should no longer be used.
|
|
3071
|
+
"""
|
|
3072
|
+
|
|
3073
|
+
def __init__(self, postgres_instance: Optional[PostgresInterface] = None) -> None:
|
|
3074
|
+
self._pg_instance = (
|
|
3075
|
+
postgres_instance
|
|
3076
|
+
if postgres_instance is not None
|
|
3077
|
+
else DatabasePool.get_instance().current_database()
|
|
3078
|
+
)
|
|
3079
|
+
self._timeout_watchdog = psycopg.connect(
|
|
3080
|
+
self._pg_instance.connect_string,
|
|
3081
|
+
application_name="PostBOUND Timeout Watchdog",
|
|
3082
|
+
)
|
|
3083
|
+
|
|
3084
|
+
def execute_query(self, query: SqlQuery | str, timeout: float, **kwargs) -> Any:
|
|
3085
|
+
"""Runs a query on the database connection, cancelling if it takes longer than a specific timeout.
|
|
3086
|
+
|
|
3087
|
+
Parameters
|
|
3088
|
+
----------
|
|
3089
|
+
query : SqlQuery | str
|
|
3090
|
+
Query to execute
|
|
3091
|
+
timeout : float
|
|
3092
|
+
Maximum query execution time in seconds.
|
|
3093
|
+
**kwargs
|
|
3094
|
+
Additional parameters to pass to the `PostgresInterface.execute_query` method.
|
|
3095
|
+
|
|
3096
|
+
Returns
|
|
3097
|
+
-------
|
|
3098
|
+
Any
|
|
3099
|
+
The query result if it terminated timely. Rules from `PostgresInterface.execute_query` apply.
|
|
3100
|
+
|
|
3101
|
+
Raises
|
|
3102
|
+
------
|
|
3103
|
+
TimeoutError
|
|
3104
|
+
If the query execution was not finished after `timeout` seconds.
|
|
3105
|
+
|
|
3106
|
+
See Also
|
|
3107
|
+
--------
|
|
3108
|
+
PostgresInterface.execute_query
|
|
3109
|
+
PostgresInterface.reset_connection
|
|
3110
|
+
"""
|
|
3111
|
+
result_recv, result_send = mp.Pipe(False)
|
|
3112
|
+
error_recv, error_send = mp.Pipe(False)
|
|
3113
|
+
backend_recv, backend_send = mp.Pipe(False)
|
|
3114
|
+
query_worker = mp.Process(
|
|
3115
|
+
target=_timeout_query_worker,
|
|
3116
|
+
args=(query,),
|
|
3117
|
+
kwargs={
|
|
3118
|
+
"pg_config": self._pg_fingerprint(),
|
|
3119
|
+
"result_send": result_send,
|
|
3120
|
+
"err_send": error_send,
|
|
3121
|
+
"backend_send": backend_send,
|
|
3122
|
+
**kwargs,
|
|
3123
|
+
},
|
|
3124
|
+
)
|
|
3125
|
+
|
|
3126
|
+
query_worker.start()
|
|
3127
|
+
query_worker.join(timeout)
|
|
3128
|
+
|
|
3129
|
+
# We perform the timeout check before doing anything else to make sure that the worker process cannot terminate
|
|
3130
|
+
# immediately after the timeout has been reached. E.g., suppose that the query is still running after calling join().
|
|
3131
|
+
# If we checked the error pipe first, the query would have more time to terminate while we are performing
|
|
3132
|
+
# our error checks. This might result in an involuntary increase in the timeout duration. By keeping the timeout check
|
|
3133
|
+
# as close to the join() call as possible, we minimize this risk.
|
|
3134
|
+
timed_out = query_worker.is_alive()
|
|
3135
|
+
query_worker.terminate()
|
|
3136
|
+
query_worker.join()
|
|
3137
|
+
|
|
3138
|
+
# Now that we know whether the worker timed out or not, we need to make sure that it actually terminated properly
|
|
3139
|
+
# (or timed out). In case of an error, we just propagate it to the client.
|
|
3140
|
+
if error_recv.poll():
|
|
3141
|
+
self._pg_instance._last_query_runtime = math.nan
|
|
3142
|
+
self._abort_backend(backend_recv.recv())
|
|
3143
|
+
err = error_recv.recv()
|
|
3144
|
+
|
|
3145
|
+
query_worker.close()
|
|
3146
|
+
result_send.close()
|
|
3147
|
+
result_recv.close()
|
|
3148
|
+
error_send.close()
|
|
3149
|
+
error_recv.close()
|
|
3150
|
+
|
|
3151
|
+
raise err
|
|
3152
|
+
|
|
3153
|
+
# At this point we know that the worker either terminated in time or that it timed out, but it did not error.
|
|
3154
|
+
# Both the timeout and the termination case can be handled in a pretty straightforward manner.
|
|
3155
|
+
if timed_out:
|
|
3156
|
+
self._abort_backend(backend_recv.recv())
|
|
3157
|
+
query_result = None
|
|
3158
|
+
self._pg_instance._last_query_runtime = timeout
|
|
3159
|
+
else:
|
|
3160
|
+
raw_result = result_recv.recv()
|
|
3161
|
+
query_result = raw_result["query_result"]
|
|
3162
|
+
self._pg_instance._last_query_runtime = raw_result["runtime"]
|
|
3163
|
+
|
|
3164
|
+
query_worker.close()
|
|
3165
|
+
result_send.close()
|
|
3166
|
+
result_recv.close()
|
|
3167
|
+
error_send.close()
|
|
3168
|
+
error_recv.close()
|
|
3169
|
+
|
|
3170
|
+
if timed_out:
|
|
3171
|
+
raise TimeoutError(query)
|
|
3172
|
+
else:
|
|
3173
|
+
return query_result
|
|
3174
|
+
|
|
3175
|
+
def _pg_fingerprint(self) -> dict:
|
|
3176
|
+
"""Generate a pickable representation of the current Postgres connection."""
|
|
3177
|
+
return {
|
|
3178
|
+
"connect_string": self._pg_instance.connect_string,
|
|
3179
|
+
"cache_enabled": self._pg_instance.cache_enabled,
|
|
3180
|
+
"config": self._pg_instance.current_configuration(
|
|
3181
|
+
runtime_changeable_only=True
|
|
3182
|
+
),
|
|
3183
|
+
}
|
|
3184
|
+
|
|
3185
|
+
def _abort_backend(self, pid: int) -> None:
|
|
3186
|
+
with self._timeout_watchdog.cursor() as cursor:
|
|
3187
|
+
cursor.execute(f"SELECT pg_cancel_backend({pid});")
|
|
3188
|
+
self._timeout_watchdog.rollback()
|
|
3189
|
+
|
|
3190
|
+
def __call__(self, query: SqlQuery | str, timeout: float, **kwargs) -> Any:
|
|
3191
|
+
return self.execute_query(query, timeout, **kwargs)
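# Usage sketch for the TimeoutQueryExecutor above (a hedged example, not part of the public API):
# the example query and the 10 second budget are placeholders for illustration only.
def _demo_timeout_execution() -> None:
    pg = connect()
    run_with_timeout = TimeoutQueryExecutor(pg)
    try:
        # __call__ delegates to execute_query, so both spellings are equivalent
        result = run_with_timeout("SELECT COUNT(*) FROM title", 10.0)
        print(result)
    except TimeoutError:
        print("Query did not finish within 10 seconds")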
|
|
3192
|
+
|
|
3193
|
+
|
|
3194
|
+
PostgresExplainJoinNodes = {
|
|
3195
|
+
"Nested Loop": JoinOperator.NestedLoopJoin,
|
|
3196
|
+
"Hash Join": JoinOperator.HashJoin,
|
|
3197
|
+
"Merge Join": JoinOperator.SortMergeJoin,
|
|
3198
|
+
}
|
|
3199
|
+
"""A mapping from Postgres EXPLAIN node names to the corresponding join operators."""
|
|
3200
|
+
|
|
3201
|
+
PostgresExplainScanNodes = {
|
|
3202
|
+
"Seq Scan": ScanOperator.SequentialScan,
|
|
3203
|
+
"Index Scan": ScanOperator.IndexScan,
|
|
3204
|
+
"Index Only Scan": ScanOperator.IndexOnlyScan,
|
|
3205
|
+
"Bitmap Heap Scan": ScanOperator.BitmapScan,
|
|
3206
|
+
}
|
|
3207
|
+
"""A mapping from Postgres EXPLAIN node names to the corresponding scan operators."""
|
|
3208
|
+
|
|
3209
|
+
PostgresExplainIntermediateNodes = {
|
|
3210
|
+
"Materialize": IntermediateOperator.Materialize,
|
|
3211
|
+
"Memoize": IntermediateOperator.Memoize,
|
|
3212
|
+
"Sort": IntermediateOperator.Sort,
|
|
3213
|
+
}
|
|
3214
|
+
"""A mapping from Postgres EXPLAIN node names to the corresponding intermediate operators."""
|
|
3215
|
+
|
|
3216
|
+
|
|
3217
|
+
class PostgresExplainNode:
|
|
3218
|
+
"""Simplified model of a plan node as provided by Postgres' *EXPLAIN* output in JSON format.
|
|
3219
|
+
|
|
3220
|
+
Generally speaking, a node stores all the information about the plan node that we currently care about. This is mostly
|
|
3221
|
+
focused on optimizer statistics, along with some additional data. Explain nodes form a hierarchical structure with each
|
|
3222
|
+
node containing an arbitrary number of child nodes. Notice that this model is very loose in the sense that no constraints
|
|
3223
|
+
are enforced and no sanity checking is performed. For example, this means that nodes can contain more than two children
|
|
3224
|
+
even though this can never happen in a real *EXPLAIN* plan. Similarly, the correspondence between filter predicates and
|
|
3225
|
+
the node types (e.g. join filter for a join node) is not checked.
|
|
3226
|
+
|
|
3227
|
+
All relevant data from the explain node is exposed as attributes on the objects. Even though these are mutable, they should
|
|
3228
|
+
be thought of as read-only data objects.
|
|
3229
|
+
|
|
3230
|
+
Parameters
|
|
3231
|
+
----------
|
|
3232
|
+
explain_data : dict
|
|
3233
|
+
The JSON data of the current explain node. This is parsed and prepared as part of the *__init__* method.
|
|
3234
|
+
|
|
3235
|
+
Attributes
|
|
3236
|
+
----------
|
|
3237
|
+
node_type : str | None, default None
|
|
3238
|
+
The node type. This should never be empty or *None*, even though it is technically allowed.
|
|
3239
|
+
cost : float, default NaN
|
|
3240
|
+
The optimizer's cost estimation for this node. This includes the cost of all child nodes as well. This should normally
|
|
3241
|
+
not be *NaN*, even though it is technically allowed.
|
|
3242
|
+
cardinality_estimate : float, default NaN
|
|
3243
|
+
The optimizer's estimation of the number of tuples that will be *produced* by this operator. This should normally not
|
|
3244
|
+
be *NaN*, even though it is technically allowed.
|
|
3245
|
+
execution_time : float, default NaN
|
|
3246
|
+
For *EXPLAIN ANALYZE* plans, this is the actual total execution time of the node in seconds. For pure *EXPLAIN*
|
|
3247
|
+
plans, this is *NaN*
|
|
3248
|
+
true_cardinality : float, default NaN
|
|
3249
|
+
For *EXPLAIN ANALYZE* plans, this is the average of the number of tuples that were actually produced for each loop of
|
|
3250
|
+
the node. For pure *EXPLAIN* plans, this is *NaN*
|
|
3251
|
+
loops : int, default 1
|
|
3252
|
+
For *EXPLAIN ANALYZE* plans, this is the number of times the operator was invoked. The number of invocations can mean
|
|
3253
|
+
a number of different things: for parallel operators, this normally matches the number of parallel workers. For scans,
|
|
3254
|
+
this matches the number of times a new tuple was requested (e.g. for an index nested-loop join the number of loops of
|
|
3255
|
+
the index scan part indicates how many times the index was probed).
|
|
3256
|
+
relation_name : str | None, default None
|
|
3257
|
+
The name of the relation/table that is processed by this node. This should be defined on scan nodes, but could also
|
|
3258
|
+
be present on other nodes.
|
|
3259
|
+
relation_alias : str | None, default None
|
|
3260
|
+
The alias of the relation/table under which the relation was accessed in the query plan. See `relation_name`.
|
|
3261
|
+
index_name : str | None, default None
|
|
3262
|
+
The name of the index that was probed. This should be defined on index scans and index-only scans, but could also be
|
|
3263
|
+
present on other nodes.
|
|
3264
|
+
filter_condition : str | None, default None
|
|
3265
|
+
A post-processing filter that is applied to all rows emitted by this operator. This is most important for scan
|
|
3266
|
+
operations with an attached filter predicate, but can also be present on some joins.
|
|
3267
|
+
index_condition : str | None, default None
|
|
3268
|
+
The condition that is used to locate the matching tuples in an index scan or index-only scan
|
|
3269
|
+
join_filter : str | None, default None
|
|
3270
|
+
The condition that is used to determine matching tuples in a join
|
|
3271
|
+
hash_condition : str | None, default None
|
|
3272
|
+
The condition that is used to determine matching tuples in a hash join
|
|
3273
|
+
recheck_condition : str | None, default None
|
|
3274
|
+
For lossy bitmap scans or bitmap scans based on lossy indexes, this is a post-processing check for whether the produced
|
|
3275
|
+
tuples actually match the filter condition
|
|
3276
|
+
parent_relationship : str | None, default None
|
|
3277
|
+
Describes the role that this node plays in relation to its parent. Common values are *inner* which denotes that
|
|
3278
|
+
this is the inner child of a join and *outer* which denotes the opposite.
|
|
3279
|
+
parallel_workers : int | float, default NaN
|
|
3280
|
+
For parallel operators in *EXPLAIN ANALYZE* plans, this is the actual number of worker processes that were started.
|
|
3281
|
+
Notice that in total there is one additional worker. This process takes care of spawning the other workers and
|
|
3282
|
+
managing them, but can also take part in the input processing.
|
|
3283
|
+
sort_keys : list[str]
|
|
3284
|
+
The columns that are used to sort the tuples that are produced by this node. This is most important for sort nodes,
|
|
3285
|
+
but can also be present on other nodes.
|
|
3286
|
+
shared_blocks_read : float, default NaN
|
|
3287
|
+
For *EXPLAIN ANALYZE* plans with *BUFFERS* enabled, this is the number of blocks/pages that were retrieved from
|
|
3288
|
+
disk while executing this node, including the reads of all its child nodes.
|
|
3289
|
+
shared_blocks_cached : float, default NaN
|
|
3290
|
+
For *EXPLAIN ANALYZE* plans with *BUFFERS* enabled, this is the number of blocks/pages that were retrieved from
|
|
3291
|
+
the shared buffer while executing this node, including the hits of all its child nodes.
|
|
3292
|
+
temp_blocks_read : float, default NaN
|
|
3293
|
+
For *EXPLAIN ANALYZE* plans with *BUFFERS* enabled, this is the number of blocks of short-term data structures (e.g. hash
|
|
3294
|
+
tables, sorts) that were read by this node, including reads of all its child nodes.
|
|
3295
|
+
temp_blocks_written : float, default NaN
|
|
3296
|
+
For *EXPLAIN ANALYZE* plans with *BUFFERS* enabled, this is the number of blocks of short-term data structures (e.g. hash
|
|
3297
|
+
tables, sorts) that were written by this node, including writes of all its child nodes.
|
|
3298
|
+
plan_width : float, default NaN
|
|
3299
|
+
The average width of the tuples that are produced by this node.
|
|
3300
|
+
children : list[PostgresExplainNode]
|
|
3301
|
+
All child / input nodes for the current node
|
|
3302
|
+
"""
|
|
3303
|
+
|
|
3304
|
+
def __init__(self, explain_data: dict) -> None:
|
|
3305
|
+
self.node_type = explain_data.get("Node Type", None)
|
|
3306
|
+
|
|
3307
|
+
self.cost = explain_data.get("Total Cost", math.nan)
|
|
3308
|
+
self.cardinality_estimate = explain_data.get("Plan Rows", math.nan)
|
|
3309
|
+
self.execution_time = explain_data.get("Actual Total Time", math.nan) / 1000
|
|
3310
|
+
|
|
3311
|
+
# true_cardinality is accessed as a property to add a warning for BitmapAnd/Or nodes
|
|
3312
|
+
self._true_card = explain_data.get("Actual Rows", math.nan)
|
|
3313
|
+
|
|
3314
|
+
self.loops = explain_data.get("Actual Loops", 1)
|
|
3315
|
+
|
|
3316
|
+
self.relation_name = explain_data.get("Relation Name", None)
|
|
3317
|
+
self.relation_alias = explain_data.get("Alias", None)
|
|
3318
|
+
self.index_name = explain_data.get("Index Name", None)
|
|
3319
|
+
self.subplan_name = explain_data.get("Subplan Name", None)
|
|
3320
|
+
self.cte_name = explain_data.get("CTE Name", None)
|
|
3321
|
+
|
|
3322
|
+
self.filter_condition = explain_data.get("Filter", None)
|
|
3323
|
+
self.index_condition = explain_data.get("Index Cond", None)
|
|
3324
|
+
self.join_filter = explain_data.get("Join Filter", None)
|
|
3325
|
+
self.hash_condition = explain_data.get("Hash Cond", None)
|
|
3326
|
+
self.recheck_condition = explain_data.get("Recheck Cond", None)
|
|
3327
|
+
|
|
3328
|
+
self.parent_relationship = explain_data.get("Parent Relationship", None)
|
|
3329
|
+
self.parallel_workers = explain_data.get("Workers Launched", math.nan)
|
|
3330
|
+
if math.isnan(self.parallel_workers):
|
|
3331
|
+
self.parallel_workers = explain_data.get("Workers Planned", math.nan)
|
|
3332
|
+
self.sort_keys = explain_data.get("Sort Key", [])
|
|
3333
|
+
|
|
3334
|
+
self.shared_blocks_read = explain_data.get("Shared Read Blocks", math.nan)
|
|
3335
|
+
self.shared_blocks_cached = explain_data.get("Shared Hit Blocks", math.nan)
|
|
3336
|
+
self.temp_blocks_read = explain_data.get("Temp Read Blocks", math.nan)
|
|
3337
|
+
self.temp_blocks_written = explain_data.get("Temp Written Blocks", math.nan)
|
|
3338
|
+
self.plan_width = explain_data.get("Plan Width", math.nan)
|
|
3339
|
+
|
|
3340
|
+
self.children = [
|
|
3341
|
+
PostgresExplainNode(child) for child in explain_data.get("Plans", [])
|
|
3342
|
+
]
|
|
3343
|
+
|
|
3344
|
+
self.explain_data = explain_data
|
|
3345
|
+
self._hash_val = hash(
|
|
3346
|
+
(
|
|
3347
|
+
self.node_type,
|
|
3348
|
+
self.relation_name,
|
|
3349
|
+
self.relation_alias,
|
|
3350
|
+
self.index_name,
|
|
3351
|
+
self.subplan_name,
|
|
3352
|
+
self.cte_name,
|
|
3353
|
+
self.filter_condition,
|
|
3354
|
+
self.index_condition,
|
|
3355
|
+
self.join_filter,
|
|
3356
|
+
self.hash_condition,
|
|
3357
|
+
self.recheck_condition,
|
|
3358
|
+
self.parent_relationship,
|
|
3359
|
+
self.parallel_workers,
|
|
3360
|
+
tuple(self.children),
|
|
3361
|
+
)
|
|
3362
|
+
)
|
|
3363
|
+
|
|
3364
|
+
@property
|
|
3365
|
+
def true_cardinality(self) -> float:
|
|
3366
|
+
if self.node_type in {"BitmapAnd", "BitmapOr"}:
|
|
3367
|
+
# For BitmapAnd/BitmapOr nodes, the actual number of rows is always 0.
|
|
3368
|
+
# This is due to limitations in the Postgres implementation.
|
|
3369
|
+
warnings.warn(
|
|
3370
|
+
"Postgres does not report the actual number of rows for bitmap nodes correctly. Returning NaN."
|
|
3371
|
+
)
|
|
3372
|
+
return math.nan
|
|
3373
|
+
return self._true_card
|
|
3374
|
+
|
|
3375
|
+
def is_scan(self) -> bool:
|
|
3376
|
+
"""Checks, whether the current node corresponds to a scan node.
|
|
3377
|
+
|
|
3378
|
+
For Bitmap index scans, which are multi-level scan operators, this is true for the heap scan part that takes care of
|
|
3379
|
+
actually reading the tuples according to the bitmap provided by the bitmap index scan operators.
|
|
3380
|
+
|
|
3381
|
+
Returns
|
|
3382
|
+
-------
|
|
3383
|
+
bool
|
|
3384
|
+
Whether the node is a scan node
|
|
3385
|
+
"""
|
|
3386
|
+
return self.node_type in PostgresExplainScanNodes
|
|
3387
|
+
|
|
3388
|
+
def is_join(self) -> bool:
|
|
3389
|
+
"""Checks, whether the current node corresponds to a join node.
|
|
3390
|
+
|
|
3391
|
+
Returns
|
|
3392
|
+
-------
|
|
3393
|
+
bool
|
|
3394
|
+
Whether the node is a join node
|
|
3395
|
+
"""
|
|
3396
|
+
return self.node_type in PostgresExplainJoinNodes
|
|
3397
|
+
|
|
3398
|
+
def is_analyze(self) -> bool:
|
|
3399
|
+
"""Checks, whether this *EXPLAIN* plan is an *EXPLAIN ANALYZE* plan or a pure *EXPLAIN* plan.
|
|
3400
|
+
|
|
3401
|
+
The analyze variant does not only obtain the plan, but actually executes it. This enables the comparison of the
|
|
3402
|
+
optimizer's estimates to the actual values. If a plan is an *EXPLAIN ANALYZE* plan, some attributes of this node
|
|
3403
|
+
receive actual values. These include `execution_time`, `true_cardinality`, `loops` and `parallel_workers`.
|
|
3404
|
+
|
|
3405
|
+
|
|
3406
|
+
Returns
|
|
3407
|
+
-------
|
|
3408
|
+
bool
|
|
3409
|
+
Whether the node represents part of an *EXPLAIN ANALYZE* plan
|
|
3410
|
+
"""
|
|
3411
|
+
return not math.isnan(self.execution_time)
|
|
3412
|
+
|
|
3413
|
+
def filter_conditions(self) -> dict[str, str]:
|
|
3414
|
+
"""Collects all filter conditions that are defined on this node
|
|
3415
|
+
|
|
3416
|
+
Returns
|
|
3417
|
+
-------
|
|
3418
|
+
dict[str, str]
|
|
3419
|
+
A dictionary mapping the type of filter condition (e.g. index condition or join filter) to the actual filter value.
|
|
3420
|
+
"""
|
|
3421
|
+
conditions: dict[str, str] = {}
|
|
3422
|
+
if self.filter_condition is not None:
|
|
3423
|
+
conditions["Filter"] = self.filter_condition
|
|
3424
|
+
if self.index_condition is not None:
|
|
3425
|
+
conditions["Index Cond"] = self.index_condition
|
|
3426
|
+
if self.join_filter is not None:
|
|
3427
|
+
conditions["Join Filter"] = self.join_filter
|
|
3428
|
+
if self.hash_condition is not None:
|
|
3429
|
+
conditions["Hash Cond"] = self.hash_condition
|
|
3430
|
+
if self.recheck_condition is not None:
|
|
3431
|
+
conditions["Recheck Cond"] = self.recheck_condition
|
|
3432
|
+
return conditions
|
|
3433
|
+
|
|
3434
|
+
def inner_outer_children(self) -> Sequence[PostgresExplainNode]:
|
|
3435
|
+
"""Provides the children of this node in a sequence of inner, outer if applicable.
|
|
3436
|
+
|
|
3437
|
+
For all nodes where this structure is not meaningful (e.g. intermediate nodes that operate on a single relation or
|
|
3438
|
+
scan nodes), the child nodes are returned as-is (e.g. as a list of a single child or an empty list).
|
|
3439
|
+
|
|
3440
|
+
Returns
|
|
3441
|
+
-------
|
|
3442
|
+
Sequence[PostgresExplainNode]
|
|
3443
|
+
The children of the current node in a unified format
|
|
3444
|
+
"""
|
|
3445
|
+
if len(self.children) < 2:
|
|
3446
|
+
return self.children
|
|
3447
|
+
assert len(self.children) == 2
|
|
3448
|
+
|
|
3449
|
+
first_child, second_child = self.children
|
|
3450
|
+
inner_child = (
|
|
3451
|
+
first_child if first_child.parent_relationship == "Inner" else second_child
|
|
3452
|
+
)
|
|
3453
|
+
outer_child = first_child if second_child == inner_child else second_child
|
|
3454
|
+
return (inner_child, outer_child)
|
|
3455
|
+
|
|
3456
|
+
def parse_table(self) -> Optional[TableReference]:
|
|
3457
|
+
"""Provides the table that is processed by this node.
|
|
3458
|
+
|
|
3459
|
+
Returns
|
|
3460
|
+
-------
|
|
3461
|
+
Optional[TableReference]
|
|
3462
|
+
The table being scanned. For non-scan nodes, or nodes where no table can be inferred, *None* will be returned.
|
|
3463
|
+
"""
|
|
3464
|
+
if not self.relation_name:
|
|
3465
|
+
return None
|
|
3466
|
+
alias = (
|
|
3467
|
+
self.relation_alias
|
|
3468
|
+
if self.relation_alias is not None
|
|
3469
|
+
and self.relation_alias != self.relation_name
|
|
3470
|
+
else ""
|
|
3471
|
+
)
|
|
3472
|
+
return TableReference(self.relation_name, alias)
|
|
3473
|
+
|
|
3474
|
+
def as_qep(self) -> QueryPlan:
|
|
3475
|
+
"""Transforms the postgres-specific plan to a standardized `QueryPlan` instance.
|
|
3476
|
+
|
|
3477
|
+
Notice that this transformation is lossy since not all information from the Postgres plan can be represented in query
|
|
3478
|
+
execution plan instances. Furthermore, this transformation can be problematic for complicated queries that use
|
|
3479
|
+
special Postgres features. Most importantly, for queries involving subqueries, special node types and parent
|
|
3480
|
+
relationships can be contained in the plan, that cannot be represented by other parts of PostBOUND. If this method
|
|
3481
|
+
and the resulting query execution plans should be used on complex workloads, it is advisable to check the plans twice
|
|
3482
|
+
before continuing.
|
|
3483
|
+
|
|
3484
|
+
Returns
|
|
3485
|
+
-------
|
|
3486
|
+
QueryPlan
|
|
3487
|
+
The equivalent query execution plan for this node
|
|
3488
|
+
|
|
3489
|
+
Raises
|
|
3490
|
+
------
|
|
3491
|
+
ValueError
|
|
3492
|
+
If the node contains more than two children.
|
|
3493
|
+
"""
|
|
3494
|
+
child_nodes = []
|
|
3495
|
+
inner_child, outer_child, subplan_child = None, None, None
|
|
3496
|
+
for child in self.children:
|
|
3497
|
+
parent_rel = child.parent_relationship
|
|
3498
|
+
qep_child = child.as_qep()
|
|
3499
|
+
|
|
3500
|
+
match parent_rel:
|
|
3501
|
+
case "Inner":
|
|
3502
|
+
inner_child = qep_child
|
|
3503
|
+
case "Outer":
|
|
3504
|
+
outer_child = qep_child
|
|
3505
|
+
case "SubPlan" | "InitPlan" | "Subquery":
|
|
3506
|
+
subplan_child = qep_child
|
|
3507
|
+
case "Member":
|
|
3508
|
+
child_nodes.append(qep_child)
|
|
3509
|
+
case _:
|
|
3510
|
+
raise ValueError(
|
|
3511
|
+
f"Unknown parent relationship '{parent_rel}' for child {child}"
|
|
3512
|
+
)
|
|
3513
|
+
|
|
3514
|
+
if inner_child and outer_child:
|
|
3515
|
+
child_nodes = [outer_child, inner_child] + child_nodes
|
|
3516
|
+
elif outer_child:
|
|
3517
|
+
child_nodes.insert(0, outer_child)
|
|
3518
|
+
elif inner_child:
|
|
3519
|
+
child_nodes.insert(0, inner_child)
|
|
3520
|
+
|
|
3521
|
+
table = self.parse_table()
|
|
3522
|
+
subplan_name = self.subplan_name or self.cte_name
|
|
3523
|
+
true_card = self.true_cardinality * self.loops
|
|
3524
|
+
|
|
3525
|
+
if self.is_scan():
|
|
3526
|
+
operator = PostgresExplainScanNodes.get(self.node_type, None)
|
|
3527
|
+
elif self.is_join():
|
|
3528
|
+
operator = PostgresExplainJoinNodes.get(self.node_type, None)
|
|
3529
|
+
else:
|
|
3530
|
+
operator = PostgresExplainIntermediateNodes.get(self.node_type, None)
|
|
3531
|
+
|
|
3532
|
+
sort_keys = (
|
|
3533
|
+
self._parse_sort_keys()
|
|
3534
|
+
if self.sort_keys
|
|
3535
|
+
else self._infer_sorting_from_children()
|
|
3536
|
+
)
|
|
3537
|
+
shared_hits = (
|
|
3538
|
+
None if math.isnan(self.shared_blocks_cached) else self.shared_blocks_cached
|
|
3539
|
+
)
|
|
3540
|
+
shared_misses = (
|
|
3541
|
+
None if math.isnan(self.shared_blocks_read) else self.shared_blocks_read
|
|
3542
|
+
)
|
|
3543
|
+
par_workers = (
|
|
3544
|
+
None if math.isnan(self.parallel_workers) else self.parallel_workers
|
|
3545
|
+
)
|
|
3546
|
+
|
|
3547
|
+
return QueryPlan(
|
|
3548
|
+
self.node_type,
|
|
3549
|
+
base_table=table,
|
|
3550
|
+
operator=operator,
|
|
3551
|
+
children=child_nodes,
|
|
3552
|
+
parallel_workers=par_workers,
|
|
3553
|
+
index=self.index_name,
|
|
3554
|
+
sort_keys=sort_keys,
|
|
3555
|
+
estimated_cost=self.cost,
|
|
3556
|
+
estimated_cardinality=Cardinality(self.cardinality_estimate),
|
|
3557
|
+
actual_cardinality=Cardinality(true_card),
|
|
3558
|
+
execution_time=self.execution_time,
|
|
3559
|
+
cache_hits=shared_hits,
|
|
3560
|
+
cache_misses=shared_misses,
|
|
3561
|
+
subplan_root=subplan_child,
|
|
3562
|
+
subplan_name=subplan_name,
|
|
3563
|
+
)
|
|
3564
|
+
|
|
3565
|
+
def inspect(self, *, _indentation: int = 0) -> str:
|
|
3566
|
+
"""Provides a pretty string representation of the *EXPLAIN* sub-plan that can be printed.
|
|
3567
|
+
|
|
3568
|
+
Parameters
|
|
3569
|
+
----------
|
|
3570
|
+
_indentation : int, optional
|
|
3571
|
+
This parameter is internal to the method and ensures that the correct indentation is used for the child nodes
|
|
3572
|
+
of the plan. When inspecting the root node, this value is set to its default value of `0`.
|
|
3573
|
+
|
|
3574
|
+
Returns
|
|
3575
|
+
-------
|
|
3576
|
+
str
|
|
3577
|
+
A string representation of the *EXPLAIN* sub-plan.
|
|
3578
|
+
"""
|
|
3579
|
+
if self.parent_relationship in ("InitPlan", "SubPlan"):
|
|
3580
|
+
padding = " " * (max(_indentation - 2, 0))
|
|
3581
|
+
cte_name = self.subplan_name if self.subplan_name else ""
|
|
3582
|
+
own_inspection = [f"{padding}{self.parent_relationship}: {cte_name}"]
|
|
3583
|
+
else:
|
|
3584
|
+
own_inspection = []
|
|
3585
|
+
padding = " " * _indentation
|
|
3586
|
+
prefix = f"{padding}<- " if padding else ""
|
|
3587
|
+
own_inspection += [prefix + str(self)]
|
|
3588
|
+
child_inspections = [
|
|
3589
|
+
child.inspect(_indentation=_indentation + 2) for child in self.children
|
|
3590
|
+
]
|
|
3591
|
+
return "\n".join(own_inspection + child_inspections)
|
|
3592
|
+
|
|
3593
|
+
def _infer_sorting_from_children(self) -> list[SortKey]:
|
|
3594
|
+
# TODO: Postgres is a cruel mistress. Even if output is sorted, it might not be marked as such.
|
|
3595
|
+
# For example, in index scans, this is implicitly encoded in the index condition, sometimes even nested in other
|
|
3596
|
+
# expressions. We first need a reliable way to parse the expressions into a PostBOUND-compatible format.
|
|
3597
|
+
# See _parse_sort_keys for a start.
|
|
3598
|
+
return None
|
|
3599
|
+
|
|
3600
|
+
def _parse_sort_keys(self) -> list[SortKey]:
|
|
3601
|
+
# TODO implementation
|
|
3602
|
+
return None
|
|
3603
|
+
|
|
3604
|
+
def __hash__(self) -> int:
|
|
3605
|
+
return self._hash_val
|
|
3606
|
+
|
|
3607
|
+
def __eq__(self, other: object) -> bool:
|
|
3608
|
+
return (
|
|
3609
|
+
isinstance(other, type(self))
|
|
3610
|
+
and self.node_type == other.node_type
|
|
3611
|
+
and self.relation_name == other.relation_name
|
|
3612
|
+
and self.relation_alias == other.relation_alias
|
|
3613
|
+
and self.children == other.children
|
|
3614
|
+
)
|
|
3615
|
+
|
|
3616
|
+
def __repr__(self) -> str:
|
|
3617
|
+
return str(self)
|
|
3618
|
+
|
|
3619
|
+
def __str__(self) -> str:
|
|
3620
|
+
analyze_content = (
|
|
3621
|
+
f" (actual time={self.execution_time}s rows={self.true_cardinality} loops={self.loops})"
|
|
3622
|
+
if self.is_analyze()
|
|
3623
|
+
else ""
|
|
3624
|
+
)
|
|
3625
|
+
explain_content = f"(cost={self.cost} rows={self.cardinality_estimate})"
|
|
3626
|
+
conditions = " ".join(
|
|
3627
|
+
f"{condition}: {value}"
|
|
3628
|
+
for condition, value in self.filter_conditions().items()
|
|
3629
|
+
)
|
|
3630
|
+
conditions = " " + conditions if conditions else ""
|
|
3631
|
+
if self.is_scan():
|
|
3632
|
+
scan_info = f" on {self.parse_table().identifier()}"
|
|
3633
|
+
elif self.cte_name:
|
|
3634
|
+
scan_info = f" on {self.cte_name}"
|
|
3635
|
+
else:
|
|
3636
|
+
scan_info = ""
|
|
3637
|
+
return (
|
|
3638
|
+
self.node_type + scan_info + explain_content + analyze_content + conditions
|
|
3639
|
+
)
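# Usage sketch for PostgresExplainNode (a hedged example, not part of the public API): the raw
# EXPLAIN (FORMAT JSON) fragment below is hand-written, with a made-up table name and numbers.
def _demo_explain_node_parsing() -> None:
    raw_node = {
        "Node Type": "Seq Scan",
        "Relation Name": "title",
        "Alias": "t",
        "Total Cost": 123.45,
        "Plan Rows": 4242,
        "Plans": [],
    }
    node = PostgresExplainNode(raw_node)
    print(node.is_scan())  # True: "Seq Scan" is one of the known scan node types
    print(node.parse_table())  # TableReference for relation "title" with alias "t"
    print(node.inspect())  # pretty-printed representation of this (single-node) plan fragment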
|
|
3640
|
+
|
|
3641
|
+
|
|
3642
|
+
class PostgresExplainPlan:
|
|
3643
|
+
"""Models an entire *EXPLAIN* plan produced by Postgres
|
|
3644
|
+
|
|
3645
|
+
In contrast to `PostgresExplainNode`, this includes additional parameters (planning time and execution time) for the entire
|
|
3646
|
+
plan, rather than just portions of it.
|
|
3647
|
+
|
|
3648
|
+
This class supports all methods that are specified on the general `QueryPlan` and returns the correct data for its actual
|
|
3649
|
+
plan.
|
|
3650
|
+
|
|
3651
|
+
Parameters
|
|
3652
|
+
----------
|
|
3653
|
+
explain_data : dict
|
|
3654
|
+
The JSON data of the entire explain plan. This is parsed and prepared as part of the *__init__* method.
|
|
3655
|
+
|
|
3656
|
+
|
|
3657
|
+
Attributes
|
|
3658
|
+
----------
|
|
3659
|
+
planning_time : float
|
|
3660
|
+
The time in seconds that the optimizer spent to build the plan
|
|
3661
|
+
execution_time : float
|
|
3662
|
+
The time in seconds the query execution engine needed to calculate the result set of the query. This does not account
|
|
3663
|
+
for network time to transmit the result set.
|
|
3664
|
+
query_plan : PostgresExplainNode
|
|
3665
|
+
The actual plan
|
|
3666
|
+
"""
|
|
3667
|
+
|
|
3668
|
+
def __init__(self, explain_data: dict) -> None:
|
|
3669
|
+
self.explain_data = (
|
|
3670
|
+
explain_data[0] if isinstance(explain_data, list) else explain_data
|
|
3671
|
+
)
|
|
3672
|
+
self.planning_time: float = (
|
|
3673
|
+
self.explain_data.get("Planning Time", math.nan) / 1000
|
|
3674
|
+
)
|
|
3675
|
+
self.execution_time: float = (
|
|
3676
|
+
self.explain_data.get("Execution Time", math.nan) / 1000
|
|
3677
|
+
)
|
|
3678
|
+
self.query_plan = PostgresExplainNode(self.explain_data["Plan"])
|
|
3679
|
+
self._normalized_plan = self.query_plan.as_qep()
|
|
3680
|
+
|
|
3681
|
+
@property
|
|
3682
|
+
def root(self) -> PostgresExplainNode:
|
|
3683
|
+
"""Gets the root node of the actual query plan."""
|
|
3684
|
+
return self.query_plan
|
|
3685
|
+
|
|
3686
|
+
def is_analyze(self) -> bool:
|
|
3687
|
+
"""Checks, whether this *EXPLAIN* plan is an *EXPLAIN ANALYZE* plan or a pure *EXPLAIN* plan.
|
|
3688
|
+
|
|
3689
|
+
The analyze variant does not only obtain the plan, but actually executes it. This enables the comparison of the
|
|
3690
|
+
optimizer's estimates to the actual values. If a plan is an *EXPLAIN ANALYZE* plan, some attributes of its plan nodes
|
|
3691
|
+
receive actual values. These include `execution_time`, `true_cardinality`, `loops` and `parallel_workers`.
|
|
3692
|
+
|
|
3693
|
+
|
|
3694
|
+
Returns
|
|
3695
|
+
-------
|
|
3696
|
+
bool
|
|
3697
|
+
Whether the plan represents an *EXPLAIN ANALYZE* plan
|
|
3698
|
+
"""
|
|
3699
|
+
return self.query_plan.is_analyze()
|
|
3700
|
+
|
|
3701
|
+
def as_qep(self) -> QueryPlan:
|
|
3702
|
+
"""Provides the actual explain plan as a normalized query execution plan instance
|
|
3703
|
+
|
|
3704
|
+
For notes on peculiarities of this method, take a look at the *See Also* section
|
|
3705
|
+
|
|
3706
|
+
Returns
|
|
3707
|
+
-------
|
|
3708
|
+
QueryPlan
|
|
3709
|
+
The query execution plan
|
|
3710
|
+
|
|
3711
|
+
See Also
|
|
3712
|
+
--------
|
|
3713
|
+
PostgresExplainNode.as_qep
|
|
3714
|
+
"""
|
|
3715
|
+
return self._normalized_plan
|
|
3716
|
+
|
|
3717
|
+
def inspect(self) -> str:
|
|
3718
|
+
"""Provides a pretty string representation of the actual plan.
|
|
3719
|
+
|
|
3720
|
+
Returns
|
|
3721
|
+
-------
|
|
3722
|
+
str
|
|
3723
|
+
A string representation of the plan
|
|
3724
|
+
|
|
3725
|
+
See Also
|
|
3726
|
+
--------
|
|
3727
|
+
PostgresExplainNode.inspect
|
|
3728
|
+
"""
|
|
3729
|
+
return self.query_plan.inspect()
|
|
3730
|
+
|
|
3731
|
+
def __json__(self) -> Any:
|
|
3732
|
+
return self.explain_data
|
|
3733
|
+
|
|
3734
|
+
def __getattribute__(self, name: str) -> Any:
|
|
3735
|
+
# All methods that are not defined on the Postgres plan delegate to the default DB plan
|
|
3736
|
+
try:
|
|
3737
|
+
return object.__getattribute__(self, name)
|
|
3738
|
+
except AttributeError:
|
|
3739
|
+
root_plan_node = object.__getattribute__(self, "query_plan")
|
|
3740
|
+
try:
|
|
3741
|
+
return root_plan_node.__getattribute__(name)
|
|
3742
|
+
except AttributeError:
|
|
3743
|
+
normalized_plan = object.__getattribute__(self, "_normalized_plan")
|
|
3744
|
+
return normalized_plan.__getattribute__(name)
|
|
3745
|
+
|
|
3746
|
+
def __hash__(self) -> int:
|
|
3747
|
+
return hash(self.query_plan)
|
|
3748
|
+
|
|
3749
|
+
def __eq__(self, other: object) -> bool:
|
|
3750
|
+
return isinstance(other, type(self)) and self.query_plan == other.query_plan
|
|
3751
|
+
|
|
3752
|
+
def __repr__(self) -> str:
|
|
3753
|
+
return str(self)
|
|
3754
|
+
|
|
3755
|
+
def __str__(self) -> str:
|
|
3756
|
+
if self.is_analyze():
|
|
3757
|
+
prefix = f"EXPLAIN ANALYZE (plan time={self.planning_time}, exec time={self.execution_time})"
|
|
3758
|
+
else:
|
|
3759
|
+
prefix = "EXPLAIN"
|
|
3760
|
+
|
|
3761
|
+
return f"{prefix} root: {self.query_plan}"
|
|
3762
|
+
|
|
3763
|
+
|
|
3764
|
+
class WorkloadShifter:
|
|
3765
|
+
"""The shifter provides simple means to manipulate the current contents of a database.
|
|
3766
|
+
|
|
3767
|
+
Currently, such means only include the deletion of specific rows, but other tools could be added in the future.
|
|
3768
|
+
|
|
3769
|
+
Parameters
|
|
3770
|
+
----------
|
|
3771
|
+
pg_instance : PostgresInterface
|
|
3772
|
+
The database to manipulate
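Examples
--------
A rough sketch, assuming `pg_instance` is an already connected `PostgresInterface`; the table name is purely
illustrative:

>>> shifter = WorkloadShifter(pg_instance)
>>> shifter.remove_random("title", row_pct=0.05, vacuum=True)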
|
|
3773
|
+
"""
|
|
3774
|
+
|
|
3775
|
+
def __init__(self, pg_instance: PostgresInterface) -> None:
|
|
3776
|
+
self.pg_instance = pg_instance
|
|
3777
|
+
|
|
3778
|
+
def remove_random(
|
|
3779
|
+
self,
|
|
3780
|
+
table: TableReference | str,
|
|
3781
|
+
*,
|
|
3782
|
+
n_rows: Optional[int] = None,
|
|
3783
|
+
row_pct: Optional[float] = None,
|
|
3784
|
+
vacuum: bool = False,
|
|
3785
|
+
) -> None:
|
|
3786
|
+
"""Deletes tuples from a specific tables at random.
|
|
3787
|
+
|
|
3788
|
+
Parameters
|
|
3789
|
+
----------
|
|
3790
|
+
table : TableReference | str
|
|
3791
|
+
The table from which to delete
|
|
3792
|
+
n_rows : Optional[int], optional
|
|
3793
|
+
The absolute number of rows to delete. Defaults to *None* in which case the `row_pct` is used.
|
|
3794
|
+
row_pct : Optional[float], optional
|
|
3795
|
+
The share of rows to delete. Value should be in range (0, 1). Defaults to *None* in which case the `n_rows` is
|
|
3796
|
+
used.
|
|
3797
|
+
vacuum : bool, optional
|
|
3798
|
+
Whether the database should be vacuumed after deletion. This optimizes the page layout by compacting the pages and
|
|
3799
|
+
forces a refresh of all statistics.
|
|
3800
|
+
|
|
3801
|
+
Raises
|
|
3802
|
+
------
|
|
3803
|
+
ValueError
|
|
3804
|
+
If no correct `n_rows` or `row_pct` values have been given.
|
|
3805
|
+
|
|
3806
|
+
Warnings
|
|
3807
|
+
--------
|
|
3808
|
+
Notice that deletions in the given table can trigger further deletions in other tables through cascades in the schema.
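Examples
--------
A sketch with a purely illustrative table name; `shifter` is assumed to be a `WorkloadShifter` instance:

>>> shifter.remove_random("title", n_rows=1000)
>>> shifter.remove_random("title", row_pct=0.1, vacuum=True)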
|
|
3809
|
+
"""
|
|
3810
|
+
table_name = table.full_name if isinstance(table, TableReference) else table
|
|
3811
|
+
n_rows = self._determine_row_cnt(table_name, n_rows, row_pct)
|
|
3812
|
+
pk_column = self.pg_instance.schema().primary_key_column(table_name)
|
|
3813
|
+
removal_template = textwrap.dedent("""
|
|
3814
|
+
WITH delete_samples AS (
|
|
3815
|
+
SELECT {col} AS sample_id, RANDOM() AS _pb_rand_val
|
|
3816
|
+
FROM {table}
|
|
3817
|
+
ORDER BY _pb_rand_val
|
|
3818
|
+
LIMIT {cnt}
|
|
3819
|
+
)
|
|
3820
|
+
DELETE FROM {table}
|
|
3821
|
+
WHERE EXISTS (SELECT 1 FROM delete_samples WHERE sample_id = {col})
|
|
3822
|
+
""")
|
|
3823
|
+
removal_query = removal_template.format(
|
|
3824
|
+
table=table_name, col=pk_column.name, cnt=n_rows
|
|
3825
|
+
)
|
|
3826
|
+
self._perform_removal(removal_query, vacuum)
|
|
3827
|
+
|
|
3828
|
+
def remove_ordered(
|
|
3829
|
+
self,
|
|
3830
|
+
column: ColumnReference | str,
|
|
3831
|
+
*,
|
|
3832
|
+
n_rows: Optional[int] = None,
|
|
3833
|
+
row_pct: Optional[float] = None,
|
|
3834
|
+
ascending: bool = True,
|
|
3835
|
+
null_placement: Optional[Literal["first", "last"]] = None,
|
|
3836
|
+
vacuum: bool = False,
|
|
3837
|
+
) -> None:
|
|
3838
|
+
"""Deletes the smallest/largest tuples from a specific table.
|
|
3839
|
+
|
|
3840
|
+
Parameters
|
|
3841
|
+
----------
|
|
3842
|
+
column : ColumnReference | str
|
|
3843
|
+
The column to infer the deletion order. Can be either a proper column reference including the containing table, or
|
|
3844
|
+
a fully-qualified column string such as *table.column*.
|
|
3845
|
+
n_rows : Optional[int], optional
|
|
3846
|
+
The absolute number of rows to delete. Defaults to *None* in which case the `row_pct` is used.
|
|
3847
|
+
row_pct : Optional[float], optional
|
|
3848
|
+
The share of rows to delete. Value should be in range (0, 1). Defaults to *None* in which case the `n_rows` is
|
|
3849
|
+
used.
|
|
3850
|
+
ascending : bool, optional
|
|
3851
|
+
Whether the first or the last rows should be deleted. *NULL* values are placed according to `null_placement`.
|
|
3852
|
+
null_placement : Optional[Literal["first", "last"]], optional
|
|
3853
|
+
Where to put *NULL* values in the order. Using the default value of *None* treats *NULL* values as being the
|
|
3854
|
+
largest values possible.
|
|
3855
|
+
vacuum : bool, optional
|
|
3856
|
+
Whether the database should be vacuumed after deletion. This optimizes the page layout by compacting the pages and
|
|
3857
|
+
forces a refresh of all statistics.
|
|
3858
|
+
|
|
3859
|
+
Raises
|
|
3860
|
+
------
|
|
3861
|
+
ValueError
|
|
3862
|
+
If no correct `n_rows` or `row_pct` values have been given.
|
|
3863
|
+
|
|
3864
|
+
Warnings
|
|
3865
|
+
--------
|
|
3866
|
+
Notice that deletions in the given table can trigger further deletions in other tables through cascades in the schema.
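Examples
--------
A sketch that deletes the 1000 rows with the largest values of a column; table and column names are purely
illustrative and `shifter` is assumed to be a `WorkloadShifter` instance:

>>> shifter.remove_ordered("title.production_year", n_rows=1000, ascending=False)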
|
|
3867
|
+
"""
|
|
3868
|
+
|
|
3869
|
+
if isinstance(column, str):
|
|
3870
|
+
table_name, col_name = column.split(".")
|
|
3871
|
+
elif isinstance(column, ColumnReference):
|
|
3872
|
+
table_name, col_name = column.table.full_name, column.name
|
|
3873
|
+
else:
|
|
3874
|
+
raise TypeError("Unknown column type: " + str(column))
|
|
3875
|
+
n_rows = self._determine_row_cnt(table_name, n_rows, row_pct)
|
|
3876
|
+
pk_column = self.pg_instance.schema().primary_key_column(table_name)
|
|
3877
|
+
order_direction = "ASC" if ascending else "DESC"
|
|
3878
|
+
null_vals = "" if null_placement is None else f"NULLS {null_placement.upper()}"
|
|
3879
|
+
removal_template = textwrap.dedent("""
|
|
3880
|
+
WITH delete_entries AS (
|
|
3881
|
+
SELECT {pk_col}
|
|
3882
|
+
FROM {table}
|
|
3883
|
+
ORDER BY {order_col} {order_dir} {nulls}, {pk_col} ASC
|
|
3884
|
+
LIMIT {cnt}
|
|
3885
|
+
)
|
|
3886
|
+
DELETE FROM {table} t
|
|
3887
|
+
WHERE EXISTS (SELECT 1 FROM delete_entries
|
|
3888
|
+
WHERE delete_entries.{pk_col} = t.{pk_col})
|
|
3889
|
+
""")
|
|
3890
|
+
removal_query = removal_template.format(
|
|
3891
|
+
table=table_name,
|
|
3892
|
+
pk_col=pk_column.name,
|
|
3893
|
+
order_col=col_name,
|
|
3894
|
+
order_dir=order_direction,
|
|
3895
|
+
nulls=null_vals,
|
|
3896
|
+
cnt=n_rows,
|
|
3897
|
+
)
|
|
3898
|
+
self._perform_removal(removal_query, vacuum)
|
|
3899
|
+
|
|
3900
|
+
def generate_marker_table(
|
|
3901
|
+
self,
|
|
3902
|
+
target_table: str,
|
|
3903
|
+
marker_pct: float = 0.5,
|
|
3904
|
+
*,
|
|
3905
|
+
target_column: str = "id",
|
|
3906
|
+
marker_table: Optional[str] = None,
|
|
3907
|
+
marker_column: Optional[str] = None,
|
|
3908
|
+
) -> None:
|
|
3909
|
+
"""Generates a new table that can be used to store rows that should be deleted at a later point in time.
|
|
3910
|
+
|
|
3911
|
+
The marker table will be created if it does not exist already. It contains exactly two columns: one column for the
|
|
3912
|
+
marker index (an ascending integer value) and another column that stores the primary keys of rows that should be
|
|
3913
|
+
deleted from the target table. If the marker table exists already, all current markings (but not the marked rows
|
|
3914
|
+
themselves) are removed. Afterwards, the new rows to delete are selected at random.
|
|
3915
|
+
|
|
3916
|
+
By default, only the target table is a required parameter. All other parameters have default values or can be inferred
|
|
3917
|
+
from the target table. The marker index column is *marker_idx*.
|
|
3918
|
+
|
|
3919
|
+
Parameters
|
|
3920
|
+
----------
|
|
3921
|
+
target_table : str
|
|
3922
|
+
The table from which rows should be removed
|
|
3923
|
+
marker_pct : float
|
|
3924
|
+
The percentage of rows that should be included in the marker table. Allowed range is *[0, 1]*.
|
|
3925
|
+
target_column : str, optional
|
|
3926
|
+
The column that contains the values used to identify the rows to be deleted in the target table. Defaults to *id*.
|
|
3927
|
+
marker_table : Optional[str], optional
|
|
3928
|
+
The name of the marker table that should store the row identifiers. Defaults to
|
|
3929
|
+
*<target table name>_delete_marker*.
|
|
3930
|
+
marker_column : Optional[str], optional
|
|
3931
|
+
The name of the column in the marker table that should contain the target column values. Defaults to
|
|
3932
|
+
*<target table name>_<target column name>*.
|
|
3933
|
+
|
|
3934
|
+
See Also
|
|
3935
|
+
--------
|
|
3936
|
+
remove_marked
|
|
3937
|
+
export_marker_table
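Examples
--------
Marking roughly 30 percent of the rows of a table for later deletion, using the default marker table and column
names; the table name is purely illustrative and `shifter` is assumed to be a `WorkloadShifter` instance:

>>> shifter.generate_marker_table("title", 0.3)
>>> shifter.remove_marked("title")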
|
|
3938
|
+
"""
|
|
3939
|
+
marker_table = (
|
|
3940
|
+
f"{target_table}_delete_marker" if marker_table is None else marker_table
|
|
3941
|
+
)
|
|
3942
|
+
marker_column = (
|
|
3943
|
+
f"{target_table}_{target_column}"
|
|
3944
|
+
if marker_column is None
|
|
3945
|
+
else marker_column
|
|
3946
|
+
)
|
|
3947
|
+
target_col_ref = ColumnReference(target_column, TableReference(target_table))
|
|
3948
|
+
target_column_type = self.pg_instance.schema().datatype(target_col_ref)
|
|
3949
|
+
marker_create_query = textwrap.dedent(f"""
|
|
3950
|
+
CREATE TABLE IF NOT EXISTS {marker_table} (
|
|
3951
|
+
marker_idx BIGSERIAL PRIMARY KEY,
|
|
3952
|
+
{marker_column} {target_column_type}
|
|
3953
|
+
);
|
|
3954
|
+
""")
|
|
3955
|
+
marker_pct = round(marker_pct * 100)
|
|
3956
|
+
marker_inflate_query = textwrap.dedent(f"""
|
|
3957
|
+
INSERT INTO {marker_table}({marker_column})
|
|
3958
|
+
SELECT {target_column}
|
|
3959
|
+
FROM {target_table} TABLESAMPLE BERNOULLI ({marker_pct});
|
|
3960
|
+
""")
|
|
3961
|
+
with self.pg_instance.obtain_new_local_connection() as conn:
|
|
3962
|
+
cursor = conn.cursor()
|
|
3963
|
+
cursor.execute(marker_create_query)
|
|
3964
|
+
cursor.execute(f"DELETE FROM {marker_table};")
|
|
3965
|
+
cursor.execute(marker_inflate_query)
|
|
3966
|
+
|
|
3967
|
+
def export_marker_table(
|
|
3968
|
+
self,
|
|
3969
|
+
*,
|
|
3970
|
+
target_table: Optional[str] = None,
|
|
3971
|
+
marker_table: Optional[str] = None,
|
|
3972
|
+
out_file: Optional[str] = None,
|
|
3973
|
+
) -> None:
|
|
3974
|
+
"""Stores a marker table in a CSV file on disk.
|
|
3975
|
+
|
|
3976
|
+
This allows the marker table to be re-imported later on.
|
|
3977
|
+
|
|
3978
|
+
Parameters
|
|
3979
|
+
----------
|
|
3980
|
+
target_table : Optional[str], optional
|
|
3981
|
+
The name of the target table for which the marker has been created. This can be used to infer the name of the
|
|
3982
|
+
marker table if the defaults have been used.
|
|
3983
|
+
marker_table : Optional[str], optional
|
|
3984
|
+
The name of the marker table. Can be omitted if the default name has been used and `target_table` is specified.
|
|
3985
|
+
out_file : Optional[str], optional
|
|
3986
|
+
The name and path of the output CSV file to create. If omitted, the name will be `<marker table name>.csv` and the
|
|
3987
|
+
file will be placed in the current working directory. If specified, an absolute path must be used.
|
|
3988
|
+
|
|
3989
|
+
Raises
|
|
3990
|
+
------
|
|
3991
|
+
ValueError
|
|
3992
|
+
If neither `target_table` nor `marker_table` are given.
|
|
3993
|
+
|
|
3994
|
+
See Also
|
|
3995
|
+
--------
|
|
3996
|
+
import_marker_table
|
|
3997
|
+
remove_marked
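Examples
--------
Exporting the default marker table of a target table to *<marker table name>.csv* in the current working
directory; the table name is purely illustrative and `shifter` is assumed to be a `WorkloadShifter` instance:

>>> shifter.export_marker_table(target_table="title")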
|
|
3998
|
+
"""
|
|
3999
|
+
if target_table is None and marker_table is None:
|
|
4000
|
+
raise ValueError("Either marker table or target table are required!")
|
|
4001
|
+
marker_table = (
|
|
4002
|
+
f"{target_table}_delete_marker" if marker_table is None else marker_table
|
|
4003
|
+
)
|
|
4004
|
+
out_file = (
|
|
4005
|
+
pathlib.Path(f"{marker_table}.csv").absolute()
|
|
4006
|
+
if out_file is None
|
|
4007
|
+
else out_file
|
|
4008
|
+
)
|
|
4009
|
+
self.pg_instance.cursor().execute(
|
|
4010
|
+
f"COPY {marker_table} TO '{out_file}' DELIMITER ',' CSV HEADER;"
|
|
4011
|
+
)
|
|
4012
|
+
|
|
4013
|
+
def import_marker_table(
|
|
4014
|
+
self,
|
|
4015
|
+
*,
|
|
4016
|
+
target_table: Optional[str] = None,
|
|
4017
|
+
marker_table: Optional[str] = None,
|
|
4018
|
+
target_column: str = "id",
|
|
4019
|
+
marker_column: Optional[str] = None,
|
|
4020
|
+
target_column_type: Optional[str] = None,
|
|
4021
|
+
in_file: Optional[str] = None,
|
|
4022
|
+
) -> None:
|
|
4023
|
+
"""Loads the contents of a marker table from a CSV file from disk.
|
|
4024
|
+
|
|
4025
|
+
The table will be created if it does not exist already. If the marker table exists already, all current markings (but
|
|
4026
|
+
not the marked rows themselves) are removed. Afterwards, the new markings are imported.
|
|
4027
|
+
|
|
4028
|
+
Parameters
|
|
4029
|
+
----------
|
|
4030
|
+
target_table : Optional[str], optional
|
|
4031
|
+
The name of the target table for which the marker has been created. This can be used to infer the name of the
|
|
4032
|
+
marker table if the defaults have been used.
|
|
4033
|
+
marker_table : Optional[str], optional
|
|
4034
|
+
The name of the marker table. Can be omitted if the default name has been used and `target_table` is specified.
|
|
4035
|
+
target_column : str, optional
|
|
4036
|
+
The column that contains the values used to identify the rows to be deleted in the target table. Defaults to *id*.
|
|
4037
|
+
marker_column : Optional[str], optional
|
|
4038
|
+
The name of the column in the marker table that contains the target column values. Defaults to
|
|
4039
|
+
*<target table name>_<target column name>*.
|
|
4040
|
+
target_column_type : Optional[str], optional
|
|
4041
|
+
The datatype of the target column. If this parameter is not given, `target_table` has to be specified to infer the
|
|
4042
|
+
proper datatype from the schema metadata.
|
|
4043
|
+
in_file : Optional[str], optional
|
|
4044
|
+
The name and path of the CSV file to read. If omitted, the name will be `<marker table name>.csv` and the
|
|
4045
|
+
file will be loaded from the current working directory. If specified, an absolute path must be used.
|
|
4046
|
+
|
|
4047
|
+
Raises
|
|
4048
|
+
------
|
|
4049
|
+
ValueError
|
|
4050
|
+
If neither `target_table` nor `marker_table` are given.
|
|
4051
|
+
|
|
4052
|
+
See Also
|
|
4053
|
+
--------
|
|
4054
|
+
export_marker_table
|
|
4055
|
+
remove_marked
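Examples
--------
Re-importing previously exported markers, letting the marker table name, marker column name and datatype be
inferred from the target table; the table name is purely illustrative and `shifter` is assumed to be a
`WorkloadShifter` instance:

>>> shifter.import_marker_table(target_table="title")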
|
|
4056
|
+
"""
|
|
4057
|
+
if not target_table and not marker_table:
|
|
4058
|
+
raise ValueError("Either marker table or target table are required!")
|
|
4059
|
+
marker_table = (
|
|
4060
|
+
f"{target_table}_delete_marker" if marker_table is None else marker_table
|
|
4061
|
+
)
|
|
4062
|
+
marker_column = (
|
|
4063
|
+
f"{target_table}_{target_column}"
|
|
4064
|
+
if marker_column is None
|
|
4065
|
+
else marker_column
|
|
4066
|
+
)
|
|
4067
|
+
in_file = (
|
|
4068
|
+
pathlib.Path(f"{marker_table}.csv").absolute()
|
|
4069
|
+
if in_file is None
|
|
4070
|
+
else in_file
|
|
4071
|
+
)
|
|
4072
|
+
|
|
4073
|
+
if target_column_type is None:
|
|
4074
|
+
target_col_ref = ColumnReference(
|
|
4075
|
+
target_column, TableReference(target_table)
|
|
4076
|
+
)
|
|
4077
|
+
target_column_type = self.pg_instance.schema().datatype(target_col_ref)
|
|
4078
|
+
|
|
4079
|
+
marker_create_query = textwrap.dedent(f"""
|
|
4080
|
+
CREATE TABLE IF NOT EXISTS {marker_table} (
|
|
4081
|
+
marker_idx BIGSERIAL PRIMARY KEY,
|
|
4082
|
+
{marker_column} {target_column_type}
|
|
4083
|
+
);
|
|
4084
|
+
""")
|
|
4085
|
+
marker_import_query = textwrap.dedent(f"""
|
|
4086
|
+
COPY {marker_table}(marker_idx, {marker_column})
|
|
4087
|
+
FROM '{in_file}'
|
|
4088
|
+
DELIMITER ','
|
|
4089
|
+
CSV HEADER;
|
|
4090
|
+
""")
|
|
4091
|
+
with self.pg_instance.obtain_new_local_connection() as conn:
|
|
4092
|
+
cursor = conn.cursor()
|
|
4093
|
+
cursor.execute(marker_create_query)
|
|
4094
|
+
cursor.execute(f"DELETE FROM {marker_table}")
|
|
4095
|
+
cursor.execute(marker_import_query)
|
|
4096
|
+
|
|
4097
|
+
def remove_marked(
|
|
4098
|
+
self,
|
|
4099
|
+
target_table: str,
|
|
4100
|
+
*,
|
|
4101
|
+
target_column: str = "id",
|
|
4102
|
+
marker_table: Optional[str] = None,
|
|
4103
|
+
marker_column: Optional[str] = None,
|
|
4104
|
+
vacuum: bool = False,
|
|
4105
|
+
) -> None:
|
|
4106
|
+
"""Deletes rows according to their primary keys stored in a marker table.
|
|
4107
|
+
|
|
4108
|
+
Parameters
|
|
4109
|
+
----------
|
|
4110
|
+
target_table : str
|
|
4111
|
+
The table from which the rows should be removed.
|
|
4112
|
+
target_column : str, optional
|
|
4113
|
+
A column of the target table that is used to identify rows matching the marked rows to remove. Defaults to *id*.
|
|
4114
|
+
marker_table : Optional[str], optional
|
|
4115
|
+
A table containing marks of the rows to delete. Defaults to *<target table>_delete_marker*.
|
|
4116
|
+
marker_column : Optional[str], optional
|
|
4117
|
+
A column of the marker table that contains the target column values of the rows to remove. Defaults to
|
|
4118
|
+
*<target table>_<target column>*.
|
|
4119
|
+
vacuum : bool, optional
|
|
4120
|
+
Whether the database should be vacuumed after deletion. This optimizes the page layout by compacting the pages and
|
|
4121
|
+
forces a refresh of all statistics.
|
|
4122
|
+
|
|
4123
|
+
See Also
|
|
4124
|
+
--------
|
|
4125
|
+
generate_marker_table
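Examples
--------
Deleting all rows that were previously marked for a target table and reclaiming the space afterwards; the table
name is purely illustrative and `shifter` is assumed to be a `WorkloadShifter` instance:

>>> shifter.remove_marked("title", vacuum=True)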
|
|
4126
|
+
"""
|
|
4127
|
+
# TODO: align parameter types with TableReference and ColumnReference
|
|
4128
|
+
marker_table = (
|
|
4129
|
+
f"{target_table}_delete_marker" if marker_table is None else marker_table
|
|
4130
|
+
)
|
|
4131
|
+
marker_column = (
|
|
4132
|
+
f"{target_table}_{target_column}"
|
|
4133
|
+
if marker_column is None
|
|
4134
|
+
else marker_column
|
|
4135
|
+
)
|
|
4136
|
+
removal_query = textwrap.dedent(f"""
|
|
4137
|
+
DELETE FROM {target_table}
|
|
4138
|
+
WHERE EXISTS (SELECT 1 FROM {marker_table}
|
|
4139
|
+
WHERE {marker_table}.{marker_column} = {target_table}.{target_column})""")
|
|
4140
|
+
self._perform_removal(removal_query, vacuum)
|
|
4141
|
+
|
|
4142
|
+
def _perform_removal(self, removal_query: str, vacuum: bool) -> None:
|
|
4143
|
+
"""Executes a specific removal query and optionally cleans up the storage system.
|
|
4144
|
+
|
|
4145
|
+
Parameters
|
|
4146
|
+
----------
|
|
4147
|
+
removal_query : str
|
|
4148
|
+
The query that describes the desired delete operation.
|
|
4149
|
+
vacuum : bool
|
|
4150
|
+
Whether the database should be vacuumed after deletion. This optimizes the page layout by compacting the pages and
|
|
4151
|
+
forces a refresh of all statistics.
|
|
4152
|
+
"""
|
|
4153
|
+
with self.pg_instance.obtain_new_local_connection() as conn:
|
|
4154
|
+
cursor = conn.cursor()
|
|
4155
|
+
cursor.execute(removal_query)
|
|
4156
|
+
if vacuum:
|
|
4157
|
+
# We can't use the with-syntax here because VACUUM cannot be executed inside a transaction
|
|
4158
|
+
conn = self.pg_instance.obtain_new_local_connection()
|
|
4159
|
+
conn.autocommit = True
|
|
4160
|
+
cursor = conn.cursor()
|
|
4161
|
+
# We really need a full vacuum due to cascading deletes
|
|
4162
|
+
cursor.execute("VACUUM FULL ANALYZE;")
|
|
4163
|
+
cursor.close()
|
|
4164
|
+
conn.close()
|
|
4165
|
+
|
|
4166
|
+
def _determine_row_cnt(
|
|
4167
|
+
self, table: str, n_rows: Optional[int], row_pct: Optional[float]
|
|
4168
|
+
) -> int:
|
|
4169
|
+
"""Calculates the absolute number of rows to delete while also performing sanity checks.
|
|
4170
|
+
|
|
4171
|
+
Parameters
|
|
4172
|
+
----------
|
|
4173
|
+
table : str
|
|
4174
|
+
The table from which rows should be deleted. This is necessary to determine the current row count.
|
|
4175
|
+
n_rows : Optional[int]
|
|
4176
|
+
The absolute number of rows to delete.
|
|
4177
|
+
row_pct : Optional[float]
|
|
4178
|
+
The fraction in (0, 1) of rows to delete.
|
|
4179
|
+
|
|
4180
|
+
Returns
|
|
4181
|
+
-------
|
|
4182
|
+
int
|
|
4183
|
+
The absolute number of rows to delete. This is equal to `n_rows` if that parameter was given. Otherwise, the number is
|
|
4184
|
+
inferred from the `row_pct` and the current number of tuples in the table.
|
|
4185
|
+
|
|
4186
|
+
Raises
|
|
4187
|
+
------
|
|
4188
|
+
ValueError
|
|
4189
|
+
If both or neither of `n_rows` and `row_pct` are given, or if either parameter is outside of the allowed
|
|
4190
|
+
range.
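Examples
--------
An absolute row count is only validated and returned as-is, whereas a percentage is resolved against the current
row count of the table; the table name is purely illustrative and `shifter` is assumed to be a `WorkloadShifter`
instance:

>>> shifter._determine_row_cnt("title", 1000, None)
1000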
|
|
4191
|
+
"""
|
|
4192
|
+
if n_rows is None and row_pct is None:
|
|
4193
|
+
raise ValueError(
|
|
4194
|
+
"Either absolute number of rows or row percentage must be given"
|
|
4195
|
+
)
|
|
4196
|
+
if n_rows is not None and row_pct is not None:
|
|
4197
|
+
raise ValueError(
|
|
4198
|
+
"Cannot use both absolute number of rows and row percentage"
|
|
4199
|
+
)
|
|
4200
|
+
|
|
4201
|
+
if n_rows is not None and not n_rows > 0:
|
|
4202
|
+
raise ValueError("Not a valid row count: " + str(n_rows))
|
|
4203
|
+
elif n_rows is not None and n_rows > 0:
|
|
4204
|
+
return n_rows
|
|
4205
|
+
|
|
4206
|
+
if not 0.0 < row_pct < 1.0:
|
|
4207
|
+
raise ValueError("Not a valid row percentage: " + str(row_pct))
|
|
4208
|
+
|
|
4209
|
+
total_n_rows = self.pg_instance.statistics().total_rows(
|
|
4210
|
+
TableReference(table), cache_enabled=False, emulated=True
|
|
4211
|
+
)
|
|
4212
|
+
if total_n_rows is None:
|
|
4213
|
+
raise StateError(
|
|
4214
|
+
"Could not determine total number of rows for table " + table
|
|
4215
|
+
)
|
|
4216
|
+
return round(row_pct * total_n_rows)
|