PostBOUND 0.19.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- postbound/__init__.py +211 -0
- postbound/_base.py +6 -0
- postbound/_bench.py +1012 -0
- postbound/_core.py +1153 -0
- postbound/_hints.py +1373 -0
- postbound/_jointree.py +1079 -0
- postbound/_pipelines.py +1121 -0
- postbound/_qep.py +1986 -0
- postbound/_stages.py +876 -0
- postbound/_validation.py +734 -0
- postbound/db/__init__.py +72 -0
- postbound/db/_db.py +2348 -0
- postbound/db/_duckdb.py +785 -0
- postbound/db/mysql.py +1195 -0
- postbound/db/postgres.py +4216 -0
- postbound/experiments/__init__.py +12 -0
- postbound/experiments/analysis.py +674 -0
- postbound/experiments/benchmarking.py +54 -0
- postbound/experiments/ceb.py +877 -0
- postbound/experiments/interactive.py +105 -0
- postbound/experiments/querygen.py +334 -0
- postbound/experiments/workloads.py +980 -0
- postbound/optimizer/__init__.py +92 -0
- postbound/optimizer/__init__.pyi +73 -0
- postbound/optimizer/_cardinalities.py +369 -0
- postbound/optimizer/_joingraph.py +1150 -0
- postbound/optimizer/dynprog.py +1825 -0
- postbound/optimizer/enumeration.py +432 -0
- postbound/optimizer/native.py +539 -0
- postbound/optimizer/noopt.py +54 -0
- postbound/optimizer/presets.py +147 -0
- postbound/optimizer/randomized.py +650 -0
- postbound/optimizer/tonic.py +1479 -0
- postbound/optimizer/ues.py +1607 -0
- postbound/qal/__init__.py +343 -0
- postbound/qal/_qal.py +9678 -0
- postbound/qal/formatter.py +1089 -0
- postbound/qal/parser.py +2344 -0
- postbound/qal/relalg.py +4257 -0
- postbound/qal/transform.py +2184 -0
- postbound/shortcuts.py +70 -0
- postbound/util/__init__.py +46 -0
- postbound/util/_errors.py +33 -0
- postbound/util/collections.py +490 -0
- postbound/util/dataframe.py +71 -0
- postbound/util/dicts.py +330 -0
- postbound/util/jsonize.py +68 -0
- postbound/util/logging.py +106 -0
- postbound/util/misc.py +168 -0
- postbound/util/networkx.py +401 -0
- postbound/util/numbers.py +438 -0
- postbound/util/proc.py +107 -0
- postbound/util/stats.py +37 -0
- postbound/util/system.py +48 -0
- postbound/util/typing.py +35 -0
- postbound/vis/__init__.py +5 -0
- postbound/vis/fdl.py +69 -0
- postbound/vis/graphs.py +48 -0
- postbound/vis/optimizer.py +538 -0
- postbound/vis/plots.py +84 -0
- postbound/vis/tonic.py +70 -0
- postbound/vis/trees.py +105 -0
- postbound-0.19.0.dist-info/METADATA +355 -0
- postbound-0.19.0.dist-info/RECORD +67 -0
- postbound-0.19.0.dist-info/WHEEL +5 -0
- postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
- postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/db/mysql.py
ADDED
@@ -0,0 +1,1195 @@
"""Contains the MySQL implementation of the Database interface.

The current implementation has a number of limitations. Some are caused by fundamental restrictions of how MySQL
optimizes and executes queries, while others are caused by the sheer implementation effort that would have to be
invested to implement the corresponding feature in MySQL.

The most important restrictions are as follows:

No support for parsing EXPLAIN ANALYZE plans. Calling the corresponding MysqlOptimizer.analyze_plan method raises a
``NotImplementedError``. This is because MySQL currently (i.e. as of version 8.0) only provides EXPLAIN ANALYZE plans
in TREE output format, which is not exhaustively documented and appears fairly irregular. This makes parsing the
output fairly hard.

Restrictions of the query hint generation: query execution in MySQL differs fundamentally from the way queries are
executed in more traditional systems such as PostgreSQL or Oracle. MySQL makes heavy usage of clustered indexes,
meaning that all tuples in a table are automatically stored in a B-Tree according to the primary key index. As a
consequence, MySQL strongly favors the usage of (Index-) Nested Loop Joins during query execution and rarely resorts to
other operators. In fact, the only fundamentally different join operator available is the Hash Join. This operator is
only used if an equality join should be executed between columns that do not have an index available. Therefore, it is
not possible to disable Nested Loop Joins entirely, nor can the usage of Hash Joins be enforced. Instead, query hints
can only disable the usage of Hash Joins, or *recommend* their usage. But whether or not they are actually applied is
up to the MySQL query optimizer. A similar thing happens for the join order: although MySQL provides a number of hints
related to the join order optimization, these hints are not always enforced. More specifically, to the best of our
knowledge, it is not possible to enforce the branches in the join order and MySQL heavily favors left-deep query plans.
Therefore, the generation of join order hints only works for linear join orders for now.
"""
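For orientation, the hint-generation code further below (`_generate_join_order_hint`, `_generate_operator_hints` and `MysqlHintService.generate_hints`) assembles optimizer hint blocks of roughly the following shape for a linear join order over three aliased tables; the table identifiers and operator choices here are purely illustrative:

    /*+
     JOIN_ORDER(t1, t2, t3)
     NO_INDEX(t1)
     BNL(t1, t2)
    */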
from __future__ import annotations

import configparser
import dataclasses
import json
import math
import numbers
import os
import textwrap
import warnings
from collections.abc import Iterable, Sequence
from typing import Any, Optional

import mysql.connector

from .. import qal, util
from .._core import (
    Cardinality,
    JoinOperator,
    PhysicalOperator,
    ScanOperator,
    UnboundColumnError,
    VirtualTableError,
)
from .._hints import (
    HintType,
    PhysicalOperatorAssignment,
    PlanParameterization,
    operators_from_plan,
)
from .._jointree import JoinTree, jointree_from_plan, parameters_from_plan
from .._qep import QueryPlan
from ..qal import transform
from ..qal._qal import (
    CastExpression,
    ColumnReference,
    Explain,
    Hint,
    SqlExpression,
    SqlQuery,
    StaticValueExpression,
    TableReference,
)
from ..util import Version
from ._db import (
    Cursor,
    Database,
    DatabasePool,
    DatabaseSchema,
    DatabaseStatistics,
    HintService,
    OptimizerInterface,
    UnsupportedDatabaseFeatureError,
)


@dataclasses.dataclass(frozen=True)
class MysqlConnectionArguments:
    """Captures all relevant parameters that customize the way the connection to a MySQL instance is established.

    The only required parameters are the user that should connect to the database and the name of the database to
    connect to.
    See [1]_ for the different parameters' meaning.

    References
    ----------

    .. [1] https://dev.mysql.com/doc/connector-python/en/connector-python-connectargs.html
    """

    user: str
    database: str
    password: str = ""
    host: str = "127.0.0.1"
    port: int = 3306
    use_unicode: bool = True
    charset: str = "utf8mb4"
    autocommit: bool = True
    sql_mode: str = "ANSI"

    def parameters(self) -> dict[str, str | int | bool]:
        """Provides all arguments in one neat ``dict``.

        Returns
        -------
        dict[str, str | int | bool]
            A mapping from parameter name to parameter value.
        """
        return dataclasses.asdict(self)


class MysqlInterface(Database):
    """MySQL-specific implementation of the general `Database` interface."""

    def __init__(
        self,
        connection_args: MysqlConnectionArguments,
        system_name: str = "MySQL",
        *,
        cache_enabled: bool = True,
    ) -> None:
        """Generates a new database interface and establishes a connection to the specified database server.

        Parameters
        ----------
        connection_args : MysqlConnectionArguments
            Configuration and required information to establish a connection to some MySQL instance.
        system_name : str, optional
            The name of the current database. Typically, this can be used to query the `DatabasePool` for this very
            instance. Defaults to ``"MySQL"``.
        cache_enabled : bool, optional
            Whether or not caching of complicated database queries should be enabled by default. Defaults to ``True``.
        """
        self.connection_args = connection_args
        self._cnx = mysql.connector.connect(**connection_args.parameters())
        self._cur = self._cnx.cursor(buffered=True)

        self._db_schema = MysqlSchemaInterface(self)
        self._db_stats = MysqlStatisticsInterface(self)
        super().__init__(system_name, cache_enabled=cache_enabled)

    def schema(self) -> MysqlSchemaInterface:
        return self._db_schema

    def statistics(self) -> MysqlStatisticsInterface:
        return self._db_stats

    def hinting(self) -> HintService:
        return MysqlHintService(self)

    def execute_query(
        self,
        query: SqlQuery | str,
        *,
        cache_enabled: Optional[bool] = None,
        raw: bool = False,
    ) -> Any:
        cache_enabled = cache_enabled or (cache_enabled is None and self._cache_enabled)
        query = self._prepare_query_execution(query)

        if cache_enabled and query in self._query_cache:
            query_result = self._query_cache[query]
        else:
            self._cur.execute(query)
            query_result = self._cur.fetchall()
            if cache_enabled:
                self._inflate_query_cache()
                self._query_cache[query] = query_result

        if raw:
            return query_result

        # simplify the query result as much as possible: [(42, 24)] becomes (42, 24) and [(1,), (2,)] becomes [1, 2]
        # [(42, 24), (4.2, 2.4)] is left as-is
        if not query_result:
            return []
        result_structure = query_result[0]  # what do the result tuples look like?
        if len(result_structure) == 1:  # do we have just one column?
            query_result = [
                row[0] for row in query_result
            ]  # if it is just one column, unwrap it
        return (
            query_result if len(query_result) > 1 else query_result[0]
        )  # if it is just one row, unwrap it

    def optimizer(self) -> OptimizerInterface:
        return MysqlOptimizer(self)

    def database_name(self) -> str:
        self._cur.execute("SELECT DATABASE();")
        db_name = self._cur.fetchone()[0]
        return db_name

    def database_system_version(self) -> Version:
        self._cur.execute("SELECT VERSION();")
        version = self._cur.fetchone()[0]
        return Version(version)

    def server_mode(self) -> str:
        """Provides the current settings in the ``sql_mode`` MySQL variable.

        Returns
        -------
        str
            The ``sql_mode`` value, exactly as it is returned by the server. Typically, this is a list of
            comma-separated features.
        """
        self._cur.execute("SELECT @@session.sql_mode")
        return self._cur.fetchone()[0]

    def describe(self) -> dict:
        base_info = {
            "system_name": self.database_system_name(),
            "system_version": self.database_system_version(),
            "database": self.database_name(),
            "statistics_settings": {
                "emulated": self._db_stats.emulated,
                "cache_enabled": self._db_stats.cache_enabled,
            },
        }
        self._cur.execute("SHOW VARIABLES")
        system_config = self._cur.fetchall()
        base_info["system_settings"] = dict(system_config)
        return base_info

    def reset_connection(self) -> None:
        self._cur.close()
        self._cnx.cmd_reset_connection()
        self._cur = self._cnx.cursor()

    def cursor(self) -> Cursor:
        return self._cur

    def close(self) -> None:
        self._cur.close()
        self._cnx.close()

    def _prepare_query_execution(
        self, query: SqlQuery | str, *, drop_explain: bool = False
    ) -> str:
        """Provides the query in a unified format, taking care of preparatory statements as necessary.

        `drop_explain` can be used to remove any EXPLAIN clauses from the query. Note that all actions that require
        the "semantics" of the query to be known (e.g. EXPLAIN modifications or query hints) are therefore only
        executed for instances of the qal queries.
        """
        if not isinstance(query, SqlQuery):
            return query

        if drop_explain:
            query = transform.drop_clause(query, Explain)
        if query.hints and query.hints.preparatory_statements:
            self._cur.execute(query.hints.preparatory_statements)
            query = transform.drop_hints(query, preparatory_statements_only=True)
        return str(query)

    def _obtain_query_plan(self, query: str) -> dict:
        if not query.startswith("EXPLAIN FORMAT = JSON"):
            query = "EXPLAIN FORMAT = JSON " + query
        self._cur.execute(query)
        result = self._cur.fetchone()[0]
        return json.loads(result)


class MysqlSchemaInterface(DatabaseSchema):
    def __init__(self, mysql_db: MysqlInterface):
        super().__init__(mysql_db)

    def lookup_column(
        self,
        column: ColumnReference | str,
        candidate_tables: list[TableReference],
        *,
        expect_match: bool = False,
    ) -> Optional[TableReference]:
        column = column.name if isinstance(column, ColumnReference) else column

        for table in candidate_tables:
            table_columns = self._fetch_columns(table)
            if column in table_columns:
                return table

        if not expect_match:
            return None
        candidate_tables = [tab.full_name for tab in candidate_tables]
        raise ValueError(
            f"Column {column} not found in candidate tables {candidate_tables}"
        )

    def is_primary_key(self, column: ColumnReference) -> bool:
        if not column.table:
            raise UnboundColumnError(column)
        if column.table.virtual:
            raise VirtualTableError(column.table)
        index_map = self._fetch_indexes(column.table)
        return index_map.get(column.name, False)

    def has_secondary_index(self, column: ColumnReference) -> bool:
        if not column.table:
            raise UnboundColumnError(column)
        if column.table.virtual:
            raise VirtualTableError(column.table)
        index_map = self._fetch_indexes(column.table)

        # The index map contains an entry for each attribute that actually has an index. The value is True, if the
        # attribute (which is known to be indexed), is even the Primary Key
        # Our method should return False in two cases: 1) the attribute is not indexed at all; and 2) the attribute
        # actually is the Primary key. Therefore, by assuming it is the PK in case of absence, we get the correct
        # value.
        return not index_map.get(column.name, True)

    def indexes_on(self, column: ColumnReference) -> set[str]:
        if not column.table:
            raise UnboundColumnError(column)
        if column.table.virtual:
            raise VirtualTableError(column.table)
        query_template = (
            "SELECT index_name FROM information_schema.statistics "
            "WHERE table_name = %s AND column_name = %s"
        )
        self._db.cursor().execute(query_template, (column.table.full_name, column.name))
        result_set = self._db.cursor().fetchall()
        return {index[0] for index in result_set}

    def foreign_keys_on(self, column: ColumnReference) -> set[ColumnReference]:
        if not column.table:
            raise UnboundColumnError(column)
        if column.table.virtual:
            raise VirtualTableError(column.table)
        query_template = (
            "SELECT referenced_table_name, referenced_column_name "
            "FROM information_schema.key_column_usage "
            "WHERE table_name = %s AND column_name = %s AND referenced_column_name IS NOT NULL"
        )
        self._db.cursor().execute(query_template, (column.table.full_name, column.name))
        result_set = self._db.cursor().fetchall()
        return {
            ColumnReference(table=TableReference(name=table), name=col)
            for table, col in result_set
        }

    def datatype(self, column: ColumnReference) -> str:
        if not column.table:
            raise UnboundColumnError(column)
        if column.table.virtual:
            raise VirtualTableError(column.table)
        query_template = (
            "SELECT column_type FROM information_schema.columns "
            "WHERE table_name = %s AND column_name = %s"
        )
        self._db.cursor().execute(query_template, (column.table.full_name, column.name))
        result_set = self._db.cursor().fetchone()
        return str(result_set[0])

    def is_nullable(self, column) -> bool:
        if not column.table:
            raise UnboundColumnError(column)
        if column.table.virtual:
            raise VirtualTableError(column.table)
        query_template = (
            "SELECT is_nullable FROM information_schema.columns "
            "WHERE table_name = %s AND column_name = %s"
        )
        self._db.cursor().execute(query_template, (column.table.full_name, column.name))
        result_set = self._db.cursor().fetchone()
        return result_set[0] == "YES"

    def _fetch_columns(self, table: TableReference) -> list[str]:
        query_template = (
            "SELECT column_name FROM information_schema.columns WHERE table_name = %s"
        )
        self._db.cursor().execute(query_template, (table.full_name,))
        result_set = self._db.cursor().fetchall()
        return [col[0] for col in result_set]

    def _fetch_indexes(self, table: TableReference) -> dict[str, bool]:
        index_query = textwrap.dedent("""
            SELECT column_name, column_key = 'PRI'
            FROM information_schema.columns
            WHERE table_name = %s AND column_key <> ''
            """)
        self._db.cursor().execute(index_query, table.full_name)
        result_set = self._db.cursor().fetchall()
        index_map = dict(result_set)
        return index_map


class MysqlStatisticsInterface(DatabaseStatistics):
    def __init__(self, mysql_db: MysqlInterface):
        super().__init__(mysql_db)

    def _retrieve_total_rows_from_stats(self, table: TableReference) -> Optional[int]:
        count_query = (
            "SELECT table_rows FROM information_schema.tables WHERE table_name = %s"
        )
        self._db.cursor().execute(count_query, table.full_name)
        count = self._db.cursor().fetchone()[0]
        return count

    def _retrieve_distinct_values_from_stats(
        self, column: ColumnReference
    ) -> Optional[int]:
        stats_query = (
            "SELECT cardinality FROM information_schema.statistics "
            "WHERE table_name = %s AND column_name = %s"
        )
        self._db.cursor().execute(stats_query, (column.table.full_name, column.name))
        distinct_vals: Optional[int] = self._db.cursor().fetchone()
        if distinct_vals is None and not self.enable_emulation_fallback:
            return distinct_vals
        elif distinct_vals is None:
            return self._calculate_distinct_values(column, cache_enabled=True)
        else:
            return distinct_vals

    def _retrieve_min_max_values_from_stats(
        self, column: ColumnReference
    ) -> Optional[tuple[Any, Any]]:
        if not self.enable_emulation_fallback:
            raise UnsupportedDatabaseFeatureError(self._db, "min/max value statistics")
        return self._calculate_min_max_values(column, cache_enabled=True)

    def _retrieve_most_common_values_from_stats(
        self, column: ColumnReference, k: int
    ) -> Sequence[tuple[Any, int]]:
        if not self.enable_emulation_fallback:
            raise UnsupportedDatabaseFeatureError(
                self._db, "most common values statistics"
            )
        return self._calculate_most_common_values(column, k=k, cache_enabled=True)


MysqlJoinHints = {JoinOperator.HashJoin, JoinOperator.NestedLoopJoin}
MysqlScanHints = {ScanOperator.IndexScan, ScanOperator.SequentialScan}
MysqlPlanHints = {HintType.LinearJoinOrder, HintType.Operator}


class _MysqlExplainClause(Explain):
    def __init__(self, original_clause: Explain):
        super().__init__(original_clause.analyze, original_clause.target_format)

    def __str__(self) -> str:
        explain_body = ""
        if self.analyze:
            explain_body += " ANALYZE"
        if self.target_format:
            explain_body += f" FORMAT={self.target_format}"
        return "EXPLAIN" + explain_body


class _MysqlStaticValueExpression(StaticValueExpression):
    def __init__(self, original_expression: StaticValueExpression) -> None:
        super().__init__(original_expression.value)

    def __str__(self) -> str:
        return (
            f"{self.value}"
            if isinstance(self.value, numbers.Number)
            else f'"{self.value}"'
        )


class _MysqlCastExpression(CastExpression):
    def __init__(self, original_expression: CastExpression) -> None:
        super().__init__(
            original_expression.casted_expression, original_expression.target_type
        )

    def __str__(self) -> str:
        return f"CAST({self.casted_expression} AS {self.target_type})"


def _replace_static_vals(e: SqlExpression) -> SqlExpression:
    return _MysqlStaticValueExpression(e) if isinstance(e, StaticValueExpression) else e


def _replace_casts(e: SqlExpression) -> SqlExpression:
    return _MysqlCastExpression(e) if isinstance(e, CastExpression) else e


def _generate_join_order_hint(join_order: Optional[JoinTree]) -> str:
    if not join_order:
        return ""

    join_order_text = ", ".join(table.identifier() for table in join_order.itertables())
    return f" JOIN_ORDER({join_order_text})"


MysqlOptimizerHints = {
    JoinOperator.NestedLoopJoin: "NO_BNL",
    JoinOperator.HashJoin: "BNL",
    ScanOperator.SequentialScan: "NO_INDEX",
    ScanOperator.IndexScan: "INDEX",
    ScanOperator.IndexOnlyScan: "INDEX",
    ScanOperator.BitmapScan: "INDEX_MERGE",
}
"""See https://dev.mysql.com/doc/refman/8.0/en/optimizer-hints.html"""


def _generate_operator_hints(
    physical_operators: Optional[PhysicalOperatorAssignment],
) -> str:
    if not physical_operators:
        return ""
    hints = []

    for table, scan_assignment in physical_operators.scan_operators.items():
        table_key = table.identifier()
        operator = MysqlOptimizerHints[scan_assignment.operator]
        hints.append(f" {operator}({table_key})")

    for join, join_assignment in physical_operators.join_operators.items():
        join_key = ", ".join(tab.identifier() for tab in join)
        operator = MysqlOptimizerHints[join_assignment.operator]
        hints.append(f" {operator}({join_key})")

    if physical_operators.intermediate_operators:
        warnings.warn("Cannot generate intermediate operator hints for MySQL.")

    return "\n".join(hints)


MysqlSwitchableOptimizations = {JoinOperator.HashJoin: "block_nested_loop"}
"""See https://dev.mysql.com/doc/refman/8.0/en/switchable-optimizations.html"""


def _escape_setting(setting) -> str:
    """Transforms the setting variable into a string that can be used in an SQL query."""
    if isinstance(setting, float) or isinstance(setting, int):
        return str(setting)
    elif isinstance(setting, bool):
        return "TRUE" if setting else "FALSE"
    return f"'{setting}'"


def _generate_prep_statements(
    physical_operators: Optional[PhysicalOperatorAssignment],
    plan_parameters: Optional[PlanParameterization],
) -> str:
    statements = []
    if physical_operators:
        switchable_optimizations = []
        for operator, enabled in physical_operators.global_settings.items():
            value = "on" if enabled else "off"
            switchable_optimizations.append(
                f"{MysqlSwitchableOptimizations[operator]}={value}"
            )
        if switchable_optimizations:
            optimizer_switch = ",".join(switchable_optimizations)
            statements.append(f"SET @@optimizer_switch='{optimizer_switch}';")

    if plan_parameters:
        for setting, value in plan_parameters.system_settings.items():
            statements.append(f"SET {setting}={_escape_setting(value)};")

    return "\n".join(statements) if statements else ""
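For illustration, the preparatory statements assembled here take roughly the following form when hash joins are switched off globally and one (hypothetical) system setting is supplied via `PlanParameterization.system_settings`:

    SET @@optimizer_switch='block_nested_loop=off';
    SET join_buffer_size=262144;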


class MysqlHintService(HintService):
    def __init__(self, mysql_instance: MysqlInterface) -> None:
        super().__init__()
        self._mysql_instance = mysql_instance

    def generate_hints(
        self,
        query: SqlQuery,
        plan: Optional[QueryPlan] = None,
        *,
        join_order: Optional[JoinTree] = None,
        physical_operators: Optional[PhysicalOperatorAssignment] = None,
        plan_parameters: Optional[PlanParameterization] = None,
    ) -> SqlQuery:
        if join_order and not join_order.is_linear():
            raise UnsupportedDatabaseFeatureError(
                self._mysql_instance,
                "Can only enforce join order for linear join trees for now",
            )

        if plan is not None:
            join_order = jointree_from_plan(plan)
            physical_operators = operators_from_plan(plan)
            plan_parameters = parameters_from_plan(plan)

        join_order_hint = _generate_join_order_hint(join_order)
        operator_hint = _generate_operator_hints(physical_operators)
        prep_statements = _generate_prep_statements(physical_operators, plan_parameters)

        if not join_order_hint and not operator_hint:
            return query

        final_hint_block = (
            "/*+\n"
            + "\n".join(hint for hint in (join_order_hint, operator_hint) if hint)
            + "\n*/"
        )
        hint_clause = Hint(prep_statements, final_hint_block)
        return transform.add_clause(query, hint_clause)

    def format_query(self, query: SqlQuery) -> str:
        updated_query = query

        if updated_query.is_explain():
            transform.replace_clause(query, _MysqlExplainClause(query.explain))

        if "ANSI_QUOTES" not in self._mysql_instance.server_mode():
            updated_query = transform.replace_expressions(
                updated_query, _replace_static_vals
            )
        updated_query = transform.replace_expressions(updated_query, _replace_casts)

        return qal.format_quick(updated_query, inline_hint_block=True)

    def supports_hint(self, hint: PhysicalOperator | HintType) -> bool:
        return hint in MysqlJoinHints | MysqlScanHints | MysqlPlanHints


class MysqlOptimizer(OptimizerInterface):
    def __init__(self, mysql_instance: MysqlInterface) -> None:
        self._mysql_instance = mysql_instance

    def query_plan(self, query: SqlQuery | str) -> QueryPlan:
        if isinstance(query, SqlQuery):
            prepared_query = self._mysql_instance._prepare_query_execution(
                query, drop_explain=True
            )
            query_for_plan = query
        else:
            prepared_query = query
            query_for_plan = None
        raw_query_plan = self._mysql_instance._obtain_query_plan(prepared_query)
        query_plan = parse_mysql_explain_plan(query_for_plan, raw_query_plan)
        return query_plan.as_qep()

    def analyze_plan(self, query: SqlQuery) -> QueryPlan:
        raise NotImplementedError("MySQL interface does not support ANALYZE plans yet")

    def cardinality_estimate(self, query: SqlQuery | str) -> Cardinality:
        return self.query_plan(query).estimated_cardinality

    def cost_estimate(self, query: SqlQuery | str) -> float:
        return self.query_plan(query).cost


def _parse_mysql_connection(config_file: str) -> MysqlConnectionArguments:
    config = configparser.ConfigParser()
    config.read(config_file)
    if "MYSQL" not in config:
        raise ValueError("Malformed MySQL config file: no [MYSQL] section found.")
    mysql_config = config["MYSQL"]

    if "User" not in mysql_config or "Database" not in mysql_config:
        raise ValueError(
            "Malformed MySQL config file: "
            "'User' and 'Database' keys are required in the [MYSQL] section."
        )
    user = mysql_config["User"]
    database = mysql_config["Database"]

    optional_settings = {}
    for key in [
        "Password",
        "Host",
        "Port",
        "UseUnicode",
        "Charset",
        "AutoCommit",
        "SqlMode",
    ]:
        if key not in mysql_config:
            continue
        optional_settings[util.camel_case2snake_case(key)] = mysql_config[key]
    return MysqlConnectionArguments(user, database, **optional_settings)


def connect(
    *,
    name: str = "mysql",
    connection_args: Optional[MysqlConnectionArguments] = None,
    config_file: str = ".mysql_connection.config",
    cache_enabled: Optional[bool] = None,
    private: bool = False,
) -> MysqlInterface:
    db_pool = DatabasePool.get_instance()
    if config_file and not connection_args:
        if not os.path.exists(config_file):
            raise ValueError(
                "Config file was given, but does not exist: " + config_file
            )
        connection_args = _parse_mysql_connection(config_file)
    elif not connection_args:
        raise ValueError(
            "Connect string or config file are required to connect to MySQL"
        )

    mysql_db = MysqlInterface(
        connection_args, system_name=name, cache_enabled=cache_enabled
    )
    if not private:
        db_pool.register_database(name, mysql_db)
    return mysql_db
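A minimal usage sketch, assuming a config file named `.mysql_connection.config` in the working directory with contents along these lines (the key names are the ones `_parse_mysql_connection` understands; the credentials and database name are placeholders):

    [MYSQL]
    User = postbound
    Database = imdb
    Password = secret
    Host = 127.0.0.1
    Port = 3306

With such a file in place, `connect()` parses it into `MysqlConnectionArguments`, opens the connection, and registers the instance in the `DatabasePool`:

    from postbound.db import mysql

    mysql_db = mysql.connect(config_file=".mysql_connection.config")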


# The next several functions are concerned with MySQL EXPLAIN query plans. Although in theory MySQL offers some great
# tools to inspect query plans produced by the optimizer (having 3 different output formats: tabular, human-readable
# plan trees and JSON data), these output formats differ in the information they provide. Only the JSON format provides
# all the details that we are interested in (and makes them harder to access than when using the tree output, for
# example).
# Sadly, the JSON output is not available when using EXPLAIN ANALYZE to match the optimizer's expectation
# with the reality encountered upon query execution. Since parsing the EXPLAIN trees is quite difficult, we restrict
# ourselves to plain EXPLAIN plans for now and maybe integrate EXPLAIN ANALYZE plans in the future along with a
# dedicated parser for its structure.
# What makes the situation with the JSON-formatted EXPLAIN plans pretty bad is the fact that the structure of the
# provided JSON document is barely documented and seems inconsistent at best (see
# https://mariadb.com/kb/en/explain-format-json/ for example). Therefore, our JSON-based parser strongly follows a
# similar implementation, namely the "visual explain" feature of the MySQL Workbench. They also need to traverse the
# JSON-based EXPLAIN plans, but this time to generate a graphical representation of the information. Still, the
# traversal and attribute access logic can be re-used to a great extent. It is even implemented in Python! Nevertheless,
# the code there is often barely documented so a lot of guesswork is still left for us to do. See
# https://github.com/mysql/mysql-workbench/blob/8.0/plugins/wb.query.analysis/explain_renderer.py for the Workbench
# implementation that our code is based on. The best explanation of how the different attributes in the JSON document
# should be interpreted is contained in the MySQL worklog entry to implement parts of the JSON output:
# https://dev.mysql.com/worklog/task/?id=6510
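To make the parsers below easier to follow, here is a heavily trimmed sketch of the JSON document that ``EXPLAIN FORMAT = JSON`` returns for a two-table join. Only keys that the parsing code actually reads are shown, and all values are invented for the example:

    {
      "query_block": {
        "cost_info": {"query_cost": "12.34"},
        "nested_loop": [
          {"table": {"table_name": "t1", "access_type": "ALL",
                     "rows_examined_per_scan": 1000, "filtered": "10.00",
                     "rows_produced_per_join": 100,
                     "cost_info": {"read_cost": "3.0", "eval_cost": "1.0", "prefix_cost": "4.0"}}},
          {"table": {"table_name": "t2", "access_type": "ALL",
                     "using_join_buffer": "hash join",
                     "rows_examined_per_scan": 500, "filtered": "100.00",
                     "rows_produced_per_join": 100,
                     "cost_info": {"read_cost": "8.0", "eval_cost": "0.3", "prefix_cost": "12.34"}}}
        ]
      }
    }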


def _lookup_table(
    alias: str, candidate_tables: Iterable[TableReference]
) -> TableReference:
    """Searches for a specific table in a list of candidate tables.

    If no candidate table has the given `alias`, the full names are used instead. If still no table matches, a
    `KeyError` is raised.

    This function is necessary, because MySQL does not contain the complete table names in the output. If that were
    the case, we could construct our `TableReference` objects directly based on this information. Instead, MySQL
    provides the "identifier" of the tables, i.e. the alias if the table was aliased or the full name otherwise. In
    order to build the correct `TableReference` objects that also line up with the tables contained in the `SqlQuery`
    object for the same query, we need to take this detour and lookup the correct tables.

    Parameters
    ----------
    alias : str
        The table alias to search for. This does not have to be an alias, but could be a full table name just as well.
    candidate_tables : Iterable[TableReference]
        The tables that could potentially have the given alias. `_lookup_table` assumes that at least one of the
        candidates matches.

    Returns
    -------
    TableReference
        The table with the given alias or full name.
    """
    table_map = {tab.full_name: tab for tab in candidate_tables}

    # alias takes precedence over full_name in case of conflicts
    table_map |= {tab.alias: tab for tab in candidate_tables}
    return table_map[alias]


_MysqlExplainNodeTypes = {
    "nested_loop",
    "table",
    "optimized_away_subqueries",
    "grouping_operation",
    "ordering_operation",
    "duplicate_removal",
    "union_result",
    "buffer_result",
    "select_list_subqueries",
}
"""The different nodes that can occur in the MySQL EXPLAIN output which correspond to actual operators.

Derived from ExplainContext.handle_query_block in mysql_renderer.py
"""

_MysqlMetadataNodes = {
    "cost_info",
    "rows_examined_per_scan",
    "rows_produced_per_join",
    "filtered",
}
"""The metadata contained in the MySQL EXPLAIN output that we are interested in.

For some reason, the MySQL authors decided that it was a good idea to merge this information with the normal operator
nodes and not denote the operator tree in any special way.
"""

_Cost, _IdxLookup, _IdxMerge, _TabScan = (
    "Const",
    "Index Lookup",
    "Index Merge",
    "Table Scan",
)

_MysqlJoinSourceTypes = {
    "system": _Cost,
    "const": _Cost,
    "eq_ref": _IdxLookup,
    "ref": _IdxLookup,
    "fulltext": _IdxLookup,
    "ref_or_null": _IdxLookup,
    "index_merge": _IdxMerge,
    "unique_subquery": _IdxLookup,
    "index_subquery": _IdxLookup,
    "range": _IdxLookup,
    "index": _IdxLookup,
    "ALL": _TabScan,
}
"""The different ways (Nested Loop) joins can be executed with a single input table.

See https://dev.mysql.com/doc/refman/8.0/en/explain-output.html#explain-join-types for details
"""


_MysqlJoinTypes = {
    "Block Nested Loop": "Block Nested Loop",
    "Batched Key Access": "Batched Key Access",
    "Batched Key Access (unique)": "Batched Key Access",
    "hash join": "Hash Join",  # the lower-case is intentional and not a bug..
}
"""The different join algorithms supported by MySQL.

See https://dev.mysql.com/doc/refman/8.0/en/explain-output.html#explain-extra-information for the listing.
"""


def _parse_cost_info(explain_data: dict) -> tuple[float, float]:
    """Extracts the relevant cost information from a MySQL EXPLAIN node.

    Parameters
    ----------
    explain_data : dict
        The current EXPLAIN node. Nodes without cost information are handled gracefully.

    Returns
    -------
    tuple[float, float]
        A tuple of ``(scan cost, join cost)``. Remember that MySQL merges join nodes and scan nodes in the JSON-based
        EXPLAIN output. If the node does not contain any cost information, a ``NaN`` tuple will be returned instead.
    """
    if "cost_info" not in explain_data:
        return math.nan, math.nan
    cost_info: dict = explain_data["cost_info"]

    read_cost = cost_info.get("read_cost", "")
    read_cost = float(read_cost) if read_cost else 0

    eval_cost = cost_info.get("eval_cost", "")
    eval_cost = float(eval_cost) if eval_cost else 0

    scan_cost = read_cost + eval_cost
    scan_cost = scan_cost if scan_cost else math.nan

    join_cost = cost_info.get("prefix_cost", "")
    join_cost = float(join_cost) if join_cost else math.nan
    return scan_cost, join_cost


def _parse_cardinality_info(explain_data: dict) -> tuple[float, float]:
    """Extracts the relevant cardinality information from a MySQL EXPLAIN node.

    Parameters
    ----------
    explain_data : dict
        The current EXPLAIN node. Nodes without cardinality information are handled gracefully.

    Returns
    -------
    tuple[float, float]
        A tuple of ``(scan cardinality, join cardinality)``. Remember that MySQL merges join nodes and scan nodes in
        the JSON-based EXPLAIN output. The scan cardinality accounts for all filter predicates. If no scan or join
        cardinality can be determined, a ``NaN`` is used instead.
    """
    table_cardinality = explain_data.get("rows_examined_per_scan", "")
    table_cardinality = float(table_cardinality) if table_cardinality else math.nan

    filtered = explain_data.get("filtered")
    filtered = float(filtered) if filtered else math.nan
    selectivity = filtered / 100
    scan_cardinality = selectivity * table_cardinality

    join_cardinality = explain_data.get("rows_produced_per_join", "")
    join_cardinality = float(join_cardinality) if join_cardinality else math.nan
    return scan_cardinality, join_cardinality


def _determine_join_type(explain_data: dict) -> str:
    if "using_join_buffer" not in explain_data:
        return "Nested Loop"
    return _MysqlJoinTypes[explain_data["using_join_buffer"]]


def _parse_mysql_join_node(
    query: Optional[SqlQuery], node_name: str, explain_data: list
) -> Optional[MysqlExplainNode]:
    first_table, *remaining_tables = explain_data
    first_node = _parse_next_mysql_explain_node(query, first_table)
    current_node = first_node
    for next_table in remaining_tables:
        next_node = _parse_next_mysql_explain_node(query, next_table)
        current_node.next_node = next_node
        current_node = next_node
    return first_node


def _parse_mysql_table_node(
    query: Optional[SqlQuery], node_name: str, explain_data: dict
) -> Optional[MysqlExplainNode]:
    scanned_table = (
        _lookup_table(explain_data["table_name"], query.tables())
        if query is not None
        else None
    )
    scan_type = _MysqlJoinSourceTypes[
        explain_data["access_type"]
    ]  # tables are mostly scanned as part of a join
    join_type = _determine_join_type(explain_data)
    scan_cost, join_cost = _parse_cost_info(explain_data)
    scan_card, join_card = _parse_cardinality_info(explain_data)

    subquery = (
        _parse_next_mysql_explain_node(
            query, explain_data["materialized_from_subquery"]
        )
        if "materialized_from_subquery" in explain_data
        else None
    )
    table_node = MysqlExplainNode(
        scan_type,
        join_type,
        table=scanned_table,
        scan_cost=scan_cost,
        join_cost=join_cost,
        scan_cardinality_estimate=scan_card,
        join_cardinality_estimate=join_card,
        subquery_node=subquery,
    )
    return table_node


def _parse_mysql_wrapper_node(
    query: Optional[SqlQuery], node_name: str, explain_data: dict
) -> Optional[MysqlExplainNode]:
    scan_cost, join_cost = _parse_cost_info(explain_data)
    scan_card, join_card = _parse_cardinality_info(explain_data)
    source_node = _parse_next_mysql_explain_node(query, explain_data)
    pretty_node_name = node_name.replace(
        "_", " "
    ).title()  # "grouping_operation" -> "Grouping Operation"
    return MysqlExplainNode(
        subquery_node=source_node,
        node_type=pretty_node_name,
        scan_cost=scan_cost,
        join_cost=join_cost,
        scan_cardinality_estimate=scan_card,
        join_cardinality_estimate=join_card,
    )


def _parse_mysql_explain_node(
    query: Optional[SqlQuery], node_name: str, explain_data: dict | list
) -> Optional[MysqlExplainNode]:
    if not explain_data:
        return None

    if node_name == "nested_loop":
        assert isinstance(explain_data, list)
        return _parse_mysql_join_node(query, node_name, explain_data)
    elif node_name == "table":
        assert isinstance(explain_data, dict)
        return _parse_mysql_table_node(query, node_name, explain_data)
    else:
        explain_data = (
            explain_data["query_block"]
            if "query_block" in explain_data
            else explain_data
        )
        return _parse_mysql_wrapper_node(query, node_name, explain_data)


def _parse_next_mysql_explain_node(
    query: Optional[SqlQuery], explain_data: dict
) -> Optional[MysqlExplainNode]:
    for info_key, node_data in explain_data.items():
        if info_key in _MysqlExplainNodeTypes:
            return _parse_mysql_explain_node(query, info_key, node_data)
    raise ValueError("No known node found: " + str(explain_data))


def parse_mysql_explain_plan(
    query: Optional[SqlQuery], explain_data: dict
) -> MysqlExplainPlan:
    explain_data = explain_data["query_block"]
    query_cost = explain_data.get("cost_info", {}).get("query_cost", math.nan)

    # the EXPLAIN plan should only have a single root node, but we do not know which operator it is (the JSON document
    # contains the nodes directly as keys, not under a normalized name, remember?). Therefore, we simply iterate over
    # all entries in the JSON document and check if the current key is a valid operator name. This is exactly what
    # _parse_next_mysql_explain_node does.
    plan_root = _parse_next_mysql_explain_node(query, explain_data)
    assert plan_root is not None
    return MysqlExplainPlan(plan_root, query_cost)


_MysqlExplainScanNodes = {
    _IdxLookup: ScanOperator.IndexScan,
    _IdxMerge: ScanOperator.BitmapScan,
    _TabScan: ScanOperator.SequentialScan,
}


_MysqlExplainJoinNodes = {
    "Block Nested Loop": JoinOperator.NestedLoopJoin,
    "Batched Key Access": JoinOperator.NestedLoopJoin,
    "Hash Join": JoinOperator.HashJoin,
}


def _node_sequence_to_qep(nodes: Sequence[MysqlExplainNode]) -> QueryPlan:
    assert nodes
    if len(nodes) == 1:
        return nodes[0]._make_qep_node_for_scan()

    if len(nodes) == 2:
        final_table, first_table = nodes
        final_qep = final_table._make_qep_node_for_scan()
        first_qep = first_table._make_qep_node_for_scan()
        join_operator = _MysqlExplainJoinNodes.get(
            final_table.join_type, JoinOperator.NestedLoopJoin
        )
        join_node = QueryPlan(
            final_table.join_type,
            operator=join_operator,
            children=[first_qep, final_qep],
            estimated_cost=final_table.join_cost,
            estimated_cardinality=Cardinality(final_table.join_cardinality_estimate),
        )
        return join_node

    if len(nodes) > 2:
        final_table, *former_tables = nodes
        former_qep = _node_sequence_to_qep(former_tables)
        final_qep = final_table._make_qep_node_for_scan()

        join_operator = _MysqlExplainJoinNodes.get(
            final_table.join_type, JoinOperator.NestedLoopJoin
        )
        join_node = QueryPlan(
            final_table.join_type,
            operator=join_operator,
            children=[former_qep, final_qep],
            estimated_cost=final_table.join_cost,
            estimated_cardinality=Cardinality(final_table.join_cardinality_estimate),
        )
        return join_node


class MysqlExplainNode:
    def __init__(
        self,
        scan_type: str = "",
        join_type: str = "",
        next_node: Optional[MysqlExplainNode] = None,
        *,
        node_type: Optional[str] = None,
        table: Optional[TableReference] = None,
        scan_cost: float = math.nan,
        join_cost: float = math.nan,
        scan_cardinality_estimate: float = math.nan,
        join_cardinality_estimate: float = math.nan,
        subquery_node: Optional[MysqlExplainNode] = None,
    ) -> None:
        self.scan_type = scan_type
        self.join_type = join_type
        self.node_type = node_type
        self.next_node = next_node
        self.table = table
        self.scan_cost = scan_cost
        self.join_cost = join_cost
        self.scan_cardinality_estimate = scan_cardinality_estimate
        self.join_cardinality_estimate = join_cardinality_estimate
        self.subquery = subquery_node

    def as_qep(self) -> QueryPlan:
        if self.node_type is not None:
            subquery_plan = (
                [self.subquery.as_qep()] if self.subquery is not None else []
            )
            own_node = QueryPlan(
                self.node_type,
                base_table=self.table,
                children=subquery_plan,
                estimated_cost=self.join_cost,
                estimated_cardinality=Cardinality(self.join_cardinality_estimate),
            )
            return own_node

        if not self.next_node:
            return self._make_qep_node_for_scan()

        node_sequence = self._collect_node_sequence()
        return _node_sequence_to_qep(node_sequence)

    def inspect(self, *, _indendation: int = 0) -> str:
        prefix = " " * _indendation + "-> " if _indendation else ""
        own_str = f"{prefix}{self}" if prefix else self._scan_str()

        if self.subquery is not None:
            subquery_str = self.subquery.inspect(_indendation=_indendation)
            return "\n".join((own_str, subquery_str))

        if self.next_node is None:
            return own_str

        next_str = self.next_node.inspect(_indendation=_indendation + 2)
        return "\n".join((own_str, next_str))

    def _collect_node_sequence(self) -> list[MysqlExplainNode]:
        if not self.next_node:
            return [self]
        return self.next_node._collect_node_sequence() + [self]

    def _make_qep_node_for_scan(self) -> QueryPlan:
        return QueryPlan(
            self.scan_type,
            base_table=self.table,
            operator=_MysqlExplainScanNodes.get(self.scan_type),
            estimated_cost=self.scan_cost,
            estimated_cardinality=Cardinality(self.scan_cardinality_estimate),
        )

    def _join_str(self) -> str:
        if self.node_type is not None:
            join_str = (
                f"Join[cost={self.join_cost}, cardinality={self.join_cardinality_estimate}]"
                if not math.isnan(self.join_cost)
                or not math.isnan(self.join_cardinality_estimate)
                else ""
            )
        else:
            join_str = f"{self.join_type} [cost={self.join_cost}, cardinality={self.join_cardinality_estimate}]"
        return join_str

    def _scan_str(self) -> str:
        if self.node_type is not None:
            scan_str = (
                f"Scan[cost={self.scan_cost}, cardinality={self.scan_cardinality_estimate}]"
                if not math.isnan(self.scan_cost)
                or not math.isnan(self.scan_cardinality_estimate)
                else ""
            )
        else:
            scan_str = f"{self.scan_type} [cost={self.scan_cost}, cardinality={self.scan_cardinality_estimate}]"
        if self.table is not None:
            scan_str += f" ON {self.table}"
        return scan_str

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        join_str, scan_str = self._join_str(), self._scan_str()
        if self.node_type is not None:
            node_str = str(self.node_type)
            if join_str:
                node_str += " " + join_str
            if scan_str:
                node_str += " " + scan_str
            return node_str

        return f"{join_str} USING {scan_str}"


class MysqlExplainPlan:
    def __init__(self, root: MysqlExplainNode, total_cost: float) -> None:
        self.root = root
        self.total_cost = total_cost

    def as_qep(self) -> QueryPlan:
        return self.root.as_qep()

    def inspect(self) -> str:
        return self.root.inspect()

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"Plan cost={self.total_cost}, Root={self.root}"
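Putting the pieces together, a round trip through this module could look like the following sketch. The table names are placeholders, and `mysql_db` is the interface returned by the `connect()` sketch further above; `query_plan()` runs a plain ``EXPLAIN FORMAT = JSON`` and converts the result via `parse_mysql_explain_plan`:

    optimizer = mysql_db.optimizer()
    plan = optimizer.query_plan("SELECT * FROM t1 JOIN t2 ON t1.id = t2.t1_id")
    print(plan)
    print(optimizer.cardinality_estimate("SELECT * FROM t1 JOIN t2 ON t1.id = t2.t1_id"))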