PostBOUND 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- postbound/__init__.py +211 -0
- postbound/_base.py +6 -0
- postbound/_bench.py +1012 -0
- postbound/_core.py +1153 -0
- postbound/_hints.py +1373 -0
- postbound/_jointree.py +1079 -0
- postbound/_pipelines.py +1121 -0
- postbound/_qep.py +1986 -0
- postbound/_stages.py +876 -0
- postbound/_validation.py +734 -0
- postbound/db/__init__.py +72 -0
- postbound/db/_db.py +2348 -0
- postbound/db/_duckdb.py +785 -0
- postbound/db/mysql.py +1195 -0
- postbound/db/postgres.py +4216 -0
- postbound/experiments/__init__.py +12 -0
- postbound/experiments/analysis.py +674 -0
- postbound/experiments/benchmarking.py +54 -0
- postbound/experiments/ceb.py +877 -0
- postbound/experiments/interactive.py +105 -0
- postbound/experiments/querygen.py +334 -0
- postbound/experiments/workloads.py +980 -0
- postbound/optimizer/__init__.py +92 -0
- postbound/optimizer/__init__.pyi +73 -0
- postbound/optimizer/_cardinalities.py +369 -0
- postbound/optimizer/_joingraph.py +1150 -0
- postbound/optimizer/dynprog.py +1825 -0
- postbound/optimizer/enumeration.py +432 -0
- postbound/optimizer/native.py +539 -0
- postbound/optimizer/noopt.py +54 -0
- postbound/optimizer/presets.py +147 -0
- postbound/optimizer/randomized.py +650 -0
- postbound/optimizer/tonic.py +1479 -0
- postbound/optimizer/ues.py +1607 -0
- postbound/qal/__init__.py +343 -0
- postbound/qal/_qal.py +9678 -0
- postbound/qal/formatter.py +1089 -0
- postbound/qal/parser.py +2344 -0
- postbound/qal/relalg.py +4257 -0
- postbound/qal/transform.py +2184 -0
- postbound/shortcuts.py +70 -0
- postbound/util/__init__.py +46 -0
- postbound/util/_errors.py +33 -0
- postbound/util/collections.py +490 -0
- postbound/util/dataframe.py +71 -0
- postbound/util/dicts.py +330 -0
- postbound/util/jsonize.py +68 -0
- postbound/util/logging.py +106 -0
- postbound/util/misc.py +168 -0
- postbound/util/networkx.py +401 -0
- postbound/util/numbers.py +438 -0
- postbound/util/proc.py +107 -0
- postbound/util/stats.py +37 -0
- postbound/util/system.py +48 -0
- postbound/util/typing.py +35 -0
- postbound/vis/__init__.py +5 -0
- postbound/vis/fdl.py +69 -0
- postbound/vis/graphs.py +48 -0
- postbound/vis/optimizer.py +538 -0
- postbound/vis/plots.py +84 -0
- postbound/vis/tonic.py +70 -0
- postbound/vis/trees.py +105 -0
- postbound-0.19.0.dist-info/METADATA +355 -0
- postbound-0.19.0.dist-info/RECORD +67 -0
- postbound-0.19.0.dist-info/WHEEL +5 -0
- postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
- postbound-0.19.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,980 @@
"""Provides abstractions to represent entire query workloads and utilities to read some pre-defined instances.

The main abstraction provided by this module is the `Workload`. A number of utility functions to read collections of queries
from different sources and input formats into workload objects exist as well. The pre-defined workloads include the
Join Order Benchmark [1]_ (including JOB-light [2]_ and JOB-Complex [3]_), Star Schema Benchmark [4]_, Stack Benchmark [5]_
and Stats Benchmark [6]_.

PostBOUND stores the workload queries in a dedicated directory, located relative to the user's home directory at
*$HOME/.postbound/*. If a workload is requested for the first time, it will be downloaded automatically. Therefore, the
initial usage of a new workload may take slightly longer than usual.

References
----------

.. [1] Viktor Leis et al.: How Good Are Query Optimizers, Really? (Proc. VLDB Endow. 9, 3 (2015))
.. [2] Andreas Kipf et al.: Learned Cardinalities: Estimating Correlated Joins with Deep Learning. (CIDR'2019)
.. [3] Johannes Wehrstein et al.: JOB-Complex: A Challenging Benchmark for Traditional & Learned Query Optimization. (AIDB'2025)
.. [4] Patrick E. O'Neil et al.: The Star Schema Benchmark and Augmented Fact Table Indexing. (TPCTC'2009)
.. [5] Ryan Marcus et al.: Bao: Making Learned Query Optimization Practical. (SIGMOD'2021)
.. [6] Yuxing Han et al.: Cardinality Estimation in DBMS: A Comprehensive Benchmark Evaluation (Proc. VLDB Endow. 15, 4 (2022))
"""

from __future__ import annotations

import collections
import pathlib
import random
import typing
import urllib.request
import zipfile
from collections.abc import Callable, Hashable, Iterable, Sequence
from typing import Literal, Optional

import natsort
import pandas as pd

from .. import util
from ..db._db import DatabasePool
from ..qal import parser
from ..qal._qal import SqlQuery

_WorkloadSources = {
    "job": "https://db4701.inf.tu-dresden.de:8443/public.php/dav/files/qQEBsM2Zx4x9BBW",
    "job-complex": "https://db4701.inf.tu-dresden.de:8443/public.php/dav/files/MBFejJXSdHbnoix",
    "job-light": "https://db4701.inf.tu-dresden.de:8443/public.php/dav/files/q4b9Mq6C485CnXw",
    "ssb": "https://db4701.inf.tu-dresden.de:8443/public.php/dav/files/iXD5p3J5q6DwdbQ",
    "stack": "https://db4701.inf.tu-dresden.de:8443/public.php/dav/files/AQgxPe9KrNGJ5nT",
    "stats": "https://db4701.inf.tu-dresden.de:8443/public.php/dav/files/STTdpKR3LB5ojt3",
}


def _fetch_workload(name: str) -> pathlib.Path:
    """Determines the local path of a workload, downloading it if necessary."""
    name = name.lower()
    workload_dir = pathlib.Path.home() / ".postbound" / "workloads" / name
    if workload_dir.exists():
        return workload_dir
    workload_dir.mkdir(parents=True, exist_ok=True)

    archive_url = _WorkloadSources.get(name)
    if not archive_url:
        raise ValueError(f"No known source for workload '{name}'")

    archive_file = workload_dir.parent / f"{name}.zip"
    urllib.request.urlretrieve(archive_url, archive_file)

    with zipfile.ZipFile(archive_file, "r") as zip_file:
        zip_file.extractall(workload_dir)

    archive_file.unlink()
    return workload_dir


LabelType = typing.TypeVar("LabelType", bound=Hashable)
"""The labels that are used to identify individual queries in a workload."""

NewLabelType = typing.TypeVar("NewLabelType", bound=Hashable)
"""In case of mutations of the workload labels, this denotes the new type of the labels after the mutation."""


class Workload(collections.UserDict[LabelType, SqlQuery]):
    """A workload collects a number of queries (read: benchmark) and provides utilities to operate on them conveniently.

    In addition to the actual queries, each query is annotated by a label that can be used to retrieve the query more
    conveniently. E.g. for queries in the Join Order Benchmark, access by label is supported - such as ``job["1a"]``.
    Labels can be arbitrary types as long as they are hashable. Since the workload inherits from dict, the label can be
    used directly to fetch the associated query (and will raise ``KeyError`` instances for unknown labels).

    Each workload can be given a name, which is mainly intended for readability in ``__str__`` methods and does not serve
    a functional purpose. However, it may be good practice to use a normalized name that can be used in different contexts,
    such as in file names.

    When using methods that allow iteration over the queries, they will typically be returned in order according to the
    natural order of the query labels. However, since workloads can be shuffled randomly, this order can also be destroyed.

    A workload is implemented as an immutable data object. Therefore, it is neither possible nor intended to change the
    contents of a workload object later on. All methods that mutate the contents instead provide new workload instances.

    Parameters
    ----------
    queries : dict[LabelType, SqlQuery]
        The queries that form the actual workload
    name : str, optional
        A name that can be used to identify or represent the workload, by default ``""``.
    root : Optional[pathlib.Path], optional
        The root directory that contains the workload queries. This is mainly used to identify the workload when no
        name is given or the workload contents do not match the expected queries. Defaults to ``None``.

    Notes
    -----
    Workloads support many of the Python builtin methods thanks to inheriting from ``UserDict``. Namely, *len*, *iter* and
    *in* work as expected on the labels. Furthermore, multiple workload objects can be added, subtracted and
    intersected using set semantics. Subtraction and intersection also work based on individual labels.
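
    Examples
    --------
    A small sketch of label-based access and the set-style operators (assumes the JOB workload is available, see `job`):

    >>> job_workload = job()  # doctest: +SKIP
    >>> job_workload["1a"]  # fetch a single query by its label  # doctest: +SKIP
    >>> reduced = job_workload - ["1a", "1b"]  # remove queries by label  # doctest: +SKIP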
    """

    @staticmethod
    def read(
        root_dir: str,
        *,
        query_file_pattern: str = "*.sql",
        name: str = "",
        label_prefix: str = "",
        file_encoding: str = "utf-8",
        bind_columns: bool = True,
    ) -> Workload[str]:
        """Reads all SQL queries from a specific directory into a workload object.

        This method assumes that the queries are stored in individual files, one query per file. The query labels will be
        constructed based on the file names of the source files. For example, a query contained in file ``q-1-1.sql`` will
        receive label ``q-1-1`` (note that the trailing file extension is dropped). If the `label_prefix` is given, it
        will be inserted before the file name-based label.

        Parameters
        ----------
        root_dir : str
            Directory containing the individual query files
        query_file_pattern : str, optional
            File name pattern that is shared by all query files. Only files matching the pattern will be read and each
            matching file is assumed to be a valid workload query. This is resolved as a glob expression. Defaults to
            ``"*.sql"``
        name : str, optional
            An optional name that can be used to identify the workload. Empty by default.
        label_prefix : str, optional
            A prefix to add before each query label. Empty by default. Notice that the prefix will be prepended as-is,
            i.e. no separator character is inserted. If a separator is desired, it has to be part of the prefix.
        file_encoding : str, optional
            The encoding of the query files. All files must share the same encoding. Defaults to UTF-8 encoding.
        bind_columns : bool, optional
            Whether unqualified column references should be bound to their tables based on the database schema. Binding
            requires an active database connection. Defaults to ``True``.

        Returns
        -------
        Workload[str]
            A workload consisting of all query files contained in the root directory.

        See Also
        --------
        pathlib.Path.glob
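
        Examples
        --------
        A minimal sketch (assumes a directory ``my_queries/`` containing files such as ``q1.sql``):

        >>> workload = Workload.read("my_queries", name="demo", bind_columns=False)  # doctest: +SKIP
        >>> workload["q1"]  # label derived from the file name  # doctest: +SKIP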
        """
        queries: dict[str, SqlQuery] = {}
        root = pathlib.Path(root_dir)

        for query_file_path in root.glob(query_file_pattern):
            with open(query_file_path, "r", encoding=file_encoding) as query_file:
                query_contents = query_file.read()
            try:
                parsed_query = parser.parse_query(
                    query_contents, bind_columns=bind_columns
                )
            except Exception as e:
                raise ValueError(
                    f"Could not parse query from {query_file_path}"
                ) from e
            query_label = query_file_path.stem
            queries[label_prefix + query_label] = parsed_query

        return Workload(queries, name=name, root=root)

    def __init__(
        self,
        queries: dict[LabelType, SqlQuery],
        name: str = "",
        root: Optional[pathlib.Path] = None,
    ) -> None:
        super().__init__(queries)
        self._name = name
        self._root = root

        self._sorted_labels = natsort.natsorted(list(self.keys()))
        self._sorted_queries: list[SqlQuery] = []
        self._update_query_order()

        self._label_mapping = util.dicts.invert(self.data)

    @property
    def name(self) -> str:
        """Provides the name of the workload.

        Returns
        -------
        str
            The name, or an empty string if no name has been specified.
        """
        return self._name

    def queries(self) -> Sequence[SqlQuery]:
        """Provides all queries in the workload in natural order (according to their labels).

        If the natural order was manually destroyed, e.g. by shuffling, the shuffled order is used.

        Returns
        -------
        Sequence[SqlQuery]
            The queries
        """
        return list(self._sorted_queries)

    def labels(self) -> Sequence[LabelType]:
        """Provides all query labels of the workload in natural order.

        If the natural order was manually destroyed, e.g. by shuffling, the shuffled order is used.

        Returns
        -------
        Sequence[LabelType]
            The labels
        """
        return list(self._sorted_labels)

    def entries(self) -> Sequence[tuple[LabelType, SqlQuery]]:
        """Provides all (label, query) pairs in the workload, in natural order of the query labels.

        If the natural order was manually destroyed, e.g. by shuffling, the shuffled order is used.

        Returns
        -------
        Sequence[tuple[LabelType, SqlQuery]]
            The queries along with their labels
        """
        return list(zip(self._sorted_labels, self._sorted_queries))

    def head(self) -> Optional[tuple[LabelType, SqlQuery]]:
        """Provides the first query in the workload.

        The first query is determined according to the natural order of the query labels by default. If that order was
        manually destroyed, e.g. by shuffling, the shuffled order is used.

        There is no policy to break ties in the order. An arbitrary query can be returned in this case.

        Returns
        -------
        Optional[tuple[LabelType, SqlQuery]]
            The first query, if there is at least one query in the workload. ``None`` otherwise.
        """
        if not self._sorted_labels:
            return None
        return self._sorted_labels[0], self._sorted_queries[0]

    def label_of(self, query: SqlQuery) -> LabelType:
        """Provides the label of the given query.

        Parameters
        ----------
        query : SqlQuery
            The query to check

        Returns
        -------
        LabelType
            The corresponding label

        Raises
        ------
        KeyError
            If the query is not part of the workload
        """
        return self._label_mapping[query]

    def with_labels(self, labels: Iterable[LabelType]) -> Workload[LabelType]:
        """Provides a new workload that contains only the queries with the specified labels.

        Parameters
        ----------
        labels : Iterable[LabelType]
            The labels to include in the new workload

        Returns
        -------
        Workload[LabelType]
            A workload that contains only the queries with the specified labels
        """
        labels = set(labels)
        selected_queries = {
            label: query for label, query in self.data.items() if label in labels
        }
        return Workload(selected_queries, name=self._name, root=self._root)

    def first(self, n: int) -> Workload[LabelType]:
        """Provides the first `n` queries of the workload, according to the natural order of the query labels.

        If there are fewer than `n` queries in the workload, all queries will be returned. Similar to other methods that
        rely on some sort of ordering of the queries, if the natural order has been manually broken due to shuffling, the
        shuffled order is used instead.

        Parameters
        ----------
        n : int
            The number of queries that should be returned

        Returns
        -------
        Workload[LabelType]
            A workload consisting of the first `n` queries of the current workload
        """
        first_n_labels = self._sorted_labels[:n]
        sub_workload = {label: self.data[label] for label in first_n_labels}
        return Workload(sub_workload, self._name, self._root)

    def last(self, n: int) -> Workload[LabelType]:
        """Provides the last `n` queries of the workload, according to the natural order of the query labels.

        If there are fewer than `n` queries in the workload, all queries will be returned. Similar to other methods that
        rely on some sort of ordering of the queries, if the natural order has been manually broken due to shuffling, the
        shuffled order is used instead.

        Parameters
        ----------
        n : int
            The number of queries that should be returned

        Returns
        -------
        Workload[LabelType]
            A workload consisting of the last `n` queries of the current workload
        """
        # guard against n == 0: slicing with [-0:] would select the entire list
        last_n_labels = self._sorted_labels[-n:] if n > 0 else []
        sub_workload = {label: self.data[label] for label in last_n_labels}
        return Workload(sub_workload, self._name, self._root)

    def pick_random(self, n: int) -> Workload[LabelType]:
        """Constructs a new workload, consisting of randomly selected queries from this workload.

        The new workload will once again be ordered according to the natural ordering of the labels.

        Parameters
        ----------
        n : int
            The number of queries to choose. If there are fewer queries in the workload, all will be selected.

        Returns
        -------
        Workload[LabelType]
            A workload consisting of `n` unique random queries from this workload
        """
        n = min(n, len(self._sorted_queries))
        selected_labels = random.sample(self._sorted_labels, n)
        sub_workload = {label: self.data[label] for label in selected_labels}
        return Workload(sub_workload, self._name, self._root)

    def with_prefix(self, label_prefix: LabelType) -> Workload[LabelType]:
        """Filters the workload for all queries that have a label starting with a specific prefix.

        This method requires that all label instances provide a `startswith` method (as is the case for simple string
        labels). Most significantly, this means that integer-based indexing does not work with the prefix-based filter.
        The *See Also* section provides some means to mitigate this problem.

        Parameters
        ----------
        label_prefix : LabelType
            The prefix to filter for

        Returns
        -------
        Workload[LabelType]
            All queries of this workload that have a label with a matching prefix. Queries will be sorted according to
            the natural order of their labels again.

        Raises
        ------
        ValueError
            If the prefix type does not provide a `startswith` method.

        See Also
        --------
        relabel - to change the labels into a type that provides `startswith`
        filter_by - to perform a custom prefix check for other types
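
        Examples
        --------
        A small sketch for string labels (``workload`` is a hypothetical ``Workload[str]`` instance):

        >>> workload.labels()  # doctest: +SKIP
        ['q1-1', 'q1-2', 'q2-1']
        >>> workload.with_prefix("q1").labels()  # doctest: +SKIP
        ['q1-1', 'q1-2']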
        """
        if not hasattr(label_prefix, "startswith"):
            raise ValueError("label_prefix must have startswith() method")
        prefix_queries = {
            label: query
            for label, query in self.data.items()
            if label.startswith(label_prefix)
        }
        return Workload(prefix_queries, name=self._name, root=self._root)

    def filter_by(
        self, predicate: Callable[[LabelType, SqlQuery], bool]
    ) -> Workload[LabelType]:
        """Provides all queries from the workload that match a specific predicate.

        Parameters
        ----------
        predicate : Callable[[LabelType, SqlQuery], bool]
            The filter condition. All queries that pass the check are included in the new workload. The filter predicate
            receives the label and the query for each query in the input

        Returns
        -------
        Workload[LabelType]
            All queries that passed the filter condition check. Queries will be sorted according to the natural order of
            their labels again.
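
        Examples
        --------
        A minimal sketch using only the label (any predicate over label and query works the same way):

        >>> q1_queries = workload.filter_by(lambda label, query: str(label).startswith("q1"))  # doctest: +SKIP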
        """
        matching_queries = {
            label: query
            for label, query in self.data.items()
            if predicate(label, query)
        }
        return Workload(matching_queries, name=self._name, root=self._root)

    def relabel(
        self, label_provider: Callable[[LabelType, SqlQuery], NewLabelType]
    ) -> Workload[NewLabelType]:
        """Constructs a new workload, leaving the queries intact but replacing the labels.

        The new workload will be ordered according to the natural order of the new labels.

        Parameters
        ----------
        label_provider : Callable[[LabelType, SqlQuery], NewLabelType]
            Replacement method that maps all old labels to the new label values. This method has to provide unique
            labels. If that is not the case, conflicts will be resolved, but in an arbitrary way. The replacement
            receives the old label as well as the query as input and produces the new label value.

        Returns
        -------
        Workload[NewLabelType]
            All queries of the current workload, but with new labels
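
        Examples
        --------
        A sketch that turns numeric labels into prefixed string labels (e.g. to make `with_prefix` applicable):

        >>> relabeled = workload.relabel(lambda label, query: f"q-{label}")  # doctest: +SKIP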
        """
        relabeled_queries = {
            label_provider(current_label, query): query
            for current_label, query in self.data.items()
        }
        return Workload(relabeled_queries, self._name, self._root)

    def shuffle(self) -> Workload[LabelType]:
        """Randomly changes the order of the queries in the workload.

        Returns
        -------
        Workload[LabelType]
            All queries of the current workload, but with the queries in a different order
        """
        shuffled_workload = Workload(self.data, self._name, self._root)
        shuffled_workload._sorted_labels = random.sample(
            self._sorted_labels, k=len(self)
        )
        shuffled_workload._update_query_order()
        return shuffled_workload

    def ordered(self) -> Workload[LabelType]:
        """Enforces the natural ordering of the queries according to their labels.

        Returns
        -------
        Workload[LabelType]
            All queries of the current workload, but in their natural order.
        """
        return Workload(self.data, self._name, self._root)

    def _update_query_order(self) -> None:
        """Enforces that the order of the queries matches the order of the labels."""
        self._sorted_queries = [self.data[label] for label in self._sorted_labels]

    def __add__(self, other: Workload[LabelType]) -> Workload[LabelType]:
        if not isinstance(other, Workload):
            raise TypeError("Can only add workloads together")
        return Workload(
            other.data | self.data, name=self._name, root=self._root
        )  # retain own labels in case of conflict

    def __sub__(self, other: Workload[LabelType]) -> Workload[LabelType]:
        if not isinstance(other, Workload) and isinstance(other, Iterable):
            labels_to_remove = set(other)
            reduced_workload = {
                label: query
                for label, query in self.data.items()
                if label not in labels_to_remove
            }
            return Workload(reduced_workload, name=self._name, root=self._root)
        elif not isinstance(other, Workload):
            raise TypeError("Expected workload or labels to subtract")
        return Workload(
            util.dicts.difference(self.data, other.data),
            name=self._name,
            root=self._root,
        )

    def __and__(self, other: Workload[LabelType]) -> Workload[LabelType]:
        if not isinstance(other, Workload) and isinstance(other, Iterable):
            labels_to_include = set(other)
            reduced_workload = {
                label: query
                for label, query in self.data.items()
                if label in labels_to_include
            }
            return Workload(reduced_workload, name=self._name, root=self._root)
        elif not isinstance(other, Workload):
            raise TypeError("Expected workload or labels to intersect")
        return Workload(
            util.dicts.intersection(self.data, other.data),
            name=self._name,
            root=self._root,
        )

    def __or__(self, other: Workload[LabelType]) -> Workload[LabelType]:
        if not isinstance(other, Workload):
            raise TypeError("Can only compute union of workloads")
        return Workload(
            other.data | self.data, name=self._name, root=self._root
        )  # retain own labels in case of conflict

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        if self._name:
            return f"Workload: {self._name} ({len(self)} queries)"
        elif self._root:
            return f"Workload: {self._root.stem} ({len(self)} queries)"
        else:
            return f"Workload: {len(self)} queries"


def read_workload(
    path: str,
    name: str = "",
    *,
    query_file_pattern: str = "*.sql",
    recurse_subdirectories: bool = False,
    query_label_prefix: str = "",
    file_encoding: str = "utf-8",
    bind_columns: bool = True,
) -> Workload[str]:
    """Loads a workload consisting of multiple files, potentially scattered across multiple directories.

    The main advantage of this method over using `Workload.read` directly is the support for recursive directory layouts:
    it can traverse subdirectories relative to the specified root and collect all workload files in a recursive manner.
    If subdirectories are used, their names will be used as prefixes to the query label, which is still inferred from the
    query file name.

    Parameters
    ----------
    path : str
        The root directory containing the workload files. Each query is expected to be stored in its own file.
    name : str, optional
        The name of the workload, by default ""
    query_file_pattern : str, optional
        A glob pattern that all query files have to match. All files that match the pattern are assumed to be valid query
        files. Defaults to ``"*.sql"``
    recurse_subdirectories : bool, optional
        Whether query files in subdirectories should be read as well. Defaults to ``False``, which emulates the behaviour
        of `Workload.read`
    query_label_prefix : str, optional
        A global prefix that should be added to all labels, no matter their placement in subdirectories. Defaults to an
        empty string.
    file_encoding : str, optional
        The encoding of the query files. All files must share a common encoding. Defaults to UTF-8
    bind_columns : bool, optional
        Whether unqualified column references should be bound to their tables based on the database schema. Binding
        requires an active database connection. Defaults to ``True``.

    Returns
    -------
    Workload[str]
        The workload
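
    Examples
    --------
    A minimal sketch (assumes a layout such as ``benchmark/q1/*.sql``; subdirectory names become label prefixes, e.g.
    ``q1/q1-001``):

    >>> workload = read_workload("benchmark", "my-benchmark", recurse_subdirectories=True)  # doctest: +SKIP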
    """
    base_dir_workload = Workload.read(
        path,
        name=name,
        query_file_pattern=query_file_pattern,
        label_prefix=query_label_prefix,
        file_encoding=file_encoding,
        bind_columns=bind_columns,
    )
    if not recurse_subdirectories:
        return base_dir_workload

    merged_queries = dict(base_dir_workload.data)
    root_dir = pathlib.Path(path)
    for subdir in root_dir.iterdir():
        if not subdir.is_dir():
            continue
        subdir_prefix = (
            (query_label_prefix + "/")
            if query_label_prefix and not query_label_prefix.endswith("/")
            else query_label_prefix
        )
        subdir_prefix += subdir.stem + "/"
        subdir_workload = read_workload(
            str(subdir),
            query_file_pattern=query_file_pattern,
            recurse_subdirectories=True,
            query_label_prefix=subdir_prefix,
            file_encoding=file_encoding,
            bind_columns=bind_columns,
        )
        merged_queries |= subdir_workload.data
    return Workload(merged_queries, name, root_dir)


def read_batch_workload(
    filename: str, name: str = "", *, file_encoding: str = "utf-8"
) -> Workload[int]:
    """Loads a workload consisting of multiple queries from a single file.

    The input file has to contain one valid SQL query per line. While empty lines are skipped, any non-SQL line will
    raise an error.

    The workload will have numeric labels: the query in the first line will have label 1, the second one label 2, and
    so on.

    Parameters
    ----------
    filename : str
        The file to load. The extension does not matter, as long as it contains plain text and each query is placed on a
        single and separate line.
    name : str, optional
        The name of the workload. If omitted, this defaults to the file name.
    file_encoding : str, optional
        The encoding of the workload file. Defaults to UTF-8

    Returns
    -------
    Workload[int]
        The workload
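
    Examples
    --------
    A minimal sketch (assumes a file ``batch.sql`` with one query per line):

    >>> workload = read_batch_workload("batch.sql")  # doctest: +SKIP
    >>> workload[1]  # the query from the first line  # doctest: +SKIP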
    """
    filepath = pathlib.Path(filename)
    name = name if name else filepath.stem
    with open(filename, "r", encoding=file_encoding) as query_file:
        raw_queries = query_file.readlines()
    parsed_queries = [parser.parse_query(raw) for raw in raw_queries if raw.strip()]
    return generate_workload(parsed_queries, name=name, workload_root=filepath)


def read_csv_workload(
    filename: str,
    name: str = "",
    *,
    query_column: str = "query",
    label_column: Optional[str] = None,
    file_encoding: str = "utf-8",
    pd_args: Optional[dict] = None,
) -> Workload[str] | Workload[int]:
    """Loads a workload consisting of queries from a CSV column.

    All queries are expected to be contained in the same column and each query is expected to be put onto its own row.

    The column containing the actual queries can be configured via the `query_column` parameter. Likewise, the
    CSV file can already provide query labels in the `label_column` column. If this parameter is omitted, labels will
    be inferred based on the row number.

    Parameters
    ----------
    filename : str
        The name of the CSV file to read. The extension does not matter, as long as the file can be read by the pandas
        CSV parser. The parser can receive additional arguments via the `pd_args` parameter.
    name : str, optional
        The name of the workload. If omitted, this defaults to the file name.
    query_column : str, optional
        The CSV column that contains the workload queries. All rows of that column will be read, by default "query"
    label_column : Optional[str], optional
        The column containing the query labels. Each query will receive the label from the `label_column` of the same
        row. If omitted, labels will be inferred based on the row number.
    file_encoding : str, optional
        The encoding of the CSV file. Defaults to UTF-8.
    pd_args : Optional[dict]
        Additional arguments to customize the behaviour of the `pandas.read_csv` method. They will be forwarded as-is.
        Consult the documentation of that method for more details on the allowed parameters and their functionality.

    Returns
    -------
    Workload[str] | Workload[int]
        The workload. It has string labels if `label_column` was provided, or numerical labels otherwise.

    See Also
    --------
    pandas.read_csv
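
    Examples
    --------
    A minimal sketch (assumes a CSV file ``queries.csv`` with columns ``label`` and ``query``):

    >>> workload = read_csv_workload("queries.csv", label_column="label")  # doctest: +SKIP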
    """
    filepath = pathlib.Path(filename)
    name = name if name else filepath.stem
    # parenthesize the conditional: without the parentheses, an omitted label column would drop the query column as well
    columns = [query_column] + ([label_column] if label_column else [])

    # Prepare the pd_args to not overwrite any of our custom parameters
    pd_args = dict(pd_args) if pd_args is not None else {}
    pd_args.pop("usecols", None)
    pd_args.pop("converters", None)
    pd_args.pop("encoding", None)

    workload_df = pd.read_csv(
        filename,
        usecols=columns,
        converters={query_column: parser.parse_query},
        encoding=file_encoding,
        **pd_args,
    )

    queries = workload_df[query_column].tolist()
    if label_column:
        labels = workload_df[label_column].tolist()
        label_provider = dict(zip(queries, labels))
    else:
        label_provider = None

    return generate_workload(
        queries, name=name, labels=label_provider, workload_root=filepath
    )


def generate_workload(
    queries: Iterable[SqlQuery],
    *,
    name: str = "",
    labels: Optional[dict[SqlQuery, LabelType]] = None,
    workload_root: Optional[pathlib.Path] = None,
) -> Workload[LabelType]:
    """Wraps a number of queries in a workload object.

    The queries can receive optional labels. If no explicit labels are provided, the queries receive numerical labels
    according to their position in the `queries` iterable (counting from 1).

    The workload will be named according to the optional `name` parameter. If no name is given, it will be inferred from
    the optional `workload_root`. If that fails as well, an empty name is used.

    Parameters
    ----------
    queries : Iterable[SqlQuery]
        The queries that should form the workload. This is only enumerated a single time, hence the iterable can "spend"
        its items.
    name : str, optional
        The name of the workload, by default ""
    labels : Optional[dict[SqlQuery, LabelType]], optional
        The labels of the workload queries. Defaults to ``None``, in which case numerical labels will be used. If labels
        are given, the label type is inferred from the dictionary values; otherwise it will be `int`.
    workload_root : Optional[pathlib.Path], optional
        The directory or file that originally contained the workload queries. Defaults to ``None`` if this is not known
        or not appropriate (e.g. for workloads that are read from a remote source)

    Returns
    -------
    Workload[LabelType]
        The workload
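
    Examples
    --------
    A minimal sketch (the query strings are hypothetical):

    >>> q1 = parser.parse_query("SELECT * FROM foo")  # doctest: +SKIP
    >>> q2 = parser.parse_query("SELECT * FROM bar")  # doctest: +SKIP
    >>> generate_workload([q1, q2], name="demo").labels()  # doctest: +SKIP
    [1, 2]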
    """
    name = name if name else (workload_root.stem if workload_root else "")
    if not labels:
        labels: dict[SqlQuery, int] = {
            query: idx + 1 for idx, query in enumerate(queries)
        }
    workload_contents = util.dicts.invert(labels)
    return Workload(workload_contents, name, workload_root)


def _assert_workload_loaded(
    workload: Workload[LabelType], expected_dir: pathlib.Path
) -> None:
    """Ensures that workload queries have been read successfully. The expected directory is used for error messages."""
    if not workload:
        raise ValueError(
            f"Could not load {workload.name} workload. "
            f"Please check {expected_dir} and make sure that it contains valid query files. "
            "If it does, please open an issue at https://github.com/rbergm/PostBOUND to report the problem. "
            "If there are no queries in the directory, try re-running the workload access and open an issue if the "
            "problem persists."
        )


def job(
    *,
    flavor: Literal["default", "light", "complex"] = "default",
    file_encoding: str = "utf-8",
) -> Workload[str]:
    """Reads the Join Order Benchmark, with labels according to the original paper (e.g. *1a*, *21c*, etc.).

    Parameters
    ----------
    flavor : Literal["default", "light", "complex"], optional
        The flavor of the JOB benchmark to load. The default flavor is the original JOB benchmark. Use "light" or
        "complex" to load the respective variants.
    file_encoding : str, optional
        The encoding of the query files, by default UTF-8.

    Returns
    -------
    Workload[str]
        The workload

    See Also
    --------
    job_light
    job_complex

    References
    ----------

    .. Viktor Leis et al.: "How Good Are Query Optimizers, Really?" (Proc. VLDB Endow. 9, 3 (2015))
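
    Examples
    --------
    A minimal sketch (the queries are downloaded on first use, see the module docstring):

    >>> job()["1a"]  # doctest: +SKIP
    >>> job(flavor="light")  # equivalent to job_light()  # doctest: +SKIP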
    """
    if flavor == "light":
        return job_light(file_encoding=file_encoding)
    elif flavor == "complex":
        return job_complex(file_encoding=file_encoding)

    workload_dir = _fetch_workload("JOB")
    # JOB only uses alias-qualified column references, so there is no need for explicit binding
    job_workload = Workload.read(
        workload_dir, name="JOB", file_encoding=file_encoding, bind_columns=False
    )
    _assert_workload_loaded(job_workload, workload_dir)
    return job_workload


def job_light(*, file_encoding: str = "utf-8") -> Workload[str]:
    """Reads the JOB-light benchmark, with numeric query labels (1, 2, 3, ...).

    Parameters
    ----------
    file_encoding : str, optional
        The encoding of the query files, by default UTF-8.

    Returns
    -------
    Workload[str]
        The workload

    References
    ----------

    .. Andreas Kipf et al.: "Learned Cardinalities: Estimating Correlated Joins with Deep Learning" (CIDR'2019)
    """
    workload_dir = _fetch_workload("job-light")
    # JOB-light only uses alias-qualified column references, so there is no need for explicit binding
    job_light_workload = Workload.read(
        workload_dir, name="JOB-light", file_encoding=file_encoding, bind_columns=False
    )
    _assert_workload_loaded(job_light_workload, workload_dir)
    return job_light_workload


def job_complex(*, file_encoding: str = "utf-8") -> Workload[str]:
    """Reads the JOB-Complex benchmark, with numeric query labels (1, 2, 3, ...).

    Parameters
    ----------
    file_encoding : str, optional
        The encoding of the query files, by default "utf-8".

    Returns
    -------
    Workload[str]
        The workload

    References
    ----------

    .. Johannes Wehrstein et al.: "JOB-Complex: A Challenging Benchmark for Traditional & Learned Query Optimization"
       (AIDB'2025)
    """
    workload_dir = _fetch_workload("job-complex")
    # JOB-Complex only uses alias-qualified column references, so there is no need for explicit binding
    job_complex_workload = Workload.read(
        workload_dir,
        name="JOB-complex",
        file_encoding=file_encoding,
        bind_columns=False,
    )
    _assert_workload_loaded(job_complex_workload, workload_dir)
    return job_complex_workload


def ssb(
    *, file_encoding: str = "utf-8", bind_columns: Optional[bool] = None
) -> Workload[str]:
    """Reads the Star Schema Benchmark, with labels according to the original data (e.g. *q1-1*, *q3-2*, etc.).

    Parameters
    ----------
    file_encoding : str, optional
        The encoding of the query files, by default UTF-8.
    bind_columns : Optional[bool], optional
        Whether all columns in the queries should be bound to their respective tables. Since SSB does not use qualified
        column names, these must be inferred from the database schema. Thus, an active database connection is required
        to bind the columns. By default, binding is attempted if there is an active database connection.

    Returns
    -------
    Workload[str]
        The workload

    References
    ----------

    .. Patrick E. O'Neil et al.: "The Star Schema Benchmark and Augmented Fact Table Indexing." (TPCTC'2009)
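
    Examples
    --------
    A minimal sketch (binding is disabled explicitly, so no database connection is required):

    >>> ssb_workload = ssb(bind_columns=False)  # doctest: +SKIP
    >>> ssb_workload["q1-1"]  # doctest: +SKIP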
    """
    bind_columns = (
        bind_columns
        if bind_columns is not None
        else not DatabasePool.get_instance().empty()
    )
    workload_dir = _fetch_workload("ssb")
    ssb_workload = Workload.read(
        workload_dir, name="SSB", file_encoding=file_encoding, bind_columns=bind_columns
    )
    _assert_workload_loaded(ssb_workload, workload_dir)
    return ssb_workload


def stack(
    *,
    file_encoding: str = "utf-8",
    bind_columns: Optional[bool] = None,
) -> Workload[str]:
    """Reads the Stack Benchmark, as shipped with the PostBOUND repository.

    Most queries use semi-numeric labels consisting of the context and the query number, e.g., *q1/q1-001*. However,
    some queries have completely random names, such as *q16/fc8f97968b9fce81df4011c8175eada15541abe0*. Still, all
    queries are grouped into one of 16 contexts.

    Parameters
    ----------
    file_encoding : str, optional
        The encoding of the query files, by default UTF-8.
    bind_columns : Optional[bool], optional
        Whether all columns in the queries should be bound to their respective tables. Since Stack does not use
        qualified column names in some queries, these must be inferred from the database schema. Thus, an active
        database connection is required to bind the columns. By default, binding is attempted if there is an active
        database connection.

    Returns
    -------
    Workload[str]
        The workload.

    References
    ----------

    .. Ryan Marcus et al.: "Bao: Making Learned Query Optimization Practical." (SIGMOD'2021)
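
    Examples
    --------
    A minimal sketch using the context-based grouping (see `Workload.with_prefix`):

    >>> stack_workload = stack(bind_columns=False)  # doctest: +SKIP
    >>> q3_context = stack_workload.with_prefix("q3/")  # doctest: +SKIP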
    """
    bind_columns = (
        bind_columns
        if bind_columns is not None
        else not DatabasePool.get_instance().empty()
    )
    workload_dir = _fetch_workload("stack")

    stack_workload = read_workload(
        workload_dir,
        "Stack",
        recurse_subdirectories=True,
        file_encoding=file_encoding,
        bind_columns=bind_columns,
    )
    _assert_workload_loaded(stack_workload, workload_dir)
    return stack_workload


def stats(*, file_encoding: str = "utf-8") -> Workload[str]:
    """Reads the Stats Benchmark, with semi-numeric query labels (e.g. *q-1*, *q-2*, etc.).

    Parameters
    ----------
    file_encoding : str, optional
        The encoding of the query files, by default UTF-8.

    Returns
    -------
    Workload[str]
        The workload

    References
    ----------

    .. Yuxing Han et al.: Cardinality Estimation in DBMS: A Comprehensive Benchmark Evaluation (Proc. VLDB Endow. 15, 4 (2022))
    """
    workload_dir = _fetch_workload("stats")
    stats_workload = Workload.read(
        workload_dir, name="Stats", file_encoding=file_encoding, bind_columns=False
    )
    _assert_workload_loaded(stats_workload, workload_dir)
    return stats_workload