PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. postbound/__init__.py +211 -0
  2. postbound/_base.py +6 -0
  3. postbound/_bench.py +1012 -0
  4. postbound/_core.py +1153 -0
  5. postbound/_hints.py +1373 -0
  6. postbound/_jointree.py +1079 -0
  7. postbound/_pipelines.py +1121 -0
  8. postbound/_qep.py +1986 -0
  9. postbound/_stages.py +876 -0
  10. postbound/_validation.py +734 -0
  11. postbound/db/__init__.py +72 -0
  12. postbound/db/_db.py +2348 -0
  13. postbound/db/_duckdb.py +785 -0
  14. postbound/db/mysql.py +1195 -0
  15. postbound/db/postgres.py +4216 -0
  16. postbound/experiments/__init__.py +12 -0
  17. postbound/experiments/analysis.py +674 -0
  18. postbound/experiments/benchmarking.py +54 -0
  19. postbound/experiments/ceb.py +877 -0
  20. postbound/experiments/interactive.py +105 -0
  21. postbound/experiments/querygen.py +334 -0
  22. postbound/experiments/workloads.py +980 -0
  23. postbound/optimizer/__init__.py +92 -0
  24. postbound/optimizer/__init__.pyi +73 -0
  25. postbound/optimizer/_cardinalities.py +369 -0
  26. postbound/optimizer/_joingraph.py +1150 -0
  27. postbound/optimizer/dynprog.py +1825 -0
  28. postbound/optimizer/enumeration.py +432 -0
  29. postbound/optimizer/native.py +539 -0
  30. postbound/optimizer/noopt.py +54 -0
  31. postbound/optimizer/presets.py +147 -0
  32. postbound/optimizer/randomized.py +650 -0
  33. postbound/optimizer/tonic.py +1479 -0
  34. postbound/optimizer/ues.py +1607 -0
  35. postbound/qal/__init__.py +343 -0
  36. postbound/qal/_qal.py +9678 -0
  37. postbound/qal/formatter.py +1089 -0
  38. postbound/qal/parser.py +2344 -0
  39. postbound/qal/relalg.py +4257 -0
  40. postbound/qal/transform.py +2184 -0
  41. postbound/shortcuts.py +70 -0
  42. postbound/util/__init__.py +46 -0
  43. postbound/util/_errors.py +33 -0
  44. postbound/util/collections.py +490 -0
  45. postbound/util/dataframe.py +71 -0
  46. postbound/util/dicts.py +330 -0
  47. postbound/util/jsonize.py +68 -0
  48. postbound/util/logging.py +106 -0
  49. postbound/util/misc.py +168 -0
  50. postbound/util/networkx.py +401 -0
  51. postbound/util/numbers.py +438 -0
  52. postbound/util/proc.py +107 -0
  53. postbound/util/stats.py +37 -0
  54. postbound/util/system.py +48 -0
  55. postbound/util/typing.py +35 -0
  56. postbound/vis/__init__.py +5 -0
  57. postbound/vis/fdl.py +69 -0
  58. postbound/vis/graphs.py +48 -0
  59. postbound/vis/optimizer.py +538 -0
  60. postbound/vis/plots.py +84 -0
  61. postbound/vis/tonic.py +70 -0
  62. postbound/vis/trees.py +105 -0
  63. postbound-0.19.0.dist-info/METADATA +355 -0
  64. postbound-0.19.0.dist-info/RECORD +67 -0
  65. postbound-0.19.0.dist-info/WHEEL +5 -0
  66. postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
  67. postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/experiments/workloads.py
@@ -0,0 +1,980 @@
"""Provides abstractions to represent entire query workloads and utilities to read some pre-defined instances.

The main abstraction provided by this module is the `Workload`. A number of utility functions to read collections of queries
from different sources and input formats into workload objects exist as well. The pre-defined workloads include the
Join Order Benchmark [1]_ (including JOB-light [2]_ and JOB-Complex [3]_), the Star Schema Benchmark [4]_, the Stack
Benchmark [5]_, and the Stats Benchmark [6]_.

PostBOUND stores the workload queries in a dedicated directory, located relative to the user's home directory at
*$HOME/.postbound/*. If a workload is requested for the first time, it is downloaded automatically. Therefore, the first
usage of a new workload may take slightly longer than usual.

References
----------

.. [1] Viktor Leis et al.: How Good Are Query Optimizers, Really? (Proc. VLDB Endow. 9, 3 (2015))
.. [2] Andreas Kipf et al.: Learned Cardinalities: Estimating Correlated Joins with Deep Learning. (CIDR'2019)
.. [3] Johannes Wehrstein et al.: JOB-Complex: A Challenging Benchmark for Traditional & Learned Query Optimization. (AIDB'2025)
.. [4] Patrick E. O'Neil et al.: The Star Schema Benchmark and Augmented Fact Table Indexing. (TPCTC'2009)
.. [5] Ryan Marcus et al.: Bao: Making Learned Query Optimization Practical. (SIGMOD'2021)
.. [6] Yuxing Han et al.: Cardinality Estimation in DBMS: A Comprehensive Benchmark Evaluation (Proc. VLDB Endow. 15, 4 (2022))
"""

from __future__ import annotations

import collections
import pathlib
import random
import typing
import urllib.request
import zipfile
from collections.abc import Callable, Hashable, Iterable, Sequence
from typing import Literal, Optional

import natsort
import pandas as pd

from .. import util
from ..db._db import DatabasePool
from ..qal import parser
from ..qal._qal import SqlQuery

_WorkloadSources = {
    "job": "https://db4701.inf.tu-dresden.de:8443/public.php/dav/files/qQEBsM2Zx4x9BBW",
    "job-complex": "https://db4701.inf.tu-dresden.de:8443/public.php/dav/files/MBFejJXSdHbnoix",
    "job-light": "https://db4701.inf.tu-dresden.de:8443/public.php/dav/files/q4b9Mq6C485CnXw",
    "ssb": "https://db4701.inf.tu-dresden.de:8443/public.php/dav/files/iXD5p3J5q6DwdbQ",
    "stack": "https://db4701.inf.tu-dresden.de:8443/public.php/dav/files/AQgxPe9KrNGJ5nT",
    "stats": "https://db4701.inf.tu-dresden.de:8443/public.php/dav/files/STTdpKR3LB5ojt3",
}


def _fetch_workload(name: str) -> pathlib.Path:
    """Determines the local path of a workload, downloading it if necessary."""
    name = name.lower()

    # Resolve the source before touching the cache: an unknown name must not leave an empty
    # directory behind, which would be returned as a valid cache hit on the next call.
    archive_url = _WorkloadSources.get(name)
    if not archive_url:
        raise ValueError(f"No known source for workload '{name}'")

    workload_dir = pathlib.Path.home() / ".postbound" / "workloads" / name
    if workload_dir.exists():
        return workload_dir
    workload_dir.mkdir(parents=True, exist_ok=True)

    archive_file = workload_dir.parent / f"{name}.zip"
    urllib.request.urlretrieve(archive_url, archive_file)

    # "archive" instead of "zip" to avoid shadowing the builtin
    with zipfile.ZipFile(archive_file, "r") as archive:
        archive.extractall(workload_dir)

    archive_file.unlink()
    return workload_dir
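
# Illustrative usage sketch (not part of the original module; assumes the cache layout above):
# deleting a workload's cache directory forces a fresh download on the next access.
#
#     import pathlib, shutil
#     shutil.rmtree(pathlib.Path.home() / ".postbound" / "workloads" / "job", ignore_errors=True)
#     job_dir = _fetch_workload("job")  # re-downloads and unpacks the archive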


LabelType = typing.TypeVar("LabelType", bound=Hashable)
"""The labels that are used to identify individual queries in a workload."""

NewLabelType = typing.TypeVar("NewLabelType", bound=Hashable)
"""In case of mutations of the workload labels, this denotes the new type of the labels after the mutation."""


class Workload(collections.UserDict[LabelType, SqlQuery]):
    """A workload collects a number of queries (read: a benchmark) and provides utilities to operate on them conveniently.

    In addition to the actual queries, each query is annotated by a label that can be used to retrieve the query more
    conveniently. E.g. queries of the Join Order Benchmark can be accessed by their paper label, such as ``job["1a"]``.
    Labels can be of arbitrary types as long as they are hashable. Since the workload inherits from dict, the label can
    be used directly to fetch the associated query (and will raise ``KeyError`` instances for unknown labels).

    Each workload can be given a name, which is mainly intended for readability in ``__str__`` methods and does not serve
    a functional purpose. However, it may be good practice to use a normalized name that can be used in different
    contexts, such as in file names.

    Methods that iterate over the queries typically return them according to the natural order of the query labels.
    However, since workloads can be shuffled randomly, this order can also be destroyed.

    A workload is implemented as an immutable data object. Therefore, it is not possible (nor intended) to change the
    contents of a workload object later on. All methods that mutate the contents provide new workload instances instead.

    Parameters
    ----------
    queries : dict[LabelType, SqlQuery]
        The queries that form the actual workload
    name : str, optional
        A name that can be used to identify or represent the workload, by default ``""``.
    root : Optional[pathlib.Path], optional
        The root directory that contains the workload queries. This is mainly used to identify the workload when no
        name is given or the workload contents do not match the expected queries. Defaults to ``None``.

    Notes
    -----
    Workloads support many of the Python built-in methods thanks to inheriting from ``UserDict``. Namely, *len*, *iter*
    and *in* work as expected on the labels. Furthermore, multiple workload objects can be added, subtracted and
    intersected using set semantics. Subtraction and intersection also work based on individual labels.
    """

    @staticmethod
    def read(
        root_dir: str | pathlib.Path,
        *,
        query_file_pattern: str = "*.sql",
        name: str = "",
        label_prefix: str = "",
        file_encoding: str = "utf-8",
        bind_columns: bool = True,
    ) -> Workload[str]:
        """Reads all SQL queries from a specific directory into a workload object.

        This method assumes that the queries are stored in individual files, one query per file. The query labels are
        constructed from the file names of the source files. For example, a query contained in file ``q-1-1.sql``
        receives label ``q-1-1`` (note that the trailing file extension is dropped). If a `label_prefix` is given, it
        is inserted before the file name-based label.

        Parameters
        ----------
        root_dir : str | pathlib.Path
            Directory containing the individual query files
        query_file_pattern : str, optional
            File name pattern that is shared by all query files. Only files matching the pattern will be read and each
            matching file is assumed to be a valid workload query. This is resolved as a glob expression. Defaults to
            ``"*.sql"``
        name : str, optional
            An optional name that can be used to identify the workload. Empty by default.
        label_prefix : str, optional
            A prefix to add before each query label. Empty by default. Notice that the prefix will be prepended as-is,
            i.e. no separator character is inserted. If a separator is desired, it has to be part of the prefix.
        file_encoding : str, optional
            The encoding of the query files. All files must share the same encoding. Defaults to UTF-8 encoding.
        bind_columns : bool, optional
            Whether columns in the queries should be bound to their tables while parsing. Defaults to ``True``.

        Returns
        -------
        Workload[str]
            A workload consisting of all query files contained in the root directory.

        See Also
        --------
        pathlib.Path.glob
        """
        queries: dict[str, SqlQuery] = {}
        root = pathlib.Path(root_dir)

        for query_file_path in root.glob(query_file_pattern):
            with open(query_file_path, "r", encoding=file_encoding) as query_file:
                # read() keeps the original line breaks; re-joining readlines() output with "\n"
                # would duplicate them
                query_contents = query_file.read()
            try:
                parsed_query = parser.parse_query(
                    query_contents, bind_columns=bind_columns
                )
            except Exception as e:
                raise ValueError(
                    f"Could not parse query from {query_file_path}"
                ) from e
            query_label = query_file_path.stem
            queries[label_prefix + query_label] = parsed_query

        return Workload(queries, name=name, root=root)
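
    # Illustrative usage sketch (not part of the original module; assumes a directory ./queries
    # containing q1.sql and q2.sql):
    #
    #     wl = Workload.read("./queries", name="demo")
    #     wl["q1"]   # the parsed SqlQuery from q1.sql
    #     len(wl)    # 2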

    def __init__(
        self,
        queries: dict[LabelType, SqlQuery],
        name: str = "",
        root: Optional[pathlib.Path] = None,
    ) -> None:
        super().__init__(queries)
        self._name = name
        self._root = root

        self._sorted_labels = natsort.natsorted(list(self.keys()))
        self._sorted_queries: list[SqlQuery] = []
        self._update_query_order()

        self._label_mapping = util.dicts.invert(self.data)

    @property
    def name(self) -> str:
        """Provides the name of the workload.

        Returns
        -------
        str
            The name or an empty string if no name has been specified.
        """
        return self._name

    def queries(self) -> Sequence[SqlQuery]:
        """Provides all queries in the workload in natural order (according to their labels).

        If the natural order was manually destroyed, e.g. by shuffling, the shuffled order is used.

        Returns
        -------
        Sequence[SqlQuery]
            The queries
        """
        return list(self._sorted_queries)

    def labels(self) -> Sequence[LabelType]:
        """Provides all query labels of the workload in natural order.

        If the natural order was manually destroyed, e.g. by shuffling, the shuffled order is used.

        Returns
        -------
        Sequence[LabelType]
            The labels
        """
        return list(self._sorted_labels)

    def entries(self) -> Sequence[tuple[LabelType, SqlQuery]]:
        """Provides all (label, query) pairs in the workload, in natural order of the query labels.

        If the natural order was manually destroyed, e.g. by shuffling, the shuffled order is used.

        Returns
        -------
        Sequence[tuple[LabelType, SqlQuery]]
            The queries along with their labels
        """
        return list(zip(self._sorted_labels, self._sorted_queries))

    def head(self) -> Optional[tuple[LabelType, SqlQuery]]:
        """Provides the first query in the workload.

        The first query is determined according to the natural order of the query labels by default. If that order was
        manually destroyed, e.g. by shuffling, the shuffled order is used.

        There is no policy to break ties in the order. An arbitrary query can be returned in this case.

        Returns
        -------
        Optional[tuple[LabelType, SqlQuery]]
            The first query, if there is at least one query in the workload. ``None`` otherwise.
        """
        if not self._sorted_labels:
            return None
        return self._sorted_labels[0], self._sorted_queries[0]

    def label_of(self, query: SqlQuery) -> LabelType:
        """Provides the label of the given query.

        Parameters
        ----------
        query : SqlQuery
            The query to check

        Returns
        -------
        LabelType
            The corresponding label

        Raises
        ------
        KeyError
            If the query is not part of the workload
        """
        return self._label_mapping[query]
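
    # Illustrative sketch (not part of the original module): the accessors share the natural label
    # order, so labels(), queries() and entries() line up pairwise.
    #
    #     for label, query in workload.entries():
    #         assert workload.label_of(query) == label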

    def with_labels(self, labels: Iterable[LabelType]) -> Workload[LabelType]:
        """Provides a new workload that contains only the queries with the specified labels.

        Parameters
        ----------
        labels : Iterable[LabelType]
            The labels to include in the new workload

        Returns
        -------
        Workload[LabelType]
            A workload that contains only the queries with the specified labels
        """
        labels = set(labels)
        selected_queries = {
            label: query for label, query in self.data.items() if label in labels
        }
        return Workload(selected_queries, name=self._name, root=self._root)

    def first(self, n: int) -> Workload[LabelType]:
        """Provides the first `n` queries of the workload, according to the natural order of the query labels.

        If there are fewer than `n` queries in the workload, all queries will be returned. Similar to other methods
        that rely on some sort of ordering of the queries, if the natural order has been manually broken due to
        shuffling, the shuffled order is used instead.

        Parameters
        ----------
        n : int
            The number of queries that should be returned

        Returns
        -------
        Workload[LabelType]
            A workload consisting of the first `n` queries of the current workload
        """
        first_n_labels = self._sorted_labels[:n]
        sub_workload = {label: self.data[label] for label in first_n_labels}
        return Workload(sub_workload, self._name, self._root)

    def last(self, n: int) -> Workload[LabelType]:
        """Provides the last `n` queries of the workload, according to the natural order of the query labels.

        If there are fewer than `n` queries in the workload, all queries will be returned. Similar to other methods
        that rely on some sort of ordering of the queries, if the natural order has been manually broken due to
        shuffling, the shuffled order is used instead.

        Parameters
        ----------
        n : int
            The number of queries that should be returned

        Returns
        -------
        Workload[LabelType]
            A workload consisting of the last `n` queries of the current workload
        """
        last_n_labels = self._sorted_labels[-n:]
        sub_workload = {label: self.data[label] for label in last_n_labels}
        return Workload(sub_workload, self._name, self._root)

    def pick_random(self, n: int) -> Workload[LabelType]:
        """Constructs a new workload, consisting of randomly selected queries from this workload.

        The new workload will once again be ordered according to the natural ordering of the labels.

        Parameters
        ----------
        n : int
            The number of queries to choose. If there are fewer queries in the workload, all will be selected.

        Returns
        -------
        Workload[LabelType]
            A workload consisting of `n` unique random queries from this workload
        """
        n = min(n, len(self._sorted_queries))
        selected_labels = random.sample(self._sorted_labels, n)
        sub_workload = {label: self.data[label] for label in selected_labels}
        return Workload(sub_workload, self._name, self._root)

    def with_prefix(self, label_prefix: LabelType) -> Workload[LabelType]:
        """Filters the workload for all queries whose label starts with a specific prefix.

        This method requires that all label instances provide a `startswith` method (as is the case for simple string
        labels). Most significantly, this means that integer-based labels do not work with the prefix-based filter.
        The *See Also* section provides some means to mitigate this problem.

        Parameters
        ----------
        label_prefix : LabelType
            The prefix to filter for

        Returns
        -------
        Workload[LabelType]
            All queries of this workload that have a label with a matching prefix. Queries will be sorted according to
            the natural order of their labels again.

        Raises
        ------
        ValueError
            If the prefix type does not provide a `startswith` method.

        See Also
        --------
        relabel : to change the labels into a type that provides `startswith`
        filter_by : to perform a custom prefix check for other types
        """
        if not hasattr(label_prefix, "startswith"):
            raise ValueError("label_prefix must provide a startswith() method")
        prefix_queries = {
            label: query
            for label, query in self.data.items()
            if label.startswith(label_prefix)
        }
        return Workload(prefix_queries, name=self._name, root=self._root)
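
    # Illustrative sketch (not part of the original module): for labels without startswith(), an
    # equivalent prefix filter can be phrased via filter_by() or after relabel(), e.g. for int labels:
    #
    #     workload.filter_by(lambda label, query: str(label).startswith("1"))
    #     workload.relabel(lambda label, query: str(label)).with_prefix("1")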

    def filter_by(
        self, predicate: Callable[[LabelType, SqlQuery], bool]
    ) -> Workload[LabelType]:
        """Provides all queries from the workload that match a specific predicate.

        Parameters
        ----------
        predicate : Callable[[LabelType, SqlQuery], bool]
            The filter condition. All queries that pass the check are included in the new workload. The filter
            predicate receives the label and the query for each query in the input.

        Returns
        -------
        Workload[LabelType]
            All queries that passed the filter condition check. Queries will be sorted according to the natural order
            of their labels again.
        """
        matching_queries = {
            label: query
            for label, query in self.data.items()
            if predicate(label, query)
        }
        return Workload(matching_queries, name=self._name, root=self._root)

    def relabel(
        self, label_provider: Callable[[LabelType, SqlQuery], NewLabelType]
    ) -> Workload[NewLabelType]:
        """Constructs a new workload, leaving the queries intact but replacing the labels.

        The new workload will be ordered according to the natural order of the new labels.

        Parameters
        ----------
        label_provider : Callable[[LabelType, SqlQuery], NewLabelType]
            Replacement method that maps all old labels to the new label values. This method has to provide unique
            labels. If that is not the case, conflicts will be resolved, but in an arbitrary way. The replacement
            receives the old label as well as the query as input and produces the new label value.

        Returns
        -------
        Workload[NewLabelType]
            All queries of the current workload, but with new labels
        """
        relabeled_queries = {
            label_provider(current_label, query): query
            for current_label, query in self.data.items()
        }
        return Workload(relabeled_queries, self._name, self._root)

    def shuffle(self) -> Workload[LabelType]:
        """Randomly changes the order of the queries in the workload.

        Returns
        -------
        Workload[LabelType]
            All queries of the current workload, but with the queries in a different order
        """
        shuffled_workload = Workload(self.data, self._name, self._root)
        shuffled_workload._sorted_labels = random.sample(
            self._sorted_labels, k=len(self)
        )
        shuffled_workload._update_query_order()
        return shuffled_workload

    def ordered(self) -> Workload[LabelType]:
        """Enforces the natural ordering of the queries according to their labels.

        Returns
        -------
        Workload[LabelType]
            All queries of the current workload, but in their natural order.
        """
        return Workload(self.data, self._name, self._root)

    def _update_query_order(self) -> None:
        """Enforces that the order of the queries matches the order of the labels."""
        self._sorted_queries = [self.data[label] for label in self._sorted_labels]

    def __add__(self, other: Workload[LabelType]) -> Workload[LabelType]:
        if not isinstance(other, Workload):
            raise TypeError("Can only add workloads together")
        return Workload(
            other.data | self.data, name=self._name, root=self._root
        )  # retain own labels in case of conflict

    def __sub__(
        self, other: Workload[LabelType] | Iterable[LabelType]
    ) -> Workload[LabelType]:
        if not isinstance(other, Workload) and isinstance(other, Iterable):
            labels_to_remove = set(other)
            reduced_workload = {
                label: query
                for label, query in self.data.items()
                if label not in labels_to_remove
            }
            return Workload(reduced_workload, name=self._name, root=self._root)
        elif not isinstance(other, Workload):
            raise TypeError("Expected workload or labels to subtract")
        return Workload(
            util.dicts.difference(self.data, other.data),
            name=self._name,
            root=self._root,
        )

    def __and__(
        self, other: Workload[LabelType] | Iterable[LabelType]
    ) -> Workload[LabelType]:
        if not isinstance(other, Workload) and isinstance(other, Iterable):
            labels_to_include = set(other)
            reduced_workload = {
                label: query
                for label, query in self.data.items()
                if label in labels_to_include
            }
            return Workload(reduced_workload, name=self._name, root=self._root)
        elif not isinstance(other, Workload):
            raise TypeError("Expected workload or labels to intersect")
        return Workload(
            util.dicts.intersection(self.data, other.data),
            name=self._name,
            root=self._root,
        )

    def __or__(self, other: Workload[LabelType]) -> Workload[LabelType]:
        if not isinstance(other, Workload):
            raise TypeError("Can only compute union of workloads")
        return Workload(
            other.data | self.data, name=self._name, root=self._root
        )  # retain own labels in case of conflict

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        if self._name:
            return f"Workload: {self._name} ({len(self)} queries)"
        elif self._root:
            return f"Workload: {self._root.stem} ({len(self)} queries)"
        else:
            return f"Workload: {len(self)} queries"


def read_workload(
    path: str | pathlib.Path,
    name: str = "",
    *,
    query_file_pattern: str = "*.sql",
    recurse_subdirectories: bool = False,
    query_label_prefix: str = "",
    file_encoding: str = "utf-8",
    bind_columns: bool = True,
) -> Workload[str]:
    """Loads a workload consisting of multiple files, potentially scattered across multiple directories.

    The main advantage of this method over using `Workload.read` directly is the support for recursive directory
    layouts: it can traverse subdirectories relative to the specified root and collect all workload files in a
    recursive manner. If subdirectories are used, their names will be used as prefixes to the query label, which is
    still inferred from the query file name.

    Parameters
    ----------
    path : str | pathlib.Path
        The root directory containing the workload files. Each query is expected to be stored in its own file.
    name : str, optional
        The name of the workload, by default ""
    query_file_pattern : str, optional
        A glob pattern that all query files have to match. All files that match the pattern are assumed to be valid
        query files. Defaults to ``"*.sql"``
    recurse_subdirectories : bool, optional
        Whether query files in subdirectories should be read as well. Defaults to ``False``, which emulates the
        behaviour of `Workload.read`
    query_label_prefix : str, optional
        A global prefix that should be added to all labels, no matter their placement in subdirectories. Defaults to
        an empty string.
    file_encoding : str, optional
        The encoding of the query files. All files must share a common encoding. Defaults to UTF-8
    bind_columns : bool, optional
        Whether columns in the queries should be bound to their tables while parsing. Defaults to ``True``.

    Returns
    -------
    Workload[str]
        The workload
    """
    base_dir_workload = Workload.read(
        path,
        name=name,
        query_file_pattern=query_file_pattern,
        label_prefix=query_label_prefix,
        file_encoding=file_encoding,
        bind_columns=bind_columns,
    )
    if not recurse_subdirectories:
        return base_dir_workload

    merged_queries = dict(base_dir_workload.data)
    root_dir = pathlib.Path(path)
    for subdir in root_dir.iterdir():
        if not subdir.is_dir():
            continue
        subdir_prefix = (
            (query_label_prefix + "/")
            if query_label_prefix and not query_label_prefix.endswith("/")
            else query_label_prefix
        )
        subdir_prefix += subdir.stem + "/"
        subdir_workload = read_workload(
            str(subdir),
            query_file_pattern=query_file_pattern,
            recurse_subdirectories=True,
            query_label_prefix=subdir_prefix,
            # forward the encoding as well -- the original recursion silently fell back to UTF-8
            file_encoding=file_encoding,
            bind_columns=bind_columns,
        )
        merged_queries |= subdir_workload.data
    return Workload(merged_queries, name, root_dir)
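

# Illustrative sketch (not part of the original module): given a layout such as
#
#     ./stack/q1/q1-001.sql
#     ./stack/q2/q2-007.sql
#
# recursive reading yields subdirectory-prefixed labels:
#
#     wl = read_workload("./stack", "Stack", recurse_subdirectories=True)
#     wl["q1/q1-001"]  # the query from ./stack/q1/q1-001.sql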


def read_batch_workload(
    filename: str, name: str = "", *, file_encoding: str = "utf-8"
) -> Workload[int]:
    """Loads a workload consisting of multiple queries from a single file.

    The input file has to contain one valid SQL query per line. While empty lines are skipped, any non-SQL line will
    raise an error.

    The workload will have numeric labels: the query in the first line will have label 1, the second one label 2, and
    so on.

    Parameters
    ----------
    filename : str
        The file to load. The extension does not matter, as long as it contains plain text and each query is placed on
        a single and separate line.
    name : str, optional
        The name of the workload. If omitted, this defaults to the file name.
    file_encoding : str, optional
        The encoding of the workload file. Defaults to UTF-8

    Returns
    -------
    Workload[int]
        The workload
    """
    filepath = pathlib.Path(filename)
    name = name if name else filepath.stem
    with open(filename, "r", encoding=file_encoding) as query_file:
        raw_queries = query_file.readlines()
    # readlines() keeps the trailing newline, so a "blank" line is "\n" and still truthy -- strip before filtering
    parsed_queries = [parser.parse_query(raw) for raw in raw_queries if raw.strip()]
    return generate_workload(parsed_queries, name=name, workload_root=filepath)
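

# Illustrative sketch (not part of the original module): a batch file queries.sql containing
#
#     SELECT count(*) FROM title;
#     SELECT count(*) FROM movie_info;
#
# loads as a workload with the numeric labels 1 and 2:
#
#     wl = read_batch_workload("queries.sql")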


def read_csv_workload(
    filename: str,
    name: str = "",
    *,
    query_column: str = "query",
    label_column: Optional[str] = None,
    file_encoding: str = "utf-8",
    pd_args: Optional[dict] = None,
) -> Workload[str] | Workload[int]:
    """Loads a workload consisting of queries from a CSV column.

    All queries are expected to be contained in the same column, one query per row.

    The column containing the actual queries can be configured via the `query_column` parameter. Likewise, the
    CSV file can already provide query labels in the `label_column` column. If this parameter is omitted, labels will
    be inferred based on the row number.

    Parameters
    ----------
    filename : str
        The name of the CSV file to read. The extension does not matter, as long as the file can be read by the pandas
        CSV parser. The parser can receive additional arguments via the `pd_args` parameter.
    name : str, optional
        The name of the workload. If omitted, this defaults to the file name.
    query_column : str, optional
        The CSV column that contains the workload queries. All rows of that column will be read, by default "query"
    label_column : Optional[str], optional
        The column containing the query labels. Each query will receive the label from the `label_column` of the same
        row. If omitted, labels will be inferred based on the row number.
    file_encoding : str, optional
        The encoding of the CSV file. Defaults to UTF-8.
    pd_args : Optional[dict]
        Additional arguments to customize the behaviour of the `pandas.read_csv` method. They will be forwarded as-is.
        Consult the documentation of that method for more details on the allowed parameters and their functionality.

    Returns
    -------
    Workload[str] | Workload[int]
        The workload. It has string labels if `label_column` was provided, or numerical labels otherwise.

    See Also
    --------
    pandas.read_csv
    """
    filepath = pathlib.Path(filename)
    name = name if name else filepath.stem
    # Parenthesize the conditional: "[a] + [b] if cond else []" would drop the query column
    # entirely when no label column is given
    columns = [query_column] + ([label_column] if label_column else [])

    # Prepare the pd_args to not overwrite any of our custom parameters (and tolerate pd_args=None)
    pd_args = dict(pd_args) if pd_args else {}
    pd_args.pop("usecols", None)
    pd_args.pop("converters", None)
    pd_args.pop("encoding", None)

    workload_df = pd.read_csv(
        filename,
        usecols=columns,
        converters={query_column: parser.parse_query},
        encoding=file_encoding,
        **pd_args,
    )

    queries = workload_df[query_column].tolist()
    if label_column:
        labels = workload_df[label_column].tolist()
        label_provider = dict(zip(queries, labels))
    else:
        label_provider = None

    return generate_workload(
        queries, name=name, labels=label_provider, workload_root=filepath
    )
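

# Illustrative sketch (not part of the original module): a CSV file queries.csv containing
#
#     label,query
#     q1,SELECT count(*) FROM title
#     q2,SELECT count(*) FROM movie_info
#
# loads with string labels taken from the label column:
#
#     wl = read_csv_workload("queries.csv", label_column="label")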


def generate_workload(
    queries: Iterable[SqlQuery],
    *,
    name: str = "",
    labels: Optional[dict[SqlQuery, LabelType]] = None,
    workload_root: Optional[pathlib.Path] = None,
) -> Workload[LabelType]:
    """Wraps a number of queries in a workload object.

    The queries can receive optional labels. If no explicit labels are provided, the queries receive numerical labels
    according to their position in the `queries` iterable (counting from 1).

    The workload will be named according to the optional `name` parameter. If no name is given, it will be inferred
    from the optional `workload_root`. If that is not available either, an empty name is used.

    Parameters
    ----------
    queries : Iterable[SqlQuery]
        The queries that should form the workload. This is only enumerated a single time, hence a one-shot iterator
        that "spends" its items is fine as well.
    name : str, optional
        The name of the workload, by default ""
    labels : Optional[dict[SqlQuery, LabelType]], optional
        The labels of the workload queries. Defaults to ``None``, in which case numerical labels will be used. If
        labels are given, the label type is inferred from the dictionary values; otherwise it will be `int`.
    workload_root : Optional[pathlib.Path], optional
        The directory or file that originally contained the workload queries. Defaults to ``None`` if this is not
        known or not appropriate (e.g. for workloads that are read from a remote source)

    Returns
    -------
    Workload[LabelType]
        The workload
    """
    name = name if name else (workload_root.stem if workload_root else "")
    if not labels:
        labels: dict[SqlQuery, int] = {
            query: idx + 1 for idx, query in enumerate(queries)
        }
    workload_contents = util.dicts.invert(labels)
    return Workload(workload_contents, name, workload_root)
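

# Illustrative sketch (not part of the original module): wrapping ad-hoc queries with automatic
# numeric labels (counting from 1):
#
#     queries = [parser.parse_query("SELECT 1;"), parser.parse_query("SELECT 2;")]
#     wl = generate_workload(queries, name="adhoc")
#     wl[1]  # the first query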


def _assert_workload_loaded(
    workload: Workload[LabelType], expected_dir: str | pathlib.Path
) -> None:
    """Ensures that workload queries have been read successfully. The expected directory is used for error messages."""
    if not workload:
        raise ValueError(
            f"Could not load {workload.name} workload. "
            f"Please check {expected_dir} and make sure that it contains valid query files. "
            f"If it does, please open an issue at https://github.com/rbergm/PostBOUND to report the problem. "
            f"If there are no queries in the directory, try re-running the workload access and open an issue if the "
            f"problem persists."
        )


def job(
    *,
    flavor: Literal["default", "light", "complex"] = "default",
    file_encoding: str = "utf-8",
) -> Workload[str]:
    """Reads the Join Order Benchmark, with labels according to the original paper (e.g. *1a*, *21c*, etc.).

    Parameters
    ----------
    flavor : Literal["default", "light", "complex"], optional
        The flavor of the JOB benchmark to load. The default flavor is the original JOB benchmark. Use "light" or
        "complex" to load the respective variants.
    file_encoding : str, optional
        The encoding of the query files, by default UTF-8.

    Returns
    -------
    Workload[str]
        The workload

    See Also
    --------
    job_light
    job_complex

    References
    ----------

    .. Viktor Leis et al.: "How Good Are Query Optimizers, Really?" (Proc. VLDB Endow. 9, 3 (2015))
    """
    if flavor == "light":
        return job_light(file_encoding=file_encoding)
    elif flavor == "complex":
        return job_complex(file_encoding=file_encoding)

    workload_dir = _fetch_workload("JOB")
    # JOB only uses aliased column references, so there is no need for explicit binding
    job_workload = Workload.read(
        workload_dir, name="JOB", file_encoding=file_encoding, bind_columns=False
    )
    _assert_workload_loaded(job_workload, workload_dir)
    return job_workload
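

# Illustrative sketch (not part of the original module): the first call downloads JOB into
# $HOME/.postbound/workloads/job, later calls are served from the cache.
#
#     jb = job()
#     jb["1a"]             # query 1a from the original paper
#     jb.with_prefix("1")  # all variants of query 1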


def job_light(*, file_encoding: str = "utf-8") -> Workload[str]:
    """Reads the JOB-light benchmark, with numeric query labels (1, 2, 3, ...).

    Parameters
    ----------
    file_encoding : str, optional
        The encoding of the query files, by default UTF-8.

    Returns
    -------
    Workload[str]
        The workload

    References
    ----------

    .. Andreas Kipf et al.: "Learned Cardinalities: Estimating Correlated Joins with Deep Learning" (CIDR'2019)
    """
    workload_dir = _fetch_workload("job-light")
    # JOB-light only uses aliased column references, so there is no need for explicit binding
    job_light_workload = Workload.read(
        workload_dir, name="JOB-light", file_encoding=file_encoding, bind_columns=False
    )
    _assert_workload_loaded(job_light_workload, workload_dir)
    return job_light_workload


def job_complex(*, file_encoding: str = "utf-8") -> Workload[str]:
    """Reads the JOB-complex benchmark, with numeric query labels (1, 2, 3, ...).

    Parameters
    ----------
    file_encoding : str, optional
        The encoding of the query files, by default UTF-8.

    Returns
    -------
    Workload[str]
        The workload

    References
    ----------

    .. Johannes Wehrstein et al.: "JOB-Complex: A Challenging Benchmark for Traditional & Learned Query Optimization"
       (AIDB'2025)
    """
    workload_dir = _fetch_workload("job-complex")
    # JOB-complex only uses aliased column references, so there is no need for explicit binding
    job_complex_workload = Workload.read(
        workload_dir,
        name="JOB-complex",
        file_encoding=file_encoding,
        bind_columns=False,
    )
    _assert_workload_loaded(job_complex_workload, workload_dir)
    return job_complex_workload


def ssb(
    *, file_encoding: str = "utf-8", bind_columns: Optional[bool] = None
) -> Workload[str]:
    """Reads the Star Schema Benchmark, with labels according to the original data (e.g. *q1-1*, *q3-2*, etc.).

    Parameters
    ----------
    file_encoding : str, optional
        The encoding of the query files, by default UTF-8.
    bind_columns : Optional[bool], optional
        Whether all columns in the queries should be bound to their respective tables. Since SSB does not use
        qualified column names, these must be inferred from the database schema. Thus, an active database connection
        is required to bind the columns. By default, binding is attempted if there is an active database connection.

    Returns
    -------
    Workload[str]
        The workload

    References
    ----------

    .. Patrick E. O'Neil et al.: "The Star Schema Benchmark and Augmented Fact Table Indexing." (TPCTC'2009)
    """
    bind_columns = (
        bind_columns
        if bind_columns is not None
        else not DatabasePool.get_instance().empty()
    )
    workload_dir = _fetch_workload("ssb")
    ssb_workload = Workload.read(
        workload_dir, name="SSB", file_encoding=file_encoding, bind_columns=bind_columns
    )
    _assert_workload_loaded(ssb_workload, workload_dir)
    return ssb_workload


def stack(
    *,
    file_encoding: str = "utf-8",
    bind_columns: Optional[bool] = None,
) -> Workload[str]:
    """Reads the Stack Benchmark, as shipped with the PostBOUND repository.

    Most queries use semi-numeric labels consisting of the context and the query number, e.g., *q1/q1-001*. However,
    some queries have completely random names, such as *q16/fc8f97968b9fce81df4011c8175eada15541abe0*. Still, all
    queries are grouped into one of 16 contexts.

    Parameters
    ----------
    file_encoding : str, optional
        The encoding of the query files, by default UTF-8.
    bind_columns : Optional[bool], optional
        Whether all columns in the queries should be bound to their respective tables. Since Stack does not use
        qualified column names in some queries, these must be inferred from the database schema. Thus, an active
        database connection is required to bind the columns. By default, binding is attempted if there is an active
        database connection.

    Returns
    -------
    Workload[str]
        The workload.

    References
    ----------

    .. Ryan Marcus et al.: "Bao: Making Learned Query Optimization Practical." (SIGMOD'2021)
    """
    bind_columns = (
        bind_columns
        if bind_columns is not None
        else not DatabasePool.get_instance().empty()
    )
    workload_dir = _fetch_workload("stack")

    stack_workload = read_workload(
        workload_dir,
        "Stack",
        recurse_subdirectories=True,
        file_encoding=file_encoding,
        bind_columns=bind_columns,
    )
    _assert_workload_loaded(stack_workload, workload_dir)
    return stack_workload


def stats(*, file_encoding: str = "utf-8") -> Workload[str]:
    """Reads the Stats Benchmark, with semi-numeric query labels (e.g. *q-1*, *q-2*, etc.).

    Parameters
    ----------
    file_encoding : str, optional
        The encoding of the query files, by default UTF-8.

    Returns
    -------
    Workload[str]
        The workload

    References
    ----------

    .. Yuxing Han et al.: Cardinality Estimation in DBMS: A Comprehensive Benchmark Evaluation (Proc. VLDB Endow. 15, 4 (2022))
    """
    workload_dir = _fetch_workload("stats")
    stats_workload = Workload.read(
        workload_dir, name="Stats", file_encoding=file_encoding, bind_columns=False
    )
    _assert_workload_loaded(stats_workload, workload_dir)
    return stats_workload