PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. postbound/__init__.py +211 -0
  2. postbound/_base.py +6 -0
  3. postbound/_bench.py +1012 -0
  4. postbound/_core.py +1153 -0
  5. postbound/_hints.py +1373 -0
  6. postbound/_jointree.py +1079 -0
  7. postbound/_pipelines.py +1121 -0
  8. postbound/_qep.py +1986 -0
  9. postbound/_stages.py +876 -0
  10. postbound/_validation.py +734 -0
  11. postbound/db/__init__.py +72 -0
  12. postbound/db/_db.py +2348 -0
  13. postbound/db/_duckdb.py +785 -0
  14. postbound/db/mysql.py +1195 -0
  15. postbound/db/postgres.py +4216 -0
  16. postbound/experiments/__init__.py +12 -0
  17. postbound/experiments/analysis.py +674 -0
  18. postbound/experiments/benchmarking.py +54 -0
  19. postbound/experiments/ceb.py +877 -0
  20. postbound/experiments/interactive.py +105 -0
  21. postbound/experiments/querygen.py +334 -0
  22. postbound/experiments/workloads.py +980 -0
  23. postbound/optimizer/__init__.py +92 -0
  24. postbound/optimizer/__init__.pyi +73 -0
  25. postbound/optimizer/_cardinalities.py +369 -0
  26. postbound/optimizer/_joingraph.py +1150 -0
  27. postbound/optimizer/dynprog.py +1825 -0
  28. postbound/optimizer/enumeration.py +432 -0
  29. postbound/optimizer/native.py +539 -0
  30. postbound/optimizer/noopt.py +54 -0
  31. postbound/optimizer/presets.py +147 -0
  32. postbound/optimizer/randomized.py +650 -0
  33. postbound/optimizer/tonic.py +1479 -0
  34. postbound/optimizer/ues.py +1607 -0
  35. postbound/qal/__init__.py +343 -0
  36. postbound/qal/_qal.py +9678 -0
  37. postbound/qal/formatter.py +1089 -0
  38. postbound/qal/parser.py +2344 -0
  39. postbound/qal/relalg.py +4257 -0
  40. postbound/qal/transform.py +2184 -0
  41. postbound/shortcuts.py +70 -0
  42. postbound/util/__init__.py +46 -0
  43. postbound/util/_errors.py +33 -0
  44. postbound/util/collections.py +490 -0
  45. postbound/util/dataframe.py +71 -0
  46. postbound/util/dicts.py +330 -0
  47. postbound/util/jsonize.py +68 -0
  48. postbound/util/logging.py +106 -0
  49. postbound/util/misc.py +168 -0
  50. postbound/util/networkx.py +401 -0
  51. postbound/util/numbers.py +438 -0
  52. postbound/util/proc.py +107 -0
  53. postbound/util/stats.py +37 -0
  54. postbound/util/system.py +48 -0
  55. postbound/util/typing.py +35 -0
  56. postbound/vis/__init__.py +5 -0
  57. postbound/vis/fdl.py +69 -0
  58. postbound/vis/graphs.py +48 -0
  59. postbound/vis/optimizer.py +538 -0
  60. postbound/vis/plots.py +84 -0
  61. postbound/vis/tonic.py +70 -0
  62. postbound/vis/trees.py +105 -0
  63. postbound-0.19.0.dist-info/METADATA +355 -0
  64. postbound-0.19.0.dist-info/RECORD +67 -0
  65. postbound-0.19.0.dist-info/WHEEL +5 -0
  66. postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
  67. postbound-0.19.0.dist-info/top_level.txt +1 -0
postbound/_bench.py ADDED
@@ -0,0 +1,1012 @@
1
+ """Utilities to optimize and execute queries and workloads in a reproducible and transparent manner."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+ import warnings
7
+ from collections.abc import Callable, Iterable
8
+ from dataclasses import dataclass
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import Any, Literal, Optional
12
+
13
+ import natsort
14
+ import numpy as np
15
+ import pandas as pd
16
+
17
+ from . import util
18
+ from ._pipelines import (
19
+ IntegratedOptimizationPipeline,
20
+ MultiStageOptimizationPipeline,
21
+ OptimizationPipeline,
22
+ TextBookOptimizationPipeline,
23
+ )
24
+ from ._stages import (
25
+ CompleteOptimizationAlgorithm,
26
+ CostModel,
27
+ JoinOrderOptimization,
28
+ OptimizationStage,
29
+ ParameterGeneration,
30
+ PhysicalOperatorSelection,
31
+ PlanEnumerator,
32
+ )
33
+ from .db._db import (
34
+ Database,
35
+ DatabasePool,
36
+ PrewarmingSupport,
37
+ StopwatchSupport,
38
+ TimeoutSupport,
39
+ simplify_result_set,
40
+ )
41
+ from .experiments.workloads import Workload, generate_workload
42
+ from .qal import transform
43
+ from .qal._qal import Explain, SqlQuery
44
+ from .util.jsonize import Jsonizable
45
+
46
+ PredefLogger = Literal["tqdm"]
47
+ """Pre-defined loggers that can be used to track progress during workload execution."""
48
+
49
+ ErrorHandling = Literal["raise", "log", "ignore"]
50
+ """How to handle errors during optimization or execution:
51
+
52
+ - *raise*: Raise the exception immediately
53
+ - *log*: Include the failed query in the resulting data frame, just like successful queries. The *status* column will indicate
54
+ the specific error and the *failure_reason* column will contain the exception message.
55
+ - *ignore*: Silently ignore the error and do not include the failed query in the resulting data frame
56
+ """
57
+
58
+ ExecStatus = Literal["ok", "timeout", "optimization-error", "execution-error"]
59
+ """Describes the result of a query execution:
60
+
61
+ - *ok*: The query was executed successfully
62
+ - *timeout*: The query was cancelled due to a timeout
63
+ - *optimization-error*: The query could not be optimized by PostBOUND
64
+ - *execution-error*: The query could not be executed by the database system
65
+
66
+ For errors, the actual reason is contained in the `failure_reason` column of the resulting data frame.
67
+ """
68
+
69
+
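As a small illustration of how these status values surface downstream, the following sketch filters a results frame as produced by `execute_workload` further below (the `results` variable is an assumption, not something this module provides):

    # Hypothetical post-processing of a results frame from execute_workload()
    failed = results[results["status"] != "ok"]
    for _, row in failed.iterrows():
        print(f"{row['label']}: {row['status']} ({row['failure_reason']})")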
70
+ @dataclass
71
+ class ExecutionResult:
72
+ """Captures all relevant components of a query optimization and execution result."""
73
+
74
+ query: SqlQuery
75
+ """The query that was executed. If the query was optimized and transformed, these modifications are included."""
76
+
77
+ status: ExecStatus = "ok"
78
+ """Whether the query was executed successfully or not."""
79
+
80
+ query_result: object = None
81
+ """The result set of the query or *None* if the query failed."""
82
+
83
+ optimization_time: float = np.nan
84
+ """The time in seconds it took to optimized the query by PostBOUND.
85
+
86
+ This does not account for optimization by the actual database system and depends heavily on the quality of the
87
+ implementation of the optimization strategies.
88
+
89
+ For queries that were not optimized within PostBOUND, this value is *NaN*.
90
+ """
91
+
92
+ execution_time: float = np.nan
93
+ """The time in seconds it took to execute the (potentially optimized) query by the actual database system.
94
+
95
+ This execution time includes the entire end-to-end processing, i.e. starting with supplying the query to the
96
+ database until the last byte of the result set was transferred back to PostBOUND. Therefore, this duration also
97
+ includes the optimization time by the database system, as well as the entire time for data transfer.
98
+
99
+ A value of *Inf* indicates that the query did not complete successfully and was cancelled due to a timeout. *NaN* encodes
100
+ a failure during optimization or execution. See `status` for more details.
101
+ """
102
+
103
+ @staticmethod
104
+ def passed(
105
+ query: SqlQuery,
106
+ *,
107
+ query_result: object,
108
+ execution_time: float,
109
+ optimization_time: float = np.nan,
110
+ ) -> ExecutionResult:
111
+ """Constructs an `ExecutionResult` for a successfully executed query.
112
+
113
+ The optimization time can be omitted if the query was not optimized in PostBOUND.
114
+ """
115
+ return ExecutionResult(
116
+ query=query,
117
+ status="ok",
118
+ query_result=query_result,
119
+ execution_time=execution_time,
120
+ optimization_time=optimization_time,
121
+ )
122
+
123
+ @staticmethod
124
+ def execution_error(
125
+ query: SqlQuery, *, optimization_time: float = np.nan
126
+ ) -> ExecutionResult:
127
+ """Constructs an `ExecutionResult` for a query that failed during execution."""
128
+ return ExecutionResult(
129
+ query=query,
130
+ status="execution-error",
131
+ query_result=None,
132
+ execution_time=np.nan,
133
+ optimization_time=optimization_time,
134
+ )
135
+
136
+ @staticmethod
137
+ def optimization_error(query: SqlQuery) -> ExecutionResult:
+ """Constructs an `ExecutionResult` for a query that failed during optimization."""
138
+ return ExecutionResult(
139
+ query=query,
140
+ status="optimization-error",
141
+ query_result=None,
142
+ execution_time=np.nan,
143
+ optimization_time=np.nan,
144
+ )
145
+
146
+
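A quick usage sketch for the convenience constructors above; `some_query` and all numbers are made-up placeholders:

    # Hypothetical values, only to show the constructor API.
    ok_result = ExecutionResult.passed(
        some_query,              # an SqlQuery obtained elsewhere
        query_result=42,         # e.g. the scalar result of a COUNT(*) query
        execution_time=1.37,     # seconds
        optimization_time=0.05,  # seconds spent inside PostBOUND
    )
    failed_result = ExecutionResult.optimization_error(some_query)
    assert failed_result.status == "optimization-error"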
147
+ class QueryPreparation:
148
+ """This service handles transformations of input queries that are executed before running the query.
149
+
150
+ These transformations mostly ensure that all queries in a workload provide the same type of result even in the face
151
+ of input queries that are structured slightly differently. For example, the preparation service can transform
152
+ all the queries to be executed as *EXPLAIN* or *COUNT(\\*)* queries. Furthermore, the preparation service can
153
+ store SQL statements that have to be executed before running the query. For example, a statement that disables
154
+ parallel execution could be supplied here.
155
+
156
+ Parameters
157
+ ----------
158
+ explain : bool, optional
159
+ Whether to force all queries to be executed as *EXPLAIN* queries, by default *False*
160
+ count_star : bool, optional
161
+ Whether to force all queries to be executed as *COUNT(\\*)* queries, overwriting their default projection. Defaults to
162
+ *False*
163
+ analyze : bool, optional
164
+ Whether to force all queries to be executed as ``EXPLAIN ANALYZE`` queries. Setting this option implies `explain`,
165
+ which therefore does not need to be set manually. Defaults to *False*
166
+ prewarm : bool, optional
167
+ For database systems that support prewarming, this populates the buffer pool with pages of the tables referenced by the prepared query. Defaults to *False*
168
+ preparatory_statements : Optional[list[str]], optional
169
+ Statements that are executed as-is on the database connection before running the query, by default *None*
170
+
171
+ See Also
172
+ --------
173
+ db.PrewarmingSupport : Technical details on how prewarming is implemented in PostBOUND
174
+ """
175
+
176
+ def __init__(
177
+ self,
178
+ *,
179
+ explain: bool = False,
180
+ count_star: bool = False,
181
+ analyze: bool = False,
182
+ prewarm: bool = False,
183
+ preparatory_statements: Optional[list[str]] = None,
184
+ ) -> None:
185
+ self.explain = explain
186
+ self.analyze = analyze
187
+ self.count_star = count_star
188
+ self.preparatory_stmts = (
189
+ preparatory_statements if preparatory_statements else []
190
+ )
191
+
192
+ if explain and not analyze:
193
+ if prewarm:
194
+ warnings.warn(
195
+ "Ignoring prewarm setting since queries are only explained. Set prewarm manually to overwrite."
196
+ )
197
+ self.prewarm = False
198
+ else:
199
+ self.prewarm = prewarm
200
+
201
+ def prepare_query(self, query: SqlQuery, *, on: Database) -> SqlQuery:
202
+ """Applies the selected transformations to the given input query and executes the preparatory statements
203
+
204
+ Parameters
205
+ ----------
206
+ query : SqlQuery
207
+ The query to prepare
208
+ on : Database
209
+ The database to execute the preparatory statements on
210
+
211
+ Returns
212
+ -------
213
+ SqlQuery
214
+ The prepared query
215
+ """
216
+ if self.analyze:
217
+ query = transform.as_explain(query, Explain.explain_analyze())
218
+ elif self.explain:
219
+ query = transform.as_explain(query, Explain.plan())
220
+
221
+ if self.count_star:
222
+ query = transform.as_count_star_query(query)
223
+
224
+ if self.prewarm:
225
+ if not isinstance(on, PrewarmingSupport):
226
+ warnings.warn(
227
+ "Ignoring prewarm setting since the database does not support prewarming"
228
+ )
229
+ else:
230
+ on.prewarm_tables(query.tables())
231
+
232
+ for stmt in self.preparatory_stmts:
233
+ on.execute_query(stmt, cache_enabled=False)
234
+
235
+ return query
236
+
237
+ def __json__(self) -> util.jsondict:
238
+ return {
239
+ "explain": self.explain,
240
+ "analyze": self.analyze,
241
+ "count_star": self.count_star,
242
+ "prewarm": self.prewarm,
243
+ "preparatory_statements": self.preparatory_stmts,
244
+ }
245
+
246
+
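A minimal configuration sketch; the preparatory statement is just an illustrative Postgres setting and not mandated by this class:

    # Run every query as EXPLAIN ANALYZE on a prewarmed buffer pool, without parallel workers.
    prep = QueryPreparation(
        analyze=True,
        prewarm=True,
        preparatory_statements=["SET max_parallel_workers_per_gather = 0;"],
    )
    # prepare_query() then applies the transformations and runs the statement, e.g.:
    # prepared = prep.prepare_query(query, on=target_db)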
247
+ def _wrap_workload(
248
+ queries: Iterable[SqlQuery] | Workload,
249
+ ) -> Workload:
250
+ """Transforms an iterable of queries into a proper workload object to enable execution by the runner methods."""
251
+ return queries if isinstance(queries, Workload) else generate_workload(queries)
252
+
253
+
254
+ def _wrap_optimization_stage(stage: OptimizationStage) -> OptimizationPipeline:
255
+ """Create an appropriate optimization pipeline for a specific optimization algorithm."""
256
+ target_db = DatabasePool.get_instance().current_database()
257
+ match stage:
258
+ case CompleteOptimizationAlgorithm():
259
+ pipeline = IntegratedOptimizationPipeline(target_db)
260
+ pipeline.setup_optimization_algorithm(stage).build()
261
+ case (
262
+ JoinOrderOptimization()
263
+ | PhysicalOperatorSelection()
264
+ | ParameterGeneration()
265
+ ):
266
+ pipeline = MultiStageOptimizationPipeline(target_db)
267
+ pipeline.use(stage).build()
268
+ case PlanEnumerator() | CostModel():
269
+ # We don't check for CardinalityEstimator here b/c every cardest is also a ParameterGeneration instance
270
+ # and if we don't have a plan enumerator or a cost model (as is the case here), the parameter generation is the
271
+ # much more well-suited stage to wrap
272
+ pipeline = TextBookOptimizationPipeline(target_db)
273
+ pipeline.use(stage).build()
274
+ case _:
275
+ raise TypeError(f"Unsupported optimization stage: {stage}")
276
+ return pipeline
277
+
278
+
279
+ ExecutionTarget = Database | OptimizationPipeline | OptimizationStage
280
+ """Specifies what to do with the workload queries:
281
+
282
+ - providing a `Database` executes the queries as-is on the database
283
+ - passing an `OptimizationPipeline` optimizes the queries using the pipeline before executing them on the target database
284
+ of the pipeline
285
+ - passing an `OptimizationStage` generates an appropriate optimization pipeline for the stage and then proceeds as above.
286
+ Notice that in this mode, the target database is assumed to be the current database of the `DatabasePool`.
287
+
288
+ """
289
+
290
+
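To make the three flavors concrete, here is a sketch using `execute_workload` (defined further below); `queries`, `target_db`, `pipeline`, and `join_order_stage` are placeholders:

    execute_workload(queries, on=target_db)         # Database: run the queries as-is
    execute_workload(queries, on=pipeline)          # OptimizationPipeline: optimize first
    execute_workload(queries, on=join_order_stage)  # OptimizationStage: wrapped into a pipeline,
                                                    # target is the DatabasePool's current database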
291
+ @dataclass
292
+ class _SuccessfullOptimization:
293
+ optimized_query: SqlQuery
294
+ optimization_time: float
295
+
296
+
297
+ @dataclass
298
+ class _FailedOptimization:
299
+ error: Exception
300
+
301
+
302
+ @dataclass
303
+ class _NoOptimization:
304
+ pass
305
+
306
+
307
+ _InternalOptResult = _SuccessfullOptimization | _FailedOptimization | _NoOptimization
308
+
309
+
310
+ def _optimize_query(
311
+ query: SqlQuery, *, pipeline: Optional[OptimizationPipeline] = None
312
+ ) -> _InternalOptResult:
313
+ """Tries to run a query through the optimization pipeline while gracefully handling errors."""
314
+ if pipeline is None:
315
+ return _NoOptimization()
316
+
317
+ try:
318
+ opt_start = time.perf_counter_ns()
319
+ optimized_query = pipeline.optimize_query(query)
320
+ opt_end = time.perf_counter_ns()
321
+ optimization_time = (opt_end - opt_start) / 10**9 # convert to seconds
322
+ return _SuccessfullOptimization(
323
+ optimized_query=optimized_query, optimization_time=optimization_time
324
+ )
325
+ except Exception as e:
326
+ return _FailedOptimization(error=e)
327
+
328
+
329
+ @dataclass
330
+ class _SuccessfullExecution:
331
+ query_result: Any
332
+ exec_time: float
333
+
334
+
335
+ @dataclass
336
+ class _TimeoutExecution:
337
+ timeout: float
338
+
339
+
340
+ @dataclass
341
+ class _FailedExecution:
342
+ error: Exception
343
+
344
+
345
+ _InternalExecResult = _SuccessfullExecution | _TimeoutExecution | _FailedExecution
346
+
347
+
348
+ def _execute_query(
349
+ query: SqlQuery,
350
+ *,
351
+ on: Database,
352
+ timeout: Optional[float] = None,
353
+ query_prep: Optional[QueryPreparation] = None,
354
+ ) -> _InternalExecResult:
355
+ """Prepares and executes a query on an actual database system while gracefully handling timeouts and errors.
356
+
357
+ This is a simple handler that does not care about the larger control flow of the benchmarking process. It simply executes
358
+ the query and lets the higher-level control loops deal with the rest (e.g. proper error handling).
359
+ """
360
+ if timeout and not isinstance(on, TimeoutSupport):
361
+ raise ValueError(f"Database system {on} does not provide timeout support")
362
+
363
+ if query_prep:
364
+ query = query_prep.prepare_query(query, on=on)
365
+
366
+ try:
367
+ if timeout:
368
+ exec_start = time.perf_counter_ns()
369
+ raw_result = on.execute_with_timeout(query, timeout=timeout)
370
+ exec_end = time.perf_counter_ns()
371
+ if raw_result is None:
372
+ return _TimeoutExecution(timeout=timeout)
373
+ query_result = simplify_result_set(raw_result)
374
+ else:
375
+ exec_start = time.perf_counter_ns()
376
+ query_result = on.execute_query(query, cache_enabled=False, raw=False)
377
+ exec_end = time.perf_counter_ns()
378
+ exec_time = (exec_end - exec_start) / 10**9 # convert to seconds
379
+
380
+ if isinstance(on, StopwatchSupport):
381
+ exec_time = on.last_query_runtime()
382
+
383
+ except Exception as e:
384
+ return _FailedExecution(error=e)
385
+
386
+ return _SuccessfullExecution(query_result=query_result, exec_time=exec_time)
387
+
388
+
389
+ class _NoOpLogger:
390
+ def next_workload_iter(self) -> None:
391
+ pass
392
+
393
+ def next_query(self, label: str) -> None:
394
+ pass
395
+
396
+ def next_query_rep(self) -> None:
397
+ pass
398
+
399
+
400
+ class _CustomLogger:
401
+ def __init__(
402
+ self, logger: Callable[[str], None], *, workload_reps: int = 1
403
+ ) -> None:
404
+ self._logger = logger
405
+ self._workload_reps = workload_reps
406
+ self._workload_iter: int = 0
407
+
408
+ def next_workload_iter(self) -> None:
409
+ self._workload_iter += 1
410
+
411
+ def next_query(self, label: str) -> None:
412
+ log_msg = f"Now benchmarking query {label} (repetition {self._workload_iter}/{self._workload_reps})"
413
+ self._logger(log_msg)
414
+
415
+ def next_query_rep(self) -> None:
416
+ pass
417
+
418
+
419
+ class _TqdmLogger:
420
+ def __init__(
421
+ self, *, workload_reps: int = 1, query_reps: int = 1, total_queries: int
422
+ ) -> None:
423
+ from tqdm import tqdm
424
+
425
+ self._rep_progress = tqdm(total=workload_reps, desc="Workload Rep.", unit="rep")
426
+ self._query_progress = tqdm(total=total_queries, desc="Query", unit="q")
427
+ self._query_rep = tqdm(total=query_reps, desc="Query Rep.", unit="rep")
428
+ self._initial: bool = True
429
+
430
+ def next_workload_iter(self) -> None:
431
+ if self._initial:
432
+ self._initial = False
433
+ return
434
+ self._rep_progress.update(1)
435
+ self._query_progress.reset()
436
+
437
+ def next_query(self, label: str) -> None:
438
+ self._query_progress.update(1)
439
+ self._query_rep.set_description(f"Query {label}")
440
+ self._query_rep.reset()
441
+
442
+ def next_query_rep(self) -> None:
443
+ self._query_rep.update(1)
444
+
445
+
446
+ _LoggerImpl = _NoOpLogger | _CustomLogger | _TqdmLogger
447
+
448
+
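For reference, a sketch of a custom logger callable as accepted by the `logger` parameter of `execute_workload`:

    def print_logger(msg: str) -> None:
        # Receives one message per query, e.g. "Now benchmarking query 1a (repetition 1/3)"
        print("[bench]", msg)

    # results = execute_workload(queries, on=target_db, logger=print_logger)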
449
+ class _ResultSample:
450
+ """A result sample corresponds to all executions of a single query within one workload repetition.
451
+
452
+ It captures all relevant data (measurements or other artifacts) related to the optimization and execution of the query.
453
+ Each sample should be generated by the `_ExecutionResults` "manager" which keeps track of most of the global state of the
454
+ benchmarking process.
455
+ """
456
+
457
+ def __init__(
458
+ self,
459
+ *,
460
+ query: SqlQuery,
461
+ label: str,
462
+ initial_exec_idx: int,
463
+ current_workload_rep: int,
464
+ query_prep: QueryPreparation | None,
465
+ max_query_reps: int,
466
+ ) -> None:
467
+ # static data
468
+ self.label = label
469
+ self.query = query
470
+ self.current_workload_rep = current_workload_rep
471
+ self.query_prep = query_prep
472
+
473
+ # collected data
474
+ self.timestamps: list[datetime] = []
475
+ self.status: list[str] = []
476
+ self.optimization_time: float = np.nan
477
+ self.optimization_pipeline: OptimizationPipeline | None = None
478
+ self.optimized_query: SqlQuery | None = None
479
+ self.failure_reasons: list[str] = []
480
+ self.result_sets: list[object] = []
481
+ self.exec_times: list[float] = []
482
+ self.db_configs: list[dict] = []
483
+
484
+ # internal fields
485
+ self._max_query_reps = max_query_reps
486
+ self._optimization_failure: Exception | None = None
487
+ self._initial_idx = initial_exec_idx
488
+
489
+ def optimization_results(
490
+ self,
491
+ *,
492
+ optimized_query: SqlQuery,
493
+ pipeline: OptimizationPipeline,
494
+ optimization_time: float,
495
+ ) -> None:
496
+ self.optimization_time = optimization_time
497
+ self.optimization_pipeline = pipeline
498
+ self.optimized_query = optimized_query
499
+
500
+ def optimization_failure(
501
+ self, reason: Exception, *, pipeline: OptimizationPipeline
502
+ ) -> None:
503
+ self._optimization_failure = reason
504
+ self.optimization_pipeline = pipeline
505
+ self.timestamps += [None] * self._max_query_reps
506
+ self.status += ["optimization-error"] * self._max_query_reps
507
+ self.failure_reasons += [str(reason)] * self._max_query_reps
508
+ self.result_sets += [None] * self._max_query_reps
509
+ self.exec_times += [np.nan] * self._max_query_reps
510
+ self.db_configs += [{}] * self._max_query_reps
511
+
512
+ def start_execution(self) -> None:
513
+ self.timestamps.append(datetime.now())
514
+
515
+ def add_exec_sample(
516
+ self, result_set: object, *, exec_time: float, db_config: dict
517
+ ) -> None:
518
+ self.status.append("ok")
519
+ self.failure_reasons.append("")
520
+ self.result_sets.append(result_set)
521
+ self.exec_times.append(exec_time)
522
+ self.db_configs.append(db_config)
523
+
524
+ def add_exec_timeout(self, timeout: float, *, db_config: dict) -> None:
525
+ self.status.append("timeout")
526
+ self.failure_reasons.append("")
527
+ self.result_sets.append(None)
528
+ self.exec_times.append(timeout)
529
+ self.db_configs.append(db_config)
530
+
531
+ def add_exec_failure(self, reason: Exception, *, db_config: dict) -> None:
532
+ self.status.append("execution-error")
533
+ self.failure_reasons.append(str(reason))
534
+ self.result_sets.append(None)
535
+ self.exec_times.append(np.nan)
536
+ self.db_configs.append(db_config)
537
+
538
+ def failed_optimization(self) -> Optional[Exception]:
539
+ return self._optimization_failure
540
+
541
+ def num_executions(self) -> int:
542
+ return len(self.result_sets)
543
+
544
+ def last_successful(self) -> bool:
545
+ return bool(self.status) and self.status[-1] in ("ok", "timeout")
546
+
547
+ def last_result(self) -> Optional[ExecutionResult]:
548
+ if self._optimization_failure or not self.result_sets:
549
+ return None
550
+
551
+ return ExecutionResult(
552
+ query=self.query,
553
+ status=self.status[-1],
554
+ query_result=self.result_sets[-1],
555
+ optimization_time=self.optimization_time,
556
+ execution_time=self.exec_times[-1],
557
+ )
558
+
559
+ def to_df(self, *, only_last: bool = False) -> pd.DataFrame:
560
+ n_samples = len(self)
561
+ if not n_samples:
562
+ return pd.DataFrame()
563
+
564
+ if only_last:
565
+ rows = {
566
+ "exec_index": [self._initial_idx + n_samples - 1],
567
+ "label": [self.label],
568
+ "timestamp": [self.timestamps[-1]],
569
+ "workload_repetition": [self.current_workload_rep],
570
+ "query_repetition": [n_samples],
571
+ "query": [self.query],
572
+ "status": [self.status[-1]],
573
+ "query_result": [self.result_sets[-1]],
574
+ "exec_time": [self.exec_times[-1]],
575
+ "failure_reason": [self.failure_reasons[-1]],
576
+ "db_config": [self.db_configs[-1]],
577
+ "query_preparation": [self.query_prep],
578
+ "optimization_time": [self.optimization_time],
579
+ "optimization_pipeline": [self.optimization_pipeline],
580
+ "optimized_query": [self.optimized_query],
581
+ }
582
+ else:
583
+ rows = {
584
+ "exec_index": [self._initial_idx + i for i in range(n_samples)],
585
+ "label": [self.label] * n_samples,
586
+ "timestamp": self.timestamps,
587
+ "workload_repetition": [self.current_workload_rep] * n_samples,
588
+ "query_repetition": [i + 1 for i in range(n_samples)],
589
+ "query": [self.query] * n_samples,
590
+ "status": self.status,
591
+ "query_result": self.result_sets,
592
+ "exec_time": self.exec_times,
593
+ "failure_reason": self.failure_reasons,
594
+ "db_config": self.db_configs,
595
+ "query_preparation": [self.query_prep] * n_samples,
596
+ "optimization_time": [self.optimization_time] * n_samples,
597
+ "optimization_pipeline": [self.optimization_pipeline] * n_samples,
598
+ "optimized_query": [self.optimized_query] * n_samples,
599
+ }
600
+
601
+ return pd.DataFrame(rows)
602
+
603
+ def write_progressive(self, file: Path | None) -> None:
604
+ if file is None:
605
+ return
606
+
607
+ df = self.to_df(only_last=True)
608
+ df = prepare_export(df)
609
+ if file.is_file():
610
+ df.to_csv(file, mode="a", header=False, index=False)
611
+ return
612
+
613
+ # We are the first sample to write to the output file. Create it with headers
614
+ file.parent.mkdir(parents=True, exist_ok=True)
615
+ df.to_csv(file, index=False)
616
+
617
+ def __len__(self) -> int:
618
+ return self.num_executions()
619
+
620
+ def __str__(self) -> str:
621
+ reps = self.num_executions()
622
+ return f"{self.label} ({reps} / {self._max_query_reps})"
623
+
624
+
625
+ class _ExecutionResults:
626
+ """
627
+ Execution results act as the central container for all result samples that are produced during the benchmarking process.
628
+
629
+ It is responsible for creating the result samples for new queries. The benchmarking control loops must keep the results
630
+ up to date in the sense that they must notify it about new workload repetitions or the start of new queries.
631
+ """
632
+
633
+ def __init__(self, *, query_reps: int, query_prep: QueryPreparation | None) -> None:
634
+ self.query_reps = query_reps
635
+ self.query_prep = query_prep
636
+
637
+ self._execution_counter = 1
638
+ self._workload_rep = 0
639
+ self._samples: list[_ResultSample] = []
640
+
641
+ def next_workload_repetition(self) -> None:
642
+ self._workload_rep += 1
643
+
644
+ def next_query(self, query: SqlQuery, *, label: str) -> _ResultSample:
645
+ if self._samples:
646
+ last_sample = self._samples[-1]
647
+ self._execution_counter += last_sample.num_executions()
648
+
649
+ sample = _ResultSample(
650
+ query=query,
651
+ label=label,
652
+ initial_exec_idx=self._execution_counter,
653
+ current_workload_rep=self._workload_rep,
654
+ query_prep=self.query_prep,
655
+ max_query_reps=self.query_reps,
656
+ )
657
+ self._samples.append(sample)
658
+ return sample
659
+
660
+ def scratch_last_sample(self) -> None:
661
+ """Removes the latest sample from the results."""
662
+ if not self._samples:
663
+ raise ValueError("No samples to scratch")
664
+ self._samples.pop()
665
+
666
+ def to_df(self) -> pd.DataFrame:
667
+ """Provides all results as a single data frame. This is the final output of the benchmarking process."""
668
+ samples = [sample.to_df() for sample in self._samples]
669
+ return pd.concat(samples, ignore_index=True)
670
+
671
+
672
+ @dataclass
673
+ class _BenchmarkConfig:
674
+ target_db: Database
675
+ optimizer: OptimizationPipeline | None
676
+ output: Path | None
677
+ per_query_repetitions: int
678
+ timeout: float | None
679
+ query_prep: QueryPreparation | None
680
+ exec_callback: Callable[[ExecutionResult], None] | None
681
+ log: _LoggerImpl
682
+ error_action: ErrorHandling
683
+
684
+
685
+ def _exec_ctl_loop(
686
+ query: SqlQuery,
687
+ *,
688
+ sample: _ResultSample,
689
+ cfg: _BenchmarkConfig,
690
+ ) -> None:
691
+ """
692
+ The execution control loop handles the execution of a single query and records its results as specified by the benchmarking
693
+ config. This includes logging, error handling, callbacks and progressive output.
694
+
695
+ Note that per-query repetitions need to be handled by the main workload control loop.
696
+ """
697
+ cfg.log.next_query_rep()
698
+
699
+ db_config = cfg.target_db.describe()
700
+ sample.start_execution()
701
+ match _execute_query(
702
+ query, on=cfg.target_db, timeout=cfg.timeout, query_prep=cfg.query_prep
703
+ ):
704
+ case _SuccessfullExecution(result_set, exec_time):
705
+ sample.add_exec_sample(result_set, exec_time=exec_time, db_config=db_config)
706
+ case _TimeoutExecution(exec_time):
707
+ sample.add_exec_timeout(exec_time, db_config=db_config)
708
+ case _FailedExecution(err) if cfg.error_action == "log":
709
+ sample.add_exec_failure(err, db_config=db_config)
710
+ case _FailedExecution(err) if cfg.error_action == "raise":
711
+ raise err
712
+ case _FailedExecution(_) if cfg.error_action == "ignore":
713
+ pass
714
+ case _FailedExecution(_):
715
+ raise ValueError(f"Unknown error action: {cfg.error_action}")
716
+ case _ as other:
717
+ raise RuntimeError(f"Unhandled execution result: {other}")
718
+
719
+ if cfg.output and sample.last_successful():
720
+ sample.write_progressive(cfg.output)
721
+
722
+ if cfg.exec_callback and sample.last_successful():
723
+ cfg.exec_callback(sample.last_result())
724
+
725
+
726
+ def _workload_ctl_loop(
727
+ queries: Workload, *, results: _ExecutionResults, cfg: _BenchmarkConfig
728
+ ) -> None:
729
+ """
730
+ The workload control loop handles the query optimization for each query in the workload and sets up the _exec_ctl_loop for
731
+ the actual execution.
732
+ """
733
+ for label, query in queries.entries():
734
+ cfg.log.next_query(label)
735
+ sample = results.next_query(query, label=label)
736
+
737
+ match _optimize_query(query, pipeline=cfg.optimizer):
738
+ case _SuccessfullOptimization(optimized, opt_time):
739
+ query = optimized
740
+ sample.optimization_results(
741
+ optimized_query=optimized,
742
+ pipeline=cfg.optimizer,
743
+ optimization_time=opt_time,
744
+ )
745
+ case _NoOptimization():
746
+ pass
747
+ case _FailedOptimization(err):
748
+ sample.optimization_failure(reason=err, pipeline=cfg.optimizer)
749
+
750
+ if sample.failed_optimization():
751
+ match cfg.error_action:
752
+ case "log" if cfg.output:
753
+ sample.write_progressive(cfg.output)
754
+ case "log" if not cfg.output:
755
+ # we handle the output to CSV as part of the normal result export, no need to do anything here
756
+ pass
757
+ case "raise":
758
+ raise sample.failed_optimization()
759
+ case "ignore":
760
+ results.scratch_last_sample()
761
+ continue
762
+
763
+ for _ in range(cfg.per_query_repetitions):
764
+ _exec_ctl_loop(
765
+ query,
766
+ sample=sample,
767
+ cfg=cfg,
768
+ )
769
+
770
+
771
+ def execute_workload(
772
+ queries: Iterable[SqlQuery] | Workload,
773
+ on: ExecutionTarget,
774
+ *,
775
+ workload_repetitions: int = 1,
776
+ per_query_repetitions: int = 1,
777
+ shuffled: bool = False,
778
+ query_preparation: Optional[QueryPreparation | dict] = None,
779
+ timeout: Optional[float] = None,
780
+ exec_callback: Optional[Callable[[ExecutionResult], None]] = None,
781
+ repetition_callback: Optional[Callable[[int], None]] = None,
782
+ progressive_output: Optional[str | Path] = None,
783
+ logger: Optional[Callable[[str], None] | PredefLogger] = None,
784
+ error_action: ErrorHandling = "log",
785
+ ) -> pd.DataFrame:
786
+ """Simple benchmarking interface.
787
+
788
+ This function runs a query workload on a database system and measures the execution time of each query. All workload
789
+ queries can be optimized through an `OptimizationPipeline`.
790
+
791
+ Parameters
792
+ ----------
793
+ queries : Iterable[SqlQuery] | Workload
794
+ The queries to be executed.
795
+ on : ExecutionTarget
796
+ This is a catch-all parameter to specify the database system to execute the queries on, as well as the (optional)
797
+ pipeline to optimize the queries. If a pipeline is provided, all queries are first passed through the pipeline before
798
+ executing them on the pipeline's target database. It is even possible to provide a single optimization stage, in which
799
+ case the stage is first expanded into a full optimization pipeline.
800
+ workload_repetitions : int, optional
801
+ The number of times the entire workload should be repeated. By default, the workload is only executed once.
802
+ per_query_repetitions : int, optional
803
+ The number of times each query should be repeated within each workload repetition. The per-query repetitions happen
804
+ sequentially one after another before transitioning to the next query. By default each query is only executed once.
805
+ shuffled : bool, optional
806
+ Whether to randomize the execution order of each query within the workload. Shuffling is applied before each workload
807
+ repetition. Per query repetitions are *not* influenced by this setting.
808
+ query_preparation : Optional[QueryPreparation | dict], optional
809
+ Preparation steps that should be performed before running the query. The preparation result will be used in place of
810
+ the original query for all repetitions. If a dictionary is passed, all keys are assumed to be valid parameters to the
811
+ `QueryPreparation` constructor.
812
+ timeout : Optional[float], optional
813
+ The maximum time in seconds that the query is allowed to run. If the query exceeds this time, the execution is
814
+ cancelled and the execution time is set to *Inf*. If this parameter is omitted, no timeout is enforced. Notice that
815
+ timeouts require the database to implement `TimeoutSupport`.
816
+ progressive_output : Optional[str | Path], optional
817
+ If provided, results will be written to this file as soon as they are obtained. If the file already exists, it
818
+ will be appended to. The file is assumed to be a CSV file.
819
+ logger : Optional[Callable[[str], None] | PredefLogger], optional
820
+ Configures how progress should be logged. Depending on the specific argument, a number of different strategies are
821
+ available:
822
+
823
+ - passing *None* (the default) disables logging
824
+ - passing a callable invokes the function before every query execution. It receives information about the current
825
+ execution as argument
826
+ - referencing a pre-defined logger by name selects a built-in implementation. Currently, only *tqdm* is supported. It uses the corresponding library to
827
+ print a progress bar
828
+
829
+ Returns
830
+ -------
831
+ pd.DataFrame
832
+ The execution results for the input workload. The data frame will be structured as follows:
833
+
834
+ - the data frame will contain one row per query repetition
835
+ - *exec_index* contains an absolute index indicating when the query was executed
836
+ - *timestamp* is the time when the query execution started
837
+ - *label* is an identifier of the current query, usually inferred from the `Workload` object
838
+ - *workload_repetition* indicates the current workload repetition
839
+ - *query_repetition* indicates the current per-query repetition (in contrast to repetitions of the entire workload)
840
+ - *query* contains the input query being executed. If the query was optimized or prepared, these modifications are
841
+ **not** included here
842
+ - *status* indicates whether the query was executed successfully, or whether an error occurred during execution.
843
+ Possible values are "ok", "timeout", and "execution-error"
844
+ - *query_result* is the actual result of the query. Scalar results are represented as-is. In case of an error this will
845
+ be *None*
846
+ - *exec_time* contains the time it took to execute the query (in seconds). This includes the entire time from
847
+ sending the query to the database until the last byte of the result set has been transferred back to PostBOUND.
848
+ In case of an error this will be *NaN* and for timeouts this will be the timeout value itself.
849
+ - *failure_reason* contains a description of the error that occurred during optimization or execution
850
+ - *db_config* describes the database (and its state) on which the query was executed. The state is obtained just before
851
+ query execution started and after the optimization and query preparation steps have been applied
852
+ - *query_preparation* contains the settings that were used to prepare the query after optimization but before execution
853
+ - *optimization_time* contains the time it took to optimize the query using PostBOUND (in seconds). If the query was
854
+ not optimized, this will be *NaN*
855
+ - *optimization_pipeline* contains the optimization pipeline that was used to optimize the query. If the query was not
856
+ optimized, this will be *None*
857
+ - *optimized_query* contains the optimized query that was actually executed on the database. If the query was not
858
+ optimized, this will be *None*
859
+
860
+
861
+ Other Parameters
862
+ ----------------
863
+ exec_callback : Optional[Callable[[ExecutionResult], None]], optional
864
+ A post-process action that should be executed after each repetition of the query has been completed.
865
+ repetition_callback : Optional[Callable[[int], None]], optional
866
+ An optional post-process action that is executed after each workload repetition. The current repetition number is
867
+ provided as the only argument. Repetitions start at 1.
868
+ error_action : ErrorHandling, optional
869
+ Configures how errors during optimization or execution are handled. By default, failing queries are still contained
870
+ in the result data frame, but some columns might not contain meaningful values. Check the *status* column of the
871
+ data frame to see what happened.
872
+
873
+ Notes
874
+ -----
875
+ If the database system does provide accurate timing information through the `StopwatchSupport` interface, these
876
+ measurements will be preferred over the wall-clock timing that is obtained in the benchmarking process.
877
+ """
878
+ queries = _wrap_workload(queries)
879
+ if isinstance(on, OptimizationStage):
880
+ on = _wrap_optimization_stage(on)
881
+ target_db = on if isinstance(on, Database) else on.target_database()
882
+ optimizer = on if isinstance(on, OptimizationPipeline) else None
883
+
884
+ query_preparation = (
885
+ QueryPreparation(**query_preparation)
886
+ if isinstance(query_preparation, dict)
887
+ else query_preparation
888
+ )
889
+ progressive_output = Path(progressive_output) if progressive_output else None
890
+
891
+ log: _LoggerImpl
892
+ if logger == "tqdm":
893
+ log = _TqdmLogger(
894
+ workload_reps=workload_repetitions,
895
+ query_reps=per_query_repetitions,
896
+ total_queries=len(queries),
897
+ )
898
+ elif logger is not None:
899
+ log = _CustomLogger(logger, workload_reps=workload_repetitions)
900
+ else:
901
+ log = _NoOpLogger()
902
+
903
+ cfg = _BenchmarkConfig(
904
+ target_db=target_db,
905
+ optimizer=optimizer,
906
+ output=progressive_output,
907
+ per_query_repetitions=per_query_repetitions,
908
+ timeout=timeout,
909
+ query_prep=query_preparation,
910
+ exec_callback=exec_callback,
911
+ log=log,
912
+ error_action=error_action,
913
+ )
914
+ results = _ExecutionResults(
915
+ query_reps=per_query_repetitions, query_prep=query_preparation
916
+ )
917
+
918
+ # The overall control flow looks roughly like this:
919
+ # ++ workload repetitions [handled here]
920
+ # ++ workload control loop [handled by _workload_ctl_loop]
921
+ # query optimization
922
+ # per-query repetitions
923
+ # ++ high-level query execution and result generation [handled by _exec_ctl_loop]
924
+ # ++ low-level query execution [handled by _execute_query]
925
+
926
+ for i in range(workload_repetitions):
927
+ log.next_workload_iter()
928
+ results.next_workload_repetition()
929
+ if shuffled:
930
+ queries = queries.shuffle()
931
+
932
+ _workload_ctl_loop(
933
+ queries,
934
+ results=results,
935
+ cfg=cfg,
936
+ )
937
+
938
+ if repetition_callback:
939
+ repetition_callback(i + 1)
940
+
941
+ log.next_workload_iter() # to finalize progress bars
942
+ return results.to_df()
943
+
944
+
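A usage sketch for `execute_workload`; the connection helper and the workload loader are assumptions about the surrounding project, not guaranteed by this module:

    from postbound.db import postgres
    from postbound.experiments import workloads

    target_db = postgres.connect()   # assumed connection helper, actual signature may differ
    job = workloads.job()            # assumed loader for the Join Order Benchmark
    results = execute_workload(
        job,
        on=target_db,
        workload_repetitions=3,
        query_preparation={"count_star": True, "prewarm": True},
        timeout=120.0,
        logger="tqdm",
        error_action="log",
    )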
945
+ def prepare_export(results_df: pd.DataFrame) -> pd.DataFrame:
946
+ """Modifies a benchmark result dataframe such that it can be written to CSV files without problems.
947
+
948
+ This mostly involves converting Python objects to JSON counterparts that allow a reconstruction of equivalent data.
949
+
950
+ More specifically, the function handles two main aspects:
951
+
952
+ 1. making sure that the query result can be written to CSV, and
953
+ 2. making sure that the description of the optimization pipeline can be written to CSV.
954
+
955
+ In both cases, the column values will be transformed to JSON objects if necessary.
956
+
957
+ Parameters
958
+ ----------
959
+ results_df : pd.DataFrame
960
+ The result dataframe created by one of the benchmark functions
961
+
962
+ Returns
963
+ -------
964
+ pd.DataFrame
965
+ The prepared dataframe
966
+
967
+ See Also
968
+ --------
969
+ postbound.experiments.runner : Functions to obtain benchmark results
970
+ """
971
+ if not len(results_df):
972
+ return results_df
973
+
974
+ prepared_df = results_df.copy()
975
+
976
+ example_result = prepared_df.iloc[0]
977
+ for col in example_result.index:
978
+ if not isinstance(example_result[col], (list, tuple, dict, Jsonizable)):
979
+ continue
980
+
981
+ prepared_df[col] = prepared_df[col].apply(util.to_json)
982
+
983
+ return prepared_df
984
+
985
+
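A short sketch combining the export helper with pandas' CSV writer (the file name is arbitrary):

    # Convert result sets and pipeline descriptions to JSON-friendly values, then persist.
    exportable = prepare_export(results)
    exportable.to_csv("benchmark-results.csv", index=False)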
986
+ def sort_results(
987
+ results_df: pd.DataFrame, by_column: str | tuple[str] = ("label", "exec_index")
988
+ ) -> pd.DataFrame:
989
+ """Provides a better sorting of the benchmark results in a data frame.
990
+
991
+ By default, the entries in the result data frame will be sorted either sequentially, or by a lexicographic ordering on the
992
+ label column. This function uses a natural ordering over the label values instead.
993
+
994
+ In contrast to lexicographic sorting, natural sorting handles numeric labels in a better way: labels like
995
+ 1a, 10a and 100a are sorted in this order instead of in reverse.
996
+
997
+ Parameters
998
+ ----------
999
+ results_df : pd.DataFrame
1000
+ Data frame containing the results to sort
1001
+ by_column : str | tuple[str], optional
1002
+ The columns by which to order, by default `("label", "exec_index")`. The natural ordering will
1003
+ be applied to all of them.
1004
+
1005
+ Returns
1006
+ -------
1007
+ pd.DataFrame
1008
+ A reordered data frame. The original data frame is not modified
1009
+ """
1010
+ return results_df.sort_values(
1011
+ by=by_column, key=lambda series: np.argsort(natsort.index_natsorted(series))
1012
+ )
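For example, applied to a frame with JOB-style labels (a minimal sketch; the frame is constructed only for illustration):

    import pandas as pd

    df = pd.DataFrame({"label": ["10a", "1a", "2b"], "exec_index": [2, 1, 3]})
    print(sort_results(df)["label"].tolist())
    # natural order: ['1a', '2b', '10a'] instead of the lexicographic ['10a', '1a', '2b']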