lakebench 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167) hide show
  1. lakebench-0.1.0/LICENSE +21 -0
  2. lakebench-0.1.0/PKG-INFO +214 -0
  3. lakebench-0.1.0/README.md +159 -0
  4. lakebench-0.1.0/pyproject.toml +49 -0
  5. lakebench-0.1.0/setup.cfg +4 -0
  6. lakebench-0.1.0/src/lakebench/__init__.py +0 -0
  7. lakebench-0.1.0/src/lakebench/benchmarks/__init__.py +0 -0
  8. lakebench-0.1.0/src/lakebench/benchmarks/_tpc/__init__.py +1 -0
  9. lakebench-0.1.0/src/lakebench/benchmarks/_tpc/_tpc.py +156 -0
  10. lakebench-0.1.0/src/lakebench/benchmarks/base.py +54 -0
  11. lakebench-0.1.0/src/lakebench/benchmarks/elt_bench/__init__.py +1 -0
  12. lakebench-0.1.0/src/lakebench/benchmarks/elt_bench/elt_bench.py +125 -0
  13. lakebench-0.1.0/src/lakebench/benchmarks/elt_bench/engine_impl/__init__.py +0 -0
  14. lakebench-0.1.0/src/lakebench/benchmarks/elt_bench/engine_impl/daft.py +147 -0
  15. lakebench-0.1.0/src/lakebench/benchmarks/elt_bench/engine_impl/duckdb.py +122 -0
  16. lakebench-0.1.0/src/lakebench/benchmarks/elt_bench/engine_impl/polars.py +145 -0
  17. lakebench-0.1.0/src/lakebench/benchmarks/elt_bench/engine_impl/spark.py +105 -0
  18. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/__init__.py +1 -0
  19. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/ddl/ddl_v3.2.0.sql +496 -0
  20. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q1.sql +28 -0
  21. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q10.sql +71 -0
  22. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q11.sql +84 -0
  23. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q12.sql +27 -0
  24. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q13.sql +50 -0
  25. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q14a.sql +167 -0
  26. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q14b.sql +149 -0
  27. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q15.sql +20 -0
  28. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q16.sql +30 -0
  29. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q17.sql +40 -0
  30. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q18.sql +38 -0
  31. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q19.sql +29 -0
  32. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q2.sql +67 -0
  33. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q20.sql +27 -0
  34. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q21.sql +45 -0
  35. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q22.sql +24 -0
  36. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q23a.sql +87 -0
  37. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q23b.sql +87 -0
  38. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q24a.sql +57 -0
  39. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q24b.sql +57 -0
  40. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q25.sql +37 -0
  41. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q26.sql +24 -0
  42. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q27.sql +28 -0
  43. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q28.sql +82 -0
  44. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q29.sql +36 -0
  45. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q3.sql +19 -0
  46. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q30.sql +55 -0
  47. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q31.sql +57 -0
  48. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q32.sql +20 -0
  49. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q33.sql +85 -0
  50. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q34.sql +46 -0
  51. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q35.sql +70 -0
  52. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q36.sql +23 -0
  53. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q37.sql +21 -0
  54. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q38.sql +34 -0
  55. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q39a.sql +58 -0
  56. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q39b.sql +59 -0
  57. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q4.sql +146 -0
  58. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q40.sql +43 -0
  59. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q41.sql +121 -0
  60. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q42.sql +22 -0
  61. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q43.sql +44 -0
  62. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q44.sql +72 -0
  63. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q45.sql +29 -0
  64. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q46.sql +45 -0
  65. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q47.sql +70 -0
  66. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q48.sql +47 -0
  67. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q49.sql +145 -0
  68. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q5.sql +144 -0
  69. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q50.sql +86 -0
  70. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q51.sql +56 -0
  71. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q52.sql +21 -0
  72. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q53.sql +40 -0
  73. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q54.sql +69 -0
  74. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q55.sql +18 -0
  75. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q56.sql +86 -0
  76. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q57.sql +66 -0
  77. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q58.sql +107 -0
  78. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q59.sql +85 -0
  79. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q6.sql +31 -0
  80. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q60.sql +86 -0
  81. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q61.sql +43 -0
  82. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q62.sql +62 -0
  83. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q63.sql +40 -0
  84. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q64.sql +110 -0
  85. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q65.sql +46 -0
  86. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q66.sql +160 -0
  87. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q67.sql +58 -0
  88. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q68.sql +44 -0
  89. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q69.sql +59 -0
  90. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q7.sql +24 -0
  91. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q70.sql +39 -0
  92. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q71.sql +49 -0
  93. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q72.sql +65 -0
  94. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q73.sql +39 -0
  95. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q74.sql +70 -0
  96. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q75.sql +101 -0
  97. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q76.sql +55 -0
  98. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q77.sql +133 -0
  99. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q78.sql +96 -0
  100. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q79.sql +40 -0
  101. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q8.sql +35 -0
  102. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q80.sql +110 -0
  103. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q81.sql +61 -0
  104. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q82.sql +22 -0
  105. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q83.sql +94 -0
  106. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q84.sql +16 -0
  107. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q85.sql +64 -0
  108. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q86.sql +21 -0
  109. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q87.sql +39 -0
  110. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q88.sql +203 -0
  111. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q89.sql +50 -0
  112. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q9.sql +119 -0
  113. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q90.sql +28 -0
  114. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q91.sql +33 -0
  115. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q92.sql +24 -0
  116. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q93.sql +31 -0
  117. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q94.sql +32 -0
  118. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q95.sql +39 -0
  119. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q96.sql +14 -0
  120. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q97.sql +49 -0
  121. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q98.sql +28 -0
  122. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/resources/queries/q99.sql +62 -0
  123. lakebench-0.1.0/src/lakebench/benchmarks/tpcds/tpcds.py +28 -0
  124. lakebench-0.1.0/src/lakebench/benchmarks/tpch/__init__.py +1 -0
  125. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/ddl/ddl_v3.0.1.sql +84 -0
  126. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q1.sql +21 -0
  127. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q10.sql +28 -0
  128. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q11.sql +24 -0
  129. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q12.sql +29 -0
  130. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q13.sql +20 -0
  131. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q14.sql +13 -0
  132. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q15.sql +30 -0
  133. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q16.sql +29 -0
  134. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q17.sql +16 -0
  135. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q18.sql +31 -0
  136. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q19.sql +33 -0
  137. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q2.sql +43 -0
  138. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q20.sql +36 -0
  139. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q21.sql +36 -0
  140. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q22.sql +35 -0
  141. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q3.sql +20 -0
  142. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q4.sql +21 -0
  143. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q5.sql +18 -0
  144. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q6.sql +10 -0
  145. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q7.sql +34 -0
  146. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q8.sql +32 -0
  147. lakebench-0.1.0/src/lakebench/benchmarks/tpch/resources/queries/q9.sql +26 -0
  148. lakebench-0.1.0/src/lakebench/benchmarks/tpch/tpch.py +17 -0
  149. lakebench-0.1.0/src/lakebench/datagen/__init__.py +0 -0
  150. lakebench-0.1.0/src/lakebench/datagen/_tpc.py +70 -0
  151. lakebench-0.1.0/src/lakebench/datagen/tpcds.py +7 -0
  152. lakebench-0.1.0/src/lakebench/datagen/tpch.py +7 -0
  153. lakebench-0.1.0/src/lakebench/engines/__init__.py +0 -0
  154. lakebench-0.1.0/src/lakebench/engines/base.py +34 -0
  155. lakebench-0.1.0/src/lakebench/engines/daft.py +66 -0
  156. lakebench-0.1.0/src/lakebench/engines/delta_rs.py +16 -0
  157. lakebench-0.1.0/src/lakebench/engines/duckdb.py +62 -0
  158. lakebench-0.1.0/src/lakebench/engines/fabric_spark.py +29 -0
  159. lakebench-0.1.0/src/lakebench/engines/polars.py +72 -0
  160. lakebench-0.1.0/src/lakebench/engines/spark.py +95 -0
  161. lakebench-0.1.0/src/lakebench/utils/query_utils.py +13 -0
  162. lakebench-0.1.0/src/lakebench/utils/timer.py +42 -0
  163. lakebench-0.1.0/src/lakebench.egg-info/PKG-INFO +214 -0
  164. lakebench-0.1.0/src/lakebench.egg-info/SOURCES.txt +165 -0
  165. lakebench-0.1.0/src/lakebench.egg-info/dependency_links.txt +1 -0
  166. lakebench-0.1.0/src/lakebench.egg-info/requires.txt +21 -0
  167. lakebench-0.1.0/src/lakebench.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Miles Cole
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,214 @@
1
+ Metadata-Version: 2.4
2
+ Name: lakebench
3
+ Version: 0.1.0
4
+ Summary: A multi-modal Python library for benchmarking Azure lakehouse engines and ELT scenarios, supporting both industry-standard and novel benchmarks.
5
+ Author-email: Miles Cole <m.w.c.360@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Miles Cole
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/mwc360/LakeBench
29
+ Project-URL: Issues, https://github.com/mwc360/LakeBench/issues
30
+ Classifier: Development Status :: 4 - Beta
31
+ Classifier: License :: OSI Approved :: MIT License
32
+ Classifier: Programming Language :: Python
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: Topic :: System :: Benchmark
35
+ Requires-Python: >=3.11
36
+ Description-Content-Type: text/markdown
37
+ License-File: LICENSE
38
+ Requires-Dist: numpy
39
+ Requires-Dist: pyarrow>=15.0.0
40
+ Requires-Dist: deltalake==0.18.2
41
+ Requires-Dist: sqlglot==26.30.0
42
+ Provides-Extra: duckdb
43
+ Requires-Dist: duckdb==1.3.1; extra == "duckdb"
44
+ Provides-Extra: polars
45
+ Requires-Dist: polars==1.30.0; extra == "polars"
46
+ Provides-Extra: daft
47
+ Requires-Dist: getdaft==0.4.18; extra == "daft"
48
+ Provides-Extra: all
49
+ Requires-Dist: duckdb==1.3.1; extra == "all"
50
+ Requires-Dist: polars==1.30.0; extra == "all"
51
+ Requires-Dist: getdaft==0.4.18; extra == "all"
52
+ Provides-Extra: datagen
53
+ Requires-Dist: duckdb==1.3.1; extra == "datagen"
54
+ Dynamic: license-file
55
+
56
+ # LakeBench
57
+
58
+ 🌊 LakeBench is the first Python-based, multi-modal benchmarking framework designed to evaluate performance across multiple lakehouse compute engines and ELT scenarios. Supporting a variety of engines and both industry-standard and novel benchmarks, LakeBench enables comprehensive, apples-to-apples comparisons in a single, extensible Python library.
59
+
60
+ Most existing benchmarks (like TPC-DS and TPC-H) are too query-heavy and miss the reality that data engineers build complex **ELT pipelines** — not just run analytic queries. While these traditional benchmarks are helpful for testing bulk loading and complex SQL execution, they do not reflect the broader data lifecycle that lakehouse systems must support.
61
+
62
+ **LakeBench bridges this gap by introducing novel benchmarks that aim to capture the growing spectrum of ELT workflows.** In addition to supporting industry standards like TPC-DS and TPC-H, LakeBench includes scenarios that measure not only query performance, but also data loading, transformation, incremental processing, and maintenance operations. This holistic approach enables you to benchmark engines on the real-world tasks that matter most for modern data engineering.
63
+
64
+ > LakeBench proposes that **the entire end-to-end data lifecycle managed by data engineers is relevant**: data loading, bulk and incremental transformations, maintenance jobs, and ad-hoc analytical queries. By benchmarking these stages, LakeBench delivers actionable insights into engine efficiency, performance, and operational trade-offs across the full data pipeline.
65
+
66
+ ---
67
+
68
+ ## 🧱 Key Features
69
+
70
+ - **Modular engine support** (Spark, DuckDB, Polars, Daft)
71
+ - **Benchmark scenarios** that reflect real-world ELT workflows
72
+ - **Atomic units of work** that benchmark discrete lifecycle stages
73
+ - **Configurable execution** to isolate engine behaviors
74
+ - COMING SOON: **Custom result logging** and metrics capture (e.g. SparkMeasure)
75
+
76
+ ---
77
+
78
+ ## 🔍 Benchmark Scenarios
79
+
80
+ LakeBench currently supports three benchmarks with more to come:
81
+
82
+ - **ELTBench**: A benchmark with various modes (`light`, `full`) that simulates typical ELT workloads:
83
+ - Raw data load (Parquet → Delta)
84
+ - Fact table generation
85
+ - Incremental merge processing
86
+ - Table maintenance (e.g. OPTIMIZE/VACUUM)
87
+ - Ad-hoc analytical queries
88
+ - **[TPC-DS](https://www.tpc.org/tpcds/)**: An industry-standard benchmark for complex analytical queries, featuring 24 source tables and 99 queries. Designed to simulate decision support systems and analytics workloads.
89
+ - **[TPC-H](https://www.tpc.org/tpch/)**: Focuses on ad-hoc decision support with 8 tables and 22 queries, evaluating performance on business-oriented analytical workloads.
90
+
91
+ _Coming Soon_
92
+ - **AtomicELT**: A derivative of _ELTBench_ that focuses on the performance of individual ELT operations. Each operation type is executed only once, allowing for granular comparison of engine performance on specific tasks. Results should be interpreted per operation, not as a cumulative runtime.
93
+
94
+ ---
95
+
96
+ ## 🛠️ Engine Support Matrix
97
+
98
+ LakeBench supports multiple lakehouse compute engines. Each benchmark scenario declares which engines it supports via `<BenchmarkClassName>.BENCHMARK_IMPL_REGISTRY`.
99
+
100
+ | Engine | ELTBench | AtomicELT | TPC-DS | TPC-H |
101
+ |-----------------|:--------:|:---------:|:------:|:-----:|
102
+ | Spark (Fabric) | ✅ | 🔜 | ✅ | ✅ |
103
+ | DuckDB | ✅ | 🔜 | ✅ | ✅ |
104
+ | Polars | ✅ | 🔜 | ✅ | ✅ |
105
+ | Daft | ✅ | 🔜 | ✅ | ✅ |
106
+
107
+ > **Legend:**
108
+ > ✅ = Supported
109
+ > 🔜 = Coming Soon
110
+ > (Blank) = Not currently supported
111
+
112
+ LakeBench is designed to be _extensible_—new engines can be added via subclassing an existing engine class, and benchmarks can register support for additional engines as they are implemented.
113
+
114
+ ---
115
+
116
+ ## 📦 Installation
117
+
118
+ Install from PyPI:
119
+
120
+ ```bash
121
+ pip install lakebench[duckdb,polars,daft]
122
+ ```
123
+
124
+ _Note: in this initial beta version, all engines have only been tested inside Microsoft Fabric Python and Spark Notebooks._
125
+
126
+ ## Example Usage
127
+
128
+ ### Fabric Spark
129
+ ```python
130
+ from lakebench.engines.fabric_spark import FabricSpark
131
+ from lakebench.benchmarks.elt_bench import ELTBench
132
+
133
+ engine = FabricSpark(
134
+ lakehouse_workspace_name="workspace",
135
+ lakehouse_name="lakehouse",
136
+ lakehouse_schema_name="schema"
137
+ )
138
+
139
+ benchmark = ELTBench(
140
+ engine=engine,
141
+ scenario_name="sf10",
142
+ mode="light",
143
+ tpcds_parquet_abfss_path="abfss://...",
144
+ save_results=True,
145
+ result_abfss_path="abfss://..."
146
+ )
147
+
148
+ benchmark.run()
149
+ ```
150
+
151
+ ### Polars
152
+ ```python
153
+ from lakebench.engines.polars import Polars
154
+ from lakebench.benchmarks.elt_bench import ELTBench
155
+
156
+ engine = Polars(
157
+ delta_abfss_schema_path = 'abfss://...'
158
+ )
159
+
160
+ benchmark = ELTBench(
161
+ engine=engine,
162
+ scenario_name="sf10",
163
+ mode="light",
164
+ tpcds_parquet_abfss_path="abfss://...",
165
+ save_results=True,
166
+ result_abfss_path="abfss://..."
167
+ )
168
+
169
+ benchmark.run()
170
+ ```
171
+
172
+ ## 🔌 Extensibility by Design
173
+
174
+ LakeBench is built to be **plug-and-play** for both benchmark types and compute engines:
175
+
176
+ - You can register **new engines** without modifying core benchmark logic.
177
+ - You can add **new benchmarks** that reuse existing engines and shared engine methods.
178
+ - LakeBench extension libraries can be created to extend core LakeBench capabilities with additional custom benchmarks and engines (i.e. `MyCustomSynapseSpark(Spark)`, `MyOrgsELT(BaseBenchmark)`).
179
+
180
+ This architecture encourages experimentation, benchmarking innovation, and easy adaptation to your needs.
181
+
182
+ _Example:_
183
+ ```python
184
+ # Automatically maps benchmark implementation to your custom engine class
185
+ from lakebench.engines.spark import Spark
186
+
187
+ class MyCustomSynapseSpark(Spark):
188
+ ...
189
+
190
+ benchmark = AtomicELT(engine=MyCustomSynapseSpark(...))
191
+ ```
192
+ All you need to do is subclass the relevant base class and it will auto-register provided that the referenced benchmark supports the base class. No changes to the framework internals required.
193
+
194
+ # 🔍 Philosophy
195
+ LakeBench is designed to host a suite of benchmarks that cover E2E data engineering and consumption workloads:
196
+ - Loading data from raw storage
197
+ - Transforming and enriching data
198
+ - Applying incremental module building logic
199
+ - Maintaining and optimizing datasets
200
+ - Running complex analytical queries
201
+
202
+ The core aim is to provide transparency into engine efficiency, performance, and costs across the data lifecycle.
203
+
204
+ # 📬 Feedback / Contributions
205
+ Got ideas? Found a bug? Want to contribute a benchmark or engine wrapper? PRs and issues are welcome!
206
+
207
+
208
+ # Acknowledgement of Other _LakeBench_ Projects
209
+ The **LakeBench** name is also used by two unrelated academic and research efforts:
210
+ - **[RLGen/LAKEBENCH](https://github.com/RLGen/LAKEBENCH)**: A benchmark designed for evaluating vision-language models on multimodal tasks.
211
+ - **LakeBench: Benchmarks for Data Discovery over Lakes** ([paper link](https://www.catalyzex.com/paper/lakebench-benchmarks-for-data-discovery-over)):
212
+ A benchmark suite focused on improving data discovery and exploration over large data lakes.
213
+
214
+ While these projects target very different problem domains — such as machine learning and data discovery — they coincidentally share the same name. This project, focused on ELT benchmarking across lakehouse engines, is not affiliated with or derived from either.
@@ -0,0 +1,159 @@
1
+ # LakeBench
2
+
3
+ 🌊 LakeBench is the first Python-based, multi-modal benchmarking framework designed to evaluate performance across multiple lakehouse compute engines and ELT scenarios. Supporting a variety of engines and both industry-standard and novel benchmarks, LakeBench enables comprehensive, apples-to-apples comparisons in a single, extensible Python library.
4
+
5
+ Most existing benchmarks (like TPC-DS and TPC-H) are too query-heavy and miss the reality that data engineers build complex **ELT pipelines** — not just run analytic queries. While these traditional benchmarks are helpful for testing bulk loading and complex SQL execution, they do not reflect the broader data lifecycle that lakehouse systems must support.
6
+
7
+ **LakeBench bridges this gap by introducing novel benchmarks that aim to capture the growing spectrum of ELT workflows.** In addition to supporting industry standards like TPC-DS and TPC-H, LakeBench includes scenarios that measure not only query performance, but also data loading, transformation, incremental processing, and maintenance operations. This holistic approach enables you to benchmark engines on the real-world tasks that matter most for modern data engineering.
8
+
9
+ > LakeBench proposes that **the entire end-to-end data lifecycle managed by data engineers is relevant**: data loading, bulk and incremental transformations, maintenance jobs, and ad-hoc analytical queries. By benchmarking these stages, LakeBench delivers actionable insights into engine efficiency, performance, and operational trade-offs across the full data pipeline.
10
+
11
+ ---
12
+
13
+ ## 🧱 Key Features
14
+
15
+ - **Modular engine support** (Spark, DuckDB, Polars, Daft)
16
+ - **Benchmark scenarios** that reflect real-world ELT workflows
17
+ - **Atomic units of work** that benchmark discrete lifecycle stages
18
+ - **Configurable execution** to isolate engine behaviors
19
+ - COMING SOON: **Custom result logging** and metrics capture (e.g. SparkMeasure)
20
+
21
+ ---
22
+
23
+ ## 🔍 Benchmark Scenarios
24
+
25
+ LakeBench currently supports three benchmarks with more to come:
26
+
27
+ - **ELTBench**: A benchmark with various modes (`light`, `full`) that simulates typical ELT workloads:
28
+ - Raw data load (Parquet → Delta)
29
+ - Fact table generation
30
+ - Incremental merge processing
31
+ - Table maintenance (e.g. OPTIMIZE/VACUUM)
32
+ - Ad-hoc analytical queries
33
+ - **[TPC-DS](https://www.tpc.org/tpcds/)**: An industry-standard benchmark for complex analytical queries, featuring 24 source tables and 99 queries. Designed to simulate decision support systems and analytics workloads.
34
+ - **[TPC-H](https://www.tpc.org/tpch/)**: Focuses on ad-hoc decision support with 8 tables and 22 queries, evaluating performance on business-oriented analytical workloads.
35
+
36
+ _Coming Soon_
37
+ - **AtomicELT**: A derivative of _ELTBench_ that focuses on the performance of individual ELT operations. Each operation type is executed only once, allowing for granular comparison of engine performance on specific tasks. Results should be interpreted per operation, not as a cumulative runtime.
38
+
39
+ ---
40
+
41
+ ## 🛠️ Engine Support Matrix
42
+
43
+ LakeBench supports multiple lakehouse compute engines. Each benchmark scenario declares which engines it supports via `<BenchmarkClassName>.BENCHMARK_IMPL_REGISTRY`.
44
+
45
+ | Engine | ELTBench | AtomicELT | TPC-DS | TPC-H |
46
+ |-----------------|:--------:|:---------:|:------:|:-----:|
47
+ | Spark (Fabric) | ✅ | 🔜 | ✅ | ✅ |
48
+ | DuckDB | ✅ | 🔜 | ✅ | ✅ |
49
+ | Polars | ✅ | 🔜 | ✅ | ✅ |
50
+ | Daft | ✅ | 🔜 | ✅ | ✅ |
51
+
52
+ > **Legend:**
53
+ > ✅ = Supported
54
+ > 🔜 = Coming Soon
55
+ > (Blank) = Not currently supported
56
+
57
+ LakeBench is designed to be _extensible_—new engines can be added via subclassing an existing engine class, and benchmarks can register support for additional engines as they are implemented.
58
+
59
+ ---
60
+
61
+ ## 📦 Installation
62
+
63
+ Install from PyPI:
64
+
65
+ ```bash
66
+ pip install lakebench[duckdb,polars,daft]
67
+ ```
68
+
69
+ _Note: in this initial beta version, all engines have only been tested inside Microsoft Fabric Python and Spark Notebooks._
70
+
71
+ ## Example Usage
72
+
73
+ ### Fabric Spark
74
+ ```python
75
+ from lakebench.engines.fabric_spark import FabricSpark
76
+ from lakebench.benchmarks.elt_bench import ELTBench
77
+
78
+ engine = FabricSpark(
79
+ lakehouse_workspace_name="workspace",
80
+ lakehouse_name="lakehouse",
81
+ lakehouse_schema_name="schema"
82
+ )
83
+
84
+ benchmark = ELTBench(
85
+ engine=engine,
86
+ scenario_name="sf10",
87
+ mode="light",
88
+ tpcds_parquet_abfss_path="abfss://...",
89
+ save_results=True,
90
+ result_abfss_path="abfss://..."
91
+ )
92
+
93
+ benchmark.run()
94
+ ```
95
+
96
+ ### Polars
97
+ ```python
98
+ from lakebench.engines.polars import Polars
99
+ from lakebench.benchmarks.elt_bench import ELTBench
100
+
101
+ engine = Polars(
102
+ delta_abfss_schema_path = 'abfss://...'
103
+ )
104
+
105
+ benchmark = ELTBench(
106
+ engine=engine,
107
+ scenario_name="sf10",
108
+ mode="light",
109
+ tpcds_parquet_abfss_path="abfss://...",
110
+ save_results=True,
111
+ result_abfss_path="abfss://..."
112
+ )
113
+
114
+ benchmark.run()
115
+ ```
116
+
117
+ ## 🔌 Extensibility by Design
118
+
119
+ LakeBench is built to be **plug-and-play** for both benchmark types and compute engines:
120
+
121
+ - You can register **new engines** without modifying core benchmark logic.
122
+ - You can add **new benchmarks** that reuse existing engines and shared engine methods.
123
+ - LakeBench extension libraries can be created to extend core LakeBench capabilities with additional custom benchmarks and engines (i.e. `MyCustomSynapseSpark(Spark)`, `MyOrgsELT(BaseBenchmark)`).
124
+
125
+ This architecture encourages experimentation, benchmarking innovation, and easy adaptation to your needs.
126
+
127
+ _Example:_
128
+ ```python
129
+ # Automatically maps benchmark implementation to your custom engine class
130
+ from lakebench.engines.spark import Spark
131
+
132
+ class MyCustomSynapseSpark(Spark):
133
+ ...
134
+
135
+ benchmark = AtomicELT(engine=MyCustomSynapseSpark(...))
136
+ ```
137
+ All you need to do is subclass the relevant base class and it will auto-register provided that the referenced benchmark supports the base class. No changes to the framework internals required.
138
+
139
+ # 🔍 Philosophy
140
+ LakeBench is designed to host a suite of benchmarks that cover E2E data engineering and consumption workloads:
141
+ - Loading data from raw storage
142
+ - Transforming and enriching data
143
+ - Applying incremental module building logic
144
+ - Maintaining and optimizing datasets
145
+ - Running complex analytical queries
146
+
147
+ The core aim is to provide transparency into engine efficiency, performance, and costs across the data lifecycle.
148
+
149
+ # 📬 Feedback / Contributions
150
+ Got ideas? Found a bug? Want to contribute a benchmark or engine wrapper? PRs and issues are welcome!
151
+
152
+
153
+ # Acknowledgement of Other _LakeBench_ Projects
154
+ The **LakeBench** name is also used by two unrelated academic and research efforts:
155
+ - **[RLGen/LAKEBENCH](https://github.com/RLGen/LAKEBENCH)**: A benchmark designed for evaluating vision-language models on multimodal tasks.
156
+ - **LakeBench: Benchmarks for Data Discovery over Lakes** ([paper link](https://www.catalyzex.com/paper/lakebench-benchmarks-for-data-discovery-over)):
157
+ A benchmark suite focused on improving data discovery and exploration over large data lakes.
158
+
159
+ While these projects target very different problem domains — such as machine learning and data discovery — they coincidentally share the same name. This project, focused on ELT benchmarking across lakehouse engines, is not affiliated with or derived from either.
@@ -0,0 +1,49 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "lakebench"
7
+ version = "0.1.0"
8
+ authors = [
9
+ { name="Miles Cole", email="m.w.c.360@gmail.com" },
10
+ ]
11
+ license = {file = "LICENSE"}
12
+ description = "A multi-modal Python library for benchmarking Azure lakehouse engines and ELT scenarios, supporting both industry-standard and novel benchmarks."
13
+ readme = "README.md"
14
+ requires-python = ">=3.11"
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python",
19
+ "Intended Audience :: Developers",
20
+ "Topic :: System :: Benchmark",
21
+ ]
22
+ dependencies = [
23
+ "numpy",
24
+ "pyarrow>=15.0.0",
25
+ "deltalake==0.18.2",
26
+ "sqlglot==26.30.0"
27
+ ]
28
+
29
+ [project.optional-dependencies]
30
+ duckdb = ["duckdb==1.3.1"]
31
+ polars = ["polars==1.30.0"]
32
+ daft = ["getdaft==0.4.18"]
33
+ all = ["duckdb==1.3.1", "polars==1.30.0", "getdaft==0.4.18"]
34
+ datagen = ["duckdb==1.3.1"]
35
+
36
+ [project.urls]
37
+ Homepage = "https://github.com/mwc360/LakeBench"
38
+ Issues = "https://github.com/mwc360/LakeBench/issues"
39
+
40
+ [tool.setuptools]
41
+ package-dir = {"" = "src"}
42
+ include-package-data = true
43
+
44
+ [tool.setuptools.package-data]
45
+ # Include SQL files from the src directory
46
+ "lakebench" = ["**/*.sql"]
47
+
48
+ [tool.setuptools.packages.find]
49
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
File without changes
@@ -0,0 +1 @@
1
+ from ._tpc import _TPC
@@ -0,0 +1,156 @@
1
+ from typing import Optional
2
+ from ..base import BaseBenchmark
3
+ from ...utils.query_utils import transpile_and_qualify_query
4
+
5
+ from ...engines.base import BaseEngine
6
+ from ...engines.spark import Spark
7
+ from ...engines.duckdb import DuckDB
8
+ from ...engines.daft import Daft
9
+ from ...engines.polars import Polars
10
+
11
+ import importlib.resources
12
+ import posixpath
13
+
14
+ class _TPC(BaseBenchmark):
15
+ """
16
+ """
17
+ BENCHMARK_IMPL_REGISTRY = {
18
+ Spark: None,
19
+ DuckDB: None,
20
+ Daft: None,
21
+ Polars: None
22
+ }
23
+ MODE_REGISTRY = ['load', 'query', 'power_test']
24
+ TPC_BENCHMARK_VARIANT = ''
25
+ TABLE_REGISTRY = [
26
+ 'call_center', 'catalog_page', 'catalog_returns', 'catalog_sales',
27
+ 'customer', 'customer_address', 'customer_demographics', 'date_dim',
28
+ 'household_demographics', 'income_band', 'inventory', 'item',
29
+ 'promotion', 'reason', 'ship_mode', 'store', 'store_returns',
30
+ 'store_sales', 'time_dim', 'warehouse', 'web_page', 'web_returns',
31
+ 'web_sales', 'web_site'
32
+ ]
33
+ QUERY_REGISTRY = [
34
+ 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10',
35
+ 'q11', 'q12', 'q13', 'q14a', 'q14b', 'q15', 'q16', 'q17', 'q18', 'q19', 'q20',
36
+ 'q21', 'q22', 'q23a', 'q23b', 'q24a', 'q24b', 'q25', 'q26', 'q27', 'q28', 'q29', 'q30',
37
+ 'q31', 'q32', 'q33', 'q34', 'q35', 'q36', 'q37', 'q38', 'q39a', 'q39b', 'q40',
38
+ 'q41', 'q42', 'q43', 'q44', 'q45', 'q46', 'q47', 'q48', 'q49', 'q50',
39
+ 'q51', 'q52', 'q53', 'q54', 'q55', 'q56', 'q57', 'q58', 'q59', 'q60',
40
+ 'q61', 'q62', 'q63', 'q64', 'q65', 'q66', 'q67', 'q68', 'q69', 'q70',
41
+ 'q71', 'q72', 'q73', 'q74', 'q75', 'q76', 'q77', 'q78', 'q79', 'q80',
42
+ 'q81', 'q82', 'q83', 'q84', 'q85', 'q86', 'q87', 'q88', 'q89', 'q90',
43
+ 'q91', 'q92', 'q93', 'q94', 'q95', 'q96', 'q97', 'q98', 'q99'
44
+ ]
45
+ DDL_FILE_NAME = ''
46
+
47
+ def __init__(
48
+ self,
49
+ engine: BaseEngine,
50
+ scenario_name: str,
51
+ query_list: Optional[list[str]] | None = None,
52
+ parquet_mount_path: Optional[str] = None,
53
+ parquet_abfss_path: Optional[str] = None,
54
+ result_abfss_path: Optional[str] = None,
55
+ save_results: bool = False
56
+ ):
57
+ super().__init__(engine, scenario_name, result_abfss_path, save_results)
58
+ if query_list is not None:
59
+ expanded_query_list = []
60
+ for query in query_list:
61
+ if query == '*':
62
+ expanded_query_list.extend(self.QUERY_REGISTRY) # Replace '*' with all queries
63
+ else:
64
+ expanded_query_list.append(query)
65
+ query_set = set(query_list)
66
+ if not query_set.issubset(self.QUERY_REGISTRY):
67
+ unsupported_queries = query_set - set(self.QUERY_REGISTRY)
68
+ raise ValueError(f"Query list contains unsupported queries: {unsupported_queries}. Supported queries: {self.QUERY_REGISTRY}.")
69
+ self.query_list = expanded_query_list
70
+ else:
71
+ self.query_list = self.QUERY_REGISTRY
72
+
73
+ for base_engine, benchmark_impl in self.BENCHMARK_IMPL_REGISTRY.items():
74
+ if isinstance(engine, base_engine):
75
+ self.benchmark_impl_class = benchmark_impl
76
+ break
77
+ else:
78
+ raise ValueError(
79
+ f"No benchmark implementation registered for engine type: {type(engine).__name__} "
80
+ f"in benchmark '{self.__class__.__name__}'."
81
+ )
82
+
83
+ self.engine = engine
84
+ self.scenario_name = scenario_name
85
+
86
+ match engine.REQUIRED_READ_ENDPOINT:
87
+ case 'mount':
88
+ if parquet_mount_path is None:
89
+ raise ValueError(f"parquet_mount_path must be provided for {type(engine).__name__} engine.")
90
+ self.source_data_path = parquet_mount_path
91
+ case 'abfss':
92
+ if parquet_abfss_path is None:
93
+ raise ValueError(f"parquet_abfss_path must be provided for {type(engine).__name__} engine.")
94
+ self.source_data_path = parquet_abfss_path
95
+ case _:
96
+ if parquet_mount_path is None and parquet_abfss_path is None:
97
+ raise ValueError(
98
+ f"Either parquet_mount_path or parquet_abfss_path must be provided for {type(engine).__name__} engine."
99
+ )
100
+ self.source_data_path = parquet_abfss_path or parquet_mount_path
101
+
102
+ def run(self, mode: str = 'power_test'):
103
+ match mode:
104
+ case 'load':
105
+ self._run_load_test()
106
+ case 'query':
107
+ self._run_query_test()
108
+ case 'power_test':
109
+ self._run_power_test()
110
+ case _:
111
+ raise ValueError(f"Unknown mode '{mode}'. Supported modes: {self.MODE_REGISTRY}.")
112
+
113
+ def _prepare_schema(self):
114
+ self.engine.create_schema_if_not_exists(drop_before_create=True)
115
+ with importlib.resources.path(f"lakebench.benchmarks.tpc{self.TPC_BENCHMARK_VARIANT.lower()}.resources.ddl", self.DDL_FILE_NAME) as ddl_path:
116
+ with open(ddl_path, 'r') as ddl_file:
117
+ ddl = ddl_file.read()
118
+
119
+ statements = [s for s in ddl.split(';') if len(s) > 7]
120
+ for statement in statements:
121
+ self.engine.execute_sql_statement(statement)
122
+
123
+ def _run_load_test(self):
124
+ if isinstance(self.engine, Spark):
125
+ self._prepare_schema()
126
+ for table_name in self.TABLE_REGISTRY:
127
+ with self.timer(phase="Load", test_item=table_name, engine=self.engine):
128
+ self.engine.load_parquet_to_delta(
129
+ parquet_folder_path=posixpath.join(self.source_data_path,table_name),
130
+ table_name=table_name
131
+ )
132
+ self.post_results()
133
+
134
+ def _run_query_test(self):
135
+ if isinstance(self.engine, (DuckDB, Daft, Polars)):
136
+ for table_name in self.TABLE_REGISTRY:
137
+ self.engine.register_table(table_name)
138
+ for query_name in self.query_list:
139
+ with importlib.resources.path(f"lakebench.benchmarks.tpc{self.TPC_BENCHMARK_VARIANT.lower()}.resources.queries", f'{query_name}.sql') as query_path:
140
+ with open(query_path, 'r') as query_file:
141
+ query = query_file.read()
142
+
143
+ prepped_query = transpile_and_qualify_query(
144
+ query=query,
145
+ from_dialect='spark',
146
+ to_dialect=self.engine.SQLGLOT_DIALECT,
147
+ catalog=self.engine.catalog_name,
148
+ schema=self.engine.schema_name
149
+ )
150
+ with self.timer(phase="Query", test_item=query_name, engine=self.engine):
151
+ execute_query = self.engine.execute_sql_query(prepped_query)
152
+ self.post_results()
153
+
154
+ def _run_power_test(self):
155
+ self._run_load_test()
156
+ self._run_query_test()
@@ -0,0 +1,54 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Dict, Type, Optional
3
+ import uuid
4
+ from datetime import datetime
5
+ from ..utils.timer import timer
6
+
7
class BaseBenchmark(ABC):
    """
    Abstract base class for all LakeBench benchmarks.

    Centralizes result bookkeeping shared by every benchmark: a per-run
    header (run id, timestamp, engine/compute metadata), the timing context
    manager, and optional persistence of timing rows to a Delta table.
    """
    # Subclasses map supported engine base classes to engine-specific
    # benchmark implementation classes.
    BENCHMARK_IMPL_REGISTRY: Dict[Type, Type] = {}

    def __init__(self, engine, scenario_name: str, result_abfss_path: Optional[str], save_results: bool = False):
        """
        Parameters
        ----------
        engine
            Engine instance under test; must expose ``get_total_cores()``,
            ``get_compute_size()`` and ``append_array_to_delta()``.
        scenario_name : str
            Label recorded with every result row.
        result_abfss_path : str, optional
            Delta table path results are appended to when ``save_results``
            is True.
        save_results : bool
            Persist results to ``result_abfss_path`` after each phase.
        """
        self.engine = engine
        self.scenario_name = scenario_name
        self.result_abfss_path = result_abfss_path
        self.save_results = save_results
        self.header_detail_dict = {
            # uuid4 gives a random opaque id; uuid1 would embed the host MAC
            # address and timestamp in every persisted result row.
            'run_id': str(uuid.uuid4()),
            # NOTE(review): naive local time — confirm whether UTC is wanted
            # for cross-environment result comparison.
            'run_datetime': datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
            'engine': type(engine).__name__,
            'benchmark': self.__class__.__name__,
            'scenario': scenario_name,
            'total_cores': self.engine.get_total_cores(),
            'compute_size': self.engine.get_compute_size()
        }
        self.timer = timer
        self.results = []

    @abstractmethod
    def run(self):
        """Execute the benchmark; implemented by subclasses."""
        pass

    def post_results(self):
        """
        Flatten accumulated timer tuples into result rows, optionally append
        them to the results Delta table, then clear the timer.

        Each call appends one batch (a list of row dicts) to ``self.results``.

        Raises
        ------
        ValueError
            If ``save_results`` is True but no ``result_abfss_path`` was given.
        """
        result_array = [
            {
                **self.header_detail_dict,
                'phase': phase,
                'test_item': test_item,
                'start_datetime': start_datetime,
                'duration_sec': duration_ms / 1000,
                'duration_ms': duration_ms,
                'iteration': iteration,
                'success': success,
                'error_message': error_message
            }
            for phase, test_item, start_datetime, duration_ms, iteration, success, error_message in self.timer.results
        ]

        if self.save_results:
            if self.result_abfss_path is None:
                raise ValueError("result_abfss_path must be provided if save_results is True.")
            self.engine.append_array_to_delta(self.result_abfss_path, result_array)

        self.results.append(result_array)
        self.timer.clear_results()
@@ -0,0 +1 @@
1
+ from .elt_bench import ELTBench