databricks-tpcds 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. databricks_tpcds-0.1.0/MANIFEST.in +1 -0
  2. databricks_tpcds-0.1.0/PKG-INFO +101 -0
  3. databricks_tpcds-0.1.0/README.md +80 -0
  4. databricks_tpcds-0.1.0/setup.cfg +4 -0
  5. databricks_tpcds-0.1.0/setup.py +31 -0
  6. databricks_tpcds-0.1.0/src/databricks_tpcds/__init__.py +9 -0
  7. databricks_tpcds-0.1.0/src/databricks_tpcds/databricks_tpcds.py +153 -0
  8. databricks_tpcds-0.1.0/src/databricks_tpcds.egg-info/PKG-INFO +101 -0
  9. databricks_tpcds-0.1.0/src/databricks_tpcds.egg-info/SOURCES.txt +119 -0
  10. databricks_tpcds-0.1.0/src/databricks_tpcds.egg-info/dependency_links.txt +1 -0
  11. databricks_tpcds-0.1.0/src/databricks_tpcds.egg-info/top_level.txt +2 -0
  12. databricks_tpcds-0.1.0/src/resources/__init__.py +0 -0
  13. databricks_tpcds-0.1.0/src/resources/queries/q0.sql +2 -0
  14. databricks_tpcds-0.1.0/src/resources/queries/q1.sql +24 -0
  15. databricks_tpcds-0.1.0/src/resources/queries/q10.sql +58 -0
  16. databricks_tpcds-0.1.0/src/resources/queries/q11.sql +80 -0
  17. databricks_tpcds-0.1.0/src/resources/queries/q12.sql +33 -0
  18. databricks_tpcds-0.1.0/src/resources/queries/q13.sql +51 -0
  19. databricks_tpcds-0.1.0/src/resources/queries/q14.sql +209 -0
  20. databricks_tpcds-0.1.0/src/resources/queries/q14a.sql +102 -0
  21. databricks_tpcds-0.1.0/src/resources/queries/q14b.sql +106 -0
  22. databricks_tpcds-0.1.0/src/resources/queries/q15.sql +19 -0
  23. databricks_tpcds-0.1.0/src/resources/queries/q16.sql +30 -0
  24. databricks_tpcds-0.1.0/src/resources/queries/q17.sql +44 -0
  25. databricks_tpcds-0.1.0/src/resources/queries/q18.sql +33 -0
  26. databricks_tpcds-0.1.0/src/resources/queries/q19.sql +24 -0
  27. databricks_tpcds-0.1.0/src/resources/queries/q2.sql +59 -0
  28. databricks_tpcds-0.1.0/src/resources/queries/q20.sql +29 -0
  29. databricks_tpcds-0.1.0/src/resources/queries/q21.sql +29 -0
  30. databricks_tpcds-0.1.0/src/resources/queries/q22.sql +19 -0
  31. databricks_tpcds-0.1.0/src/resources/queries/q23.sql +106 -0
  32. databricks_tpcds-0.1.0/src/resources/queries/q23a.sql +49 -0
  33. databricks_tpcds-0.1.0/src/resources/queries/q23b.sql +56 -0
  34. databricks_tpcds-0.1.0/src/resources/queries/q24.sql +105 -0
  35. databricks_tpcds-0.1.0/src/resources/queries/q24a.sql +51 -0
  36. databricks_tpcds-0.1.0/src/resources/queries/q24b.sql +53 -0
  37. databricks_tpcds-0.1.0/src/resources/queries/q25.sql +47 -0
  38. databricks_tpcds-0.1.0/src/resources/queries/q26.sql +20 -0
  39. databricks_tpcds-0.1.0/src/resources/queries/q27.sql +22 -0
  40. databricks_tpcds-0.1.0/src/resources/queries/q28.sql +52 -0
  41. databricks_tpcds-0.1.0/src/resources/queries/q29.sql +46 -0
  42. databricks_tpcds-0.1.0/src/resources/queries/q3.sql +20 -0
  43. databricks_tpcds-0.1.0/src/resources/queries/q30.sql +30 -0
  44. databricks_tpcds-0.1.0/src/resources/queries/q31.sql +51 -0
  45. databricks_tpcds-0.1.0/src/resources/queries/q32.sql +27 -0
  46. databricks_tpcds-0.1.0/src/resources/queries/q33.sql +74 -0
  47. databricks_tpcds-0.1.0/src/resources/queries/q34.sql +30 -0
  48. databricks_tpcds-0.1.0/src/resources/queries/q35.sql +57 -0
  49. databricks_tpcds-0.1.0/src/resources/queries/q36.sql +29 -0
  50. databricks_tpcds-0.1.0/src/resources/queries/q37.sql +16 -0
  51. databricks_tpcds-0.1.0/src/resources/queries/q38.sql +22 -0
  52. databricks_tpcds-0.1.0/src/resources/queries/q39.sql +53 -0
  53. databricks_tpcds-0.1.0/src/resources/queries/q39a.sql +25 -0
  54. databricks_tpcds-0.1.0/src/resources/queries/q39b.sql +27 -0
  55. databricks_tpcds-0.1.0/src/resources/queries/q4.sql +115 -0
  56. databricks_tpcds-0.1.0/src/resources/queries/q40.sql +27 -0
  57. databricks_tpcds-0.1.0/src/resources/queries/q41.sql +51 -0
  58. databricks_tpcds-0.1.0/src/resources/queries/q42.sql +21 -0
  59. databricks_tpcds-0.1.0/src/resources/queries/q43.sql +18 -0
  60. databricks_tpcds-0.1.0/src/resources/queries/q44.sql +34 -0
  61. databricks_tpcds-0.1.0/src/resources/queries/q45.sql +19 -0
  62. databricks_tpcds-0.1.0/src/resources/queries/q46.sql +34 -0
  63. databricks_tpcds-0.1.0/src/resources/queries/q47.sql +50 -0
  64. databricks_tpcds-0.1.0/src/resources/queries/q48.sql +66 -0
  65. databricks_tpcds-0.1.0/src/resources/queries/q49.sql +128 -0
  66. databricks_tpcds-0.1.0/src/resources/queries/q5.sql +127 -0
  67. databricks_tpcds-0.1.0/src/resources/queries/q50.sql +58 -0
  68. databricks_tpcds-0.1.0/src/resources/queries/q51.sql +44 -0
  69. databricks_tpcds-0.1.0/src/resources/queries/q52.sql +21 -0
  70. databricks_tpcds-0.1.0/src/resources/queries/q53.sql +27 -0
  71. databricks_tpcds-0.1.0/src/resources/queries/q54.sql +55 -0
  72. databricks_tpcds-0.1.0/src/resources/queries/q55.sql +13 -0
  73. databricks_tpcds-0.1.0/src/resources/queries/q56.sql +68 -0
  74. databricks_tpcds-0.1.0/src/resources/queries/q57.sql +47 -0
  75. databricks_tpcds-0.1.0/src/resources/queries/q58.sql +64 -0
  76. databricks_tpcds-0.1.0/src/resources/queries/q59.sql +43 -0
  77. databricks_tpcds-0.1.0/src/resources/queries/q6.sql +25 -0
  78. databricks_tpcds-0.1.0/src/resources/queries/q60.sql +77 -0
  79. databricks_tpcds-0.1.0/src/resources/queries/q61.sql +43 -0
  80. databricks_tpcds-0.1.0/src/resources/queries/q62.sql +34 -0
  81. databricks_tpcds-0.1.0/src/resources/queries/q63.sql +28 -0
  82. databricks_tpcds-0.1.0/src/resources/queries/q64.sql +119 -0
  83. databricks_tpcds-0.1.0/src/resources/queries/q65.sql +28 -0
  84. databricks_tpcds-0.1.0/src/resources/queries/q66.sql +219 -0
  85. databricks_tpcds-0.1.0/src/resources/queries/q67.sql +43 -0
  86. databricks_tpcds-0.1.0/src/resources/queries/q68.sql +41 -0
  87. databricks_tpcds-0.1.0/src/resources/queries/q69.sql +46 -0
  88. databricks_tpcds-0.1.0/src/resources/queries/q7.sql +20 -0
  89. databricks_tpcds-0.1.0/src/resources/queries/q70.sql +37 -0
  90. databricks_tpcds-0.1.0/src/resources/queries/q71.sql +39 -0
  91. databricks_tpcds-0.1.0/src/resources/queries/q72.sql +28 -0
  92. databricks_tpcds-0.1.0/src/resources/queries/q73.sql +27 -0
  93. databricks_tpcds-0.1.0/src/resources/queries/q74.sql +60 -0
  94. databricks_tpcds-0.1.0/src/resources/queries/q75.sql +69 -0
  95. databricks_tpcds-0.1.0/src/resources/queries/q76.sql +23 -0
  96. databricks_tpcds-0.1.0/src/resources/queries/q77.sql +107 -0
  97. databricks_tpcds-0.1.0/src/resources/queries/q78.sql +57 -0
  98. databricks_tpcds-0.1.0/src/resources/queries/q79.sql +22 -0
  99. databricks_tpcds-0.1.0/src/resources/queries/q8.sql +107 -0
  100. databricks_tpcds-0.1.0/src/resources/queries/q80.sql +95 -0
  101. databricks_tpcds-0.1.0/src/resources/queries/q81.sql +30 -0
  102. databricks_tpcds-0.1.0/src/resources/queries/q82.sql +16 -0
  103. databricks_tpcds-0.1.0/src/resources/queries/q83.sql +66 -0
  104. databricks_tpcds-0.1.0/src/resources/queries/q84.sql +20 -0
  105. databricks_tpcds-0.1.0/src/resources/queries/q85.sql +83 -0
  106. databricks_tpcds-0.1.0/src/resources/queries/q86.sql +25 -0
  107. databricks_tpcds-0.1.0/src/resources/queries/q87.sql +22 -0
  108. databricks_tpcds-0.1.0/src/resources/queries/q88.sql +93 -0
  109. databricks_tpcds-0.1.0/src/resources/queries/q89.sql +27 -0
  110. databricks_tpcds-0.1.0/src/resources/queries/q9.sql +50 -0
  111. databricks_tpcds-0.1.0/src/resources/queries/q90.sql +21 -0
  112. databricks_tpcds-0.1.0/src/resources/queries/q91.sql +30 -0
  113. databricks_tpcds-0.1.0/src/resources/queries/q92.sql +29 -0
  114. databricks_tpcds-0.1.0/src/resources/queries/q93.sql +17 -0
  115. databricks_tpcds-0.1.0/src/resources/queries/q94.sql +28 -0
  116. databricks_tpcds-0.1.0/src/resources/queries/q95.sql +31 -0
  117. databricks_tpcds-0.1.0/src/resources/queries/q96.sql +15 -0
  118. databricks_tpcds-0.1.0/src/resources/queries/q97.sql +24 -0
  119. databricks_tpcds-0.1.0/src/resources/queries/q98.sql +32 -0
  120. databricks_tpcds-0.1.0/src/resources/queries/q99.sql +34 -0
  121. databricks_tpcds-0.1.0/src/resources/table_names.py +26 -0
@@ -0,0 +1 @@
1
+ recursive-include src/resources/queries *.sql
@@ -0,0 +1,101 @@
1
+ Metadata-Version: 2.4
2
+ Name: databricks-tpcds
3
+ Version: 0.1.0
4
+ Summary: Run the TPC-DS benchmark on Databricks (Delta Lake).
5
+ Home-page: https://github.com/onehouseinc/onebench
6
+ Author: Onehouse
7
+ License: Apache-2.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: Apache Software License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+ Dynamic: author
14
+ Dynamic: classifier
15
+ Dynamic: description
16
+ Dynamic: description-content-type
17
+ Dynamic: home-page
18
+ Dynamic: license
19
+ Dynamic: requires-python
20
+ Dynamic: summary
21
+
22
+ ## Running TPCDS on Databricks
23
+ This document describes how to run TPCDS on Databricks. The TPCDS benchmark is a decision support benchmark that models several generally applicable aspects of a decision support system, including queries and data maintenance. The benchmark provides a representative evaluation of performance as a general purpose decision support system. The benchmark is defined and maintained by the Transaction Processing Performance Council (TPC); the "DS" in its name stands for decision support.
24
+
25
+ ### Pre-requisites
26
+ 1. Databricks workspace
27
+ 2. Databricks metastore configured to workspace
28
+ 3. Databricks cluster (jobs/all purpose etc)
29
+
30
+ ## Install from PyPI
31
+ Install the package directly in a Databricks notebook:
32
+ ```shell
33
+ %pip install databricks-tpcds
34
+ ```
35
+
36
+ The package provides the `DatabricksTPCDS` library. You drive it from an entrypoint script like
37
+ the Delta Lake example below.
38
+
39
+ ## Delta Lake entrypoint example
40
+ Fill in the placeholder `catalog_name`, `bucket_name`, `prefix`, and `schema_name` with your own
41
+ values, then run it on your Databricks cluster.
42
+
43
+ ```python
44
+ from pyspark.sql import SparkSession
45
+ from databricks_tpcds.databricks_tpcds import DatabricksTPCDS
46
+
47
+
48
+ def main():
49
+ catalog_name = 'my_catalog'
50
+ bucket_name = 'my-bucket'
51
+ prefix = 'path/to/tpcds-datasets/1TB'
52
+ schema_name = 'my_schema'
53
+
54
+ # Initialize Spark session
55
+ spark = SparkSession.builder.appName("TPCDS Query Runner").getOrCreate()
56
+
57
+ # Enable/disable cache
58
+ spark.conf.set("spark.databricks.io.cache.enabled", "false")
59
+
60
+ databricks_tpcds = DatabricksTPCDS(spark, schema_name=schema_name, catalog_name=catalog_name)
61
+
62
+ # Create catalog
63
+ databricks_tpcds.create_catalog()
64
+
65
+ # Create schema
66
+ databricks_tpcds.create_schema()
67
+
68
+ # Create a single table, provide the table name
69
+ # databricks_tpcds.create_table(bucket_name, prefix, "call_center")
70
+
71
+ # Create multiple tables, provide the list of table names
72
+ # databricks_tpcds.create_tables(bucket_name, prefix, ["call_center", "catalog_page"])
73
+
74
+ # Create all tables, provide the bucket name and prefix, it'll create all the tables
75
+ databricks_tpcds.create_all_tables(bucket_name, prefix)
76
+
77
+ # Run all queries
78
+ for i in range(3):
79
+ time_taken_by_queries = databricks_tpcds.run_all_queries(should_warmup=False)
80
+ print("QUERY_NUMBER,TIME_TAKEN")
81
+ for query_no, time_taken in time_taken_by_queries.items():
82
+ print(f"{query_no},{time_taken}")
83
+
84
+
85
+ if __name__ == "__main__":
86
+ main()
87
+ ```
88
+
89
+ ## Developing locally
90
+ 1. Modify the code if necessary in `src/databricks_tpcds/databricks_tpcds.py`
91
+ 2. Take a look or modify the queries in `src/resources/queries/`
92
+ 3. Build the package:
93
+ ```shell
94
+ cd tpcds/databricks
95
+ python3.10 -m build
96
+ ```
97
+ 4. Upload the built `.whl` to your Databricks workspace and install it in a notebook:
98
+ ```shell
99
+ %pip install path/to/databricks_tpcds-0.1.0-py3-none-any.whl --force-reinstall
100
+ ```
101
+ 5. Run the benchmark using the Delta Lake entrypoint example above.
@@ -0,0 +1,80 @@
1
+ ## Running TPCDS on Databricks
2
+ This document describes how to run TPCDS on Databricks. The TPCDS benchmark is a decision support benchmark that models several generally applicable aspects of a decision support system, including queries and data maintenance. The benchmark provides a representative evaluation of performance as a general purpose decision support system. The benchmark is defined and maintained by the Transaction Processing Performance Council (TPC); the "DS" in its name stands for decision support.
3
+
4
+ ### Pre-requisites
5
+ 1. Databricks workspace
6
+ 2. Databricks metastore configured to workspace
7
+ 3. Databricks cluster (jobs/all purpose etc)
8
+
9
+ ## Install from PyPI
10
+ Install the package directly in a Databricks notebook:
11
+ ```shell
12
+ %pip install databricks-tpcds
13
+ ```
14
+
15
+ The package provides the `DatabricksTPCDS` library. You drive it from an entrypoint script like
16
+ the Delta Lake example below.
17
+
18
+ ## Delta Lake entrypoint example
19
+ Fill in the placeholder `catalog_name`, `bucket_name`, `prefix`, and `schema_name` with your own
20
+ values, then run it on your Databricks cluster.
21
+
22
+ ```python
23
+ from pyspark.sql import SparkSession
24
+ from databricks_tpcds.databricks_tpcds import DatabricksTPCDS
25
+
26
+
27
+ def main():
28
+ catalog_name = 'my_catalog'
29
+ bucket_name = 'my-bucket'
30
+ prefix = 'path/to/tpcds-datasets/1TB'
31
+ schema_name = 'my_schema'
32
+
33
+ # Initialize Spark session
34
+ spark = SparkSession.builder.appName("TPCDS Query Runner").getOrCreate()
35
+
36
+ # Enable/disable cache
37
+ spark.conf.set("spark.databricks.io.cache.enabled", "false")
38
+
39
+ databricks_tpcds = DatabricksTPCDS(spark, schema_name=schema_name, catalog_name=catalog_name)
40
+
41
+ # Create catalog
42
+ databricks_tpcds.create_catalog()
43
+
44
+ # Create schema
45
+ databricks_tpcds.create_schema()
46
+
47
+ # Create a single table, provide the table name
48
+ # databricks_tpcds.create_table(bucket_name, prefix, "call_center")
49
+
50
+ # Create multiple tables, provide the list of table names
51
+ # databricks_tpcds.create_tables(bucket_name, prefix, ["call_center", "catalog_page"])
52
+
53
+ # Create all tables, provide the bucket name and prefix, it'll create all the tables
54
+ databricks_tpcds.create_all_tables(bucket_name, prefix)
55
+
56
+ # Run all queries
57
+ for i in range(3):
58
+ time_taken_by_queries = databricks_tpcds.run_all_queries(should_warmup=False)
59
+ print("QUERY_NUMBER,TIME_TAKEN")
60
+ for query_no, time_taken in time_taken_by_queries.items():
61
+ print(f"{query_no},{time_taken}")
62
+
63
+
64
+ if __name__ == "__main__":
65
+ main()
66
+ ```
67
+
68
+ ## Developing locally
69
+ 1. Modify the code if necessary in `src/databricks_tpcds/databricks_tpcds.py`
70
+ 2. Take a look or modify the queries in `src/resources/queries/`
71
+ 3. Build the package:
72
+ ```shell
73
+ cd tpcds/databricks
74
+ python3.10 -m build
75
+ ```
76
+ 4. Upload the built `.whl` to your Databricks workspace and install it in a notebook:
77
+ ```shell
78
+ %pip install path/to/databricks_tpcds-0.1.0-py3-none-any.whl --force-reinstall
79
+ ```
80
+ 5. Run the benchmark using the Delta Lake entrypoint example above.
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,31 @@
1
+ import os
2
+
3
+ from setuptools import setup, find_packages
4
+
5
# Locate README.md next to this script so the build does not depend on the
# current working directory.
project_root = os.path.abspath(os.path.dirname(__file__))
readme_path = os.path.join(project_root, "README.md")
with open(readme_path, encoding="utf-8") as readme_file:
    readme_text = readme_file.read()

# Distribution metadata, gathered in one mapping and handed to setuptools.
package_metadata = {
    "name": "databricks-tpcds",
    "version": "0.1.0",
    "description": "Run the TPC-DS benchmark on Databricks (Delta Lake).",
    "long_description": readme_text,
    "long_description_content_type": "text/markdown",
    "author": "Onehouse",
    "url": "https://github.com/onehouseinc/onebench",
    "license": "Apache-2.0",
    "python_requires": ">=3.9",
    # Packages live under src/ (src layout).
    "packages": find_packages(where="src"),
    "package_dir": {"": "src"},
    "install_requires": [],
    "include_package_data": True,
    # Ship the SQL query files alongside the `resources` package.
    "package_data": {"resources": ["queries/*.sql"]},
    "classifiers": [
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
}

setup(**package_metadata)
@@ -0,0 +1,9 @@
1
+ import os
2
+ import sys
3
+
4
+ # Ensure that the package can find resources
5
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
6
+
7
+ # Expose modules so they can be imported directly
8
+ from resources import table_names
9
+ from .databricks_tpcds import DatabricksTPCDS
@@ -0,0 +1,153 @@
1
+ import time
2
+ from resources import table_names
3
+ import importlib.resources as pkg_resources
4
+
5
+
6
class DatabricksTPCDS:
    """
    A class to interact with Databricks for TPC-DS benchmarking.

    Attributes
    ----------
    spark : pyspark.sql.session.SparkSession
        The SparkSession used to interact with Databricks.
    catalog_name : str or None
        The name of the catalog in Databricks. When falsy, the catalog
        statements (USE CATALOG / CREATE CATALOG) are skipped.
    schema_name : str
        The name of the schema that holds the TPC-DS tables.
    time_taken_by_query : dict
        Unrounded time in milliseconds of the latest run of each query,
        keyed by the ``query_num`` value passed to ``run_query``.
    order_by_cols : dict[str, str]
        Column used in the ORDER BY clause of the warm-up scan, per table.

    Methods
    -------
    use_catalog() -> None:
        Switches to the configured catalog, if one was given.
    use_schema() -> None:
        Switches to the configured schema.
    create_catalog() -> None:
        Creates the catalog if it does not exist and one was given.
    create_schema() -> None:
        Creates the schema if it does not exist.
    create_table(bucket_name, prefix, table_name, table_format="delta") -> None:
        Registers one table over data stored under an S3 location.
    create_tables(bucket_name, prefix, table_names, table_format="delta") -> None:
        Registers several tables.
    create_all_tables(bucket_name, prefix, table_format="delta") -> None:
        Registers every table listed in ``table_names.TABLE_NAMES``.
    warm_up() -> None:
        Runs a small ordered scan against every table.
    run_query(query_num) -> int:
        Runs one packaged query; returns rounded milliseconds, or -1 when
        the query file is missing.
    run_queries(query_nums) -> dict:
        Runs several queries and returns their times keyed by query number.
    run_all_queries(should_warmup=False) -> dict:
        Runs queries 1..99 (14, 23, 24 and 39 as their "a"/"b" variants).
    """

    order_by_cols: dict[str, str] = {
        "call_center": "cc_call_center_id",
        "catalog_page": "cp_catalog_page_id",
        "catalog_returns": "cr_returned_date_sk",
        "catalog_sales": "cs_sold_date_sk",
        "customer": "c_customer_id",
        "customer_address": "ca_address_id",
        "customer_demographics": "cd_demo_sk",
        "date_dim": "d_date_id",
        "household_demographics": "hd_demo_sk",
        "income_band": "ib_income_band_sk",
        "inventory": "inv_item_sk",
        "item": "i_item_id",
        "promotion": "p_promo_id",
        "reason": "r_reason_id",
        "ship_mode": "sm_ship_mode_id",
        "store": "s_store_id",
        "store_returns": "sr_returned_date_sk",
        "store_sales": "ss_sold_date_sk",
        "time_dim": "t_time_id",
        "warehouse": "w_warehouse_id",
        "web_page": "wp_web_page_id",
        "web_returns": "wr_returned_date_sk",
        "web_sales": "ws_sold_date_sk",
        "web_site": "web_site_id",
    }

    # Query numbers that run_all_queries executes as "<n>a" and "<n>b"
    # instead of a single "<n>" file.
    _SPLIT_QUERIES = frozenset({14, 23, 24, 39})
    _QUERY_VARIANTS = ("a", "b")

    def __init__(self, spark, schema_name, catalog_name=None):
        """
        Store the session and target names and report the disk-cache setting.

        Parameters
        ----------
        spark : pyspark.sql.session.SparkSession
            Session used for every statement issued by this object.
        schema_name : str
            Schema that holds (or will hold) the TPC-DS tables.
        catalog_name : str, optional
            Catalog to operate in; omit to stay in the session's current one.
        """
        self.spark = spark
        self.catalog_name = catalog_name
        self.schema_name = schema_name
        self.time_taken_by_query = {}
        # The value is only printed for information. Supplying a default is
        # intended to keep construction from failing when the key is unset
        # (e.g. outside a Databricks runtime) -- see RuntimeConfig.get.
        cache_enabled = self.spark.conf.get("spark.databricks.io.cache.enabled", None)
        print(f"Disk cache enabled: {cache_enabled}")

    def use_catalog(self) -> None:
        """Issue USE CATALOG for ``catalog_name``; no-op when it is falsy."""
        if self.catalog_name:
            self.spark.sql(f"USE CATALOG {self.catalog_name}")

    def use_schema(self) -> None:
        """Issue USE SCHEMA for ``schema_name``."""
        self.spark.sql(f"USE SCHEMA {self.schema_name}")

    def create_catalog(self) -> None:
        """Create ``catalog_name`` if it does not exist; no-op when falsy."""
        if self.catalog_name:
            self.spark.sql(f"CREATE CATALOG IF NOT EXISTS {self.catalog_name}")

    def create_schema(self) -> None:
        """Switch to the catalog, then create ``schema_name`` if missing."""
        self.use_catalog()
        self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {self.schema_name}")

    def create_table(self, bucket_name: str, prefix: str, table_name: str, table_format: str = "delta") -> None:
        """
        Register ``table_name`` over ``s3://<bucket_name>/<prefix>/<table_name>``.

        Leading/trailing slashes on ``prefix`` and ``table_name`` are dropped
        and an empty ``prefix`` is skipped, so the location never contains an
        accidental empty path segment ("//").
        """
        path_segments = [segment.strip("/") for segment in (prefix, table_name)]
        object_key = "/".join(segment for segment in path_segments if segment)
        table_path = f"s3://{bucket_name}/{object_key}"
        self.use_catalog()
        self.use_schema()
        self.spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} USING {table_format} LOCATION '{table_path}'")
        print(f"Table {table_name} created successfully using {table_format} format.")

    def create_tables(self, bucket_name: str, prefix: str, table_names: list[str], table_format: str = "delta") -> None:
        """Call ``create_table`` for each name in ``table_names``, in order."""
        # NOTE: the ``table_names`` parameter shadows the imported module of
        # the same name inside this method; the name is kept for callers that
        # pass it by keyword.
        for table in table_names:
            self.create_table(bucket_name, prefix, table, table_format)

    def create_all_tables(self, bucket_name: str, prefix: str, table_format: str = "delta") -> None:
        """Register every table listed in ``table_names.TABLE_NAMES``."""
        self.create_tables(bucket_name, prefix, table_names.TABLE_NAMES, table_format)

    def warm_up(self) -> None:
        """Run ``SELECT * ... ORDER BY <col> LIMIT 100`` against every table."""
        self.use_catalog()
        self.use_schema()
        for table in table_names.TABLE_NAMES:
            order_by_col = self.order_by_cols[table]
            self.spark.sql(f"SELECT * FROM {table} ORDER BY {order_by_col} LIMIT 100").collect()

    def run_query(self, query_num: str) -> int:
        """
        Run the packaged query ``q<query_num>.sql`` and time it.

        Parameters
        ----------
        query_num : str
            Query identifier such as "7" or "14a"; it is only interpolated
            into the file name, so an int works as well.

        Returns
        -------
        int
            Elapsed time in milliseconds, rounded; -1 when the query file is
            not found in the package. The unrounded value is also stored in
            ``time_taken_by_query[query_num]``.
        """
        self.use_catalog()
        self.use_schema()
        query_desc = f"q{query_num}"
        query_filename = f"{query_desc}.sql"

        # Only the resource load sits inside the try, so an error raised
        # while executing the query is never reported as a missing file.
        try:
            query_string = (
                pkg_resources.files("resources")
                .joinpath("queries")
                .joinpath(query_filename)
                .read_text(encoding="utf-8")
            )
        except FileNotFoundError:
            print(f"Query file {query_filename} not found in package.")
            return -1

        print(query_desc)
        self.spark.sparkContext.setJobGroup(query_desc, query_desc, interruptOnCancel=True)
        # perf_counter is monotonic, so clock adjustments cannot skew results.
        start_time = time.perf_counter()
        self.spark.sql(query_string).collect()
        time_taken = (time.perf_counter() - start_time) * 1000
        self.time_taken_by_query[query_num] = time_taken
        return round(time_taken)

    def run_queries(self, query_nums: list[int]) -> dict[int, int]:
        """Run each query in ``query_nums``; map query number -> rounded ms."""
        return {query_num: self.run_query(str(query_num)) for query_num in query_nums}

    def run_all_queries(self, should_warmup=False) -> "dict[int | str, int]":
        """
        Run queries 1 through 99 and collect their rounded times in ms.

        Queries 14, 23, 24 and 39 are executed as their "a" and "b" variants
        and keyed by strings such as "14a"; all other queries are keyed by
        their int number. When ``should_warmup`` is true, ``warm_up`` runs
        first.
        """
        if should_warmup:
            self.warm_up()
        time_taken_by_queries = {}
        for query_num in range(1, 100):
            if query_num in self._SPLIT_QUERIES:
                for variant in self._QUERY_VARIANTS:
                    query_id = f"{query_num}{variant}"
                    time_taken_by_queries[query_id] = self.run_query(query_id)
            else:
                time_taken_by_queries[query_num] = self.run_query(str(query_num))
        return time_taken_by_queries
@@ -0,0 +1,101 @@
1
+ Metadata-Version: 2.4
2
+ Name: databricks-tpcds
3
+ Version: 0.1.0
4
+ Summary: Run the TPC-DS benchmark on Databricks (Delta Lake).
5
+ Home-page: https://github.com/onehouseinc/onebench
6
+ Author: Onehouse
7
+ License: Apache-2.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: Apache Software License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+ Dynamic: author
14
+ Dynamic: classifier
15
+ Dynamic: description
16
+ Dynamic: description-content-type
17
+ Dynamic: home-page
18
+ Dynamic: license
19
+ Dynamic: requires-python
20
+ Dynamic: summary
21
+
22
+ ## Running TPCDS on Databricks
23
+ This document describes how to run TPCDS on Databricks. The TPCDS benchmark is a decision support benchmark that models several generally applicable aspects of a decision support system, including queries and data maintenance. The benchmark provides a representative evaluation of performance as a general purpose decision support system. The benchmark is defined and maintained by the Transaction Processing Performance Council (TPC); the "DS" in its name stands for decision support.
24
+
25
+ ### Pre-requisites
26
+ 1. Databricks workspace
27
+ 2. Databricks metastore configured to workspace
28
+ 3. Databricks cluster (jobs/all purpose etc)
29
+
30
+ ## Install from PyPI
31
+ Install the package directly in a Databricks notebook:
32
+ ```shell
33
+ %pip install databricks-tpcds
34
+ ```
35
+
36
+ The package provides the `DatabricksTPCDS` library. You drive it from an entrypoint script like
37
+ the Delta Lake example below.
38
+
39
+ ## Delta Lake entrypoint example
40
+ Fill in the placeholder `catalog_name`, `bucket_name`, `prefix`, and `schema_name` with your own
41
+ values, then run it on your Databricks cluster.
42
+
43
+ ```python
44
+ from pyspark.sql import SparkSession
45
+ from databricks_tpcds.databricks_tpcds import DatabricksTPCDS
46
+
47
+
48
+ def main():
49
+ catalog_name = 'my_catalog'
50
+ bucket_name = 'my-bucket'
51
+ prefix = 'path/to/tpcds-datasets/1TB'
52
+ schema_name = 'my_schema'
53
+
54
+ # Initialize Spark session
55
+ spark = SparkSession.builder.appName("TPCDS Query Runner").getOrCreate()
56
+
57
+ # Enable/disable cache
58
+ spark.conf.set("spark.databricks.io.cache.enabled", "false")
59
+
60
+ databricks_tpcds = DatabricksTPCDS(spark, schema_name=schema_name, catalog_name=catalog_name)
61
+
62
+ # Create catalog
63
+ databricks_tpcds.create_catalog()
64
+
65
+ # Create schema
66
+ databricks_tpcds.create_schema()
67
+
68
+ # Create a single table, provide the table name
69
+ # databricks_tpcds.create_table(bucket_name, prefix, "call_center")
70
+
71
+ # Create multiple tables, provide the list of table names
72
+ # databricks_tpcds.create_tables(bucket_name, prefix, ["call_center", "catalog_page"])
73
+
74
+ # Create all tables, provide the bucket name and prefix, it'll create all the tables
75
+ databricks_tpcds.create_all_tables(bucket_name, prefix)
76
+
77
+ # Run all queries
78
+ for i in range(3):
79
+ time_taken_by_queries = databricks_tpcds.run_all_queries(should_warmup=False)
80
+ print("QUERY_NUMBER,TIME_TAKEN")
81
+ for query_no, time_taken in time_taken_by_queries.items():
82
+ print(f"{query_no},{time_taken}")
83
+
84
+
85
+ if __name__ == "__main__":
86
+ main()
87
+ ```
88
+
89
+ ## Developing locally
90
+ 1. Modify the code if necessary in `src/databricks_tpcds/databricks_tpcds.py`
91
+ 2. Take a look or modify the queries in `src/resources/queries/`
92
+ 3. Build the package:
93
+ ```shell
94
+ cd tpcds/databricks
95
+ python3.10 -m build
96
+ ```
97
+ 4. Upload the built `.whl` to your Databricks workspace and install it in a notebook:
98
+ ```shell
99
+ %pip install path/to/databricks_tpcds-0.1.0-py3-none-any.whl --force-reinstall
100
+ ```
101
+ 5. Run the benchmark using the Delta Lake entrypoint example above.
@@ -0,0 +1,119 @@
1
+ MANIFEST.in
2
+ README.md
3
+ setup.py
4
+ src/databricks_tpcds/__init__.py
5
+ src/databricks_tpcds/databricks_tpcds.py
6
+ src/databricks_tpcds.egg-info/PKG-INFO
7
+ src/databricks_tpcds.egg-info/SOURCES.txt
8
+ src/databricks_tpcds.egg-info/dependency_links.txt
9
+ src/databricks_tpcds.egg-info/top_level.txt
10
+ src/resources/__init__.py
11
+ src/resources/table_names.py
12
+ src/resources/queries/q0.sql
13
+ src/resources/queries/q1.sql
14
+ src/resources/queries/q10.sql
15
+ src/resources/queries/q11.sql
16
+ src/resources/queries/q12.sql
17
+ src/resources/queries/q13.sql
18
+ src/resources/queries/q14.sql
19
+ src/resources/queries/q14a.sql
20
+ src/resources/queries/q14b.sql
21
+ src/resources/queries/q15.sql
22
+ src/resources/queries/q16.sql
23
+ src/resources/queries/q17.sql
24
+ src/resources/queries/q18.sql
25
+ src/resources/queries/q19.sql
26
+ src/resources/queries/q2.sql
27
+ src/resources/queries/q20.sql
28
+ src/resources/queries/q21.sql
29
+ src/resources/queries/q22.sql
30
+ src/resources/queries/q23.sql
31
+ src/resources/queries/q23a.sql
32
+ src/resources/queries/q23b.sql
33
+ src/resources/queries/q24.sql
34
+ src/resources/queries/q24a.sql
35
+ src/resources/queries/q24b.sql
36
+ src/resources/queries/q25.sql
37
+ src/resources/queries/q26.sql
38
+ src/resources/queries/q27.sql
39
+ src/resources/queries/q28.sql
40
+ src/resources/queries/q29.sql
41
+ src/resources/queries/q3.sql
42
+ src/resources/queries/q30.sql
43
+ src/resources/queries/q31.sql
44
+ src/resources/queries/q32.sql
45
+ src/resources/queries/q33.sql
46
+ src/resources/queries/q34.sql
47
+ src/resources/queries/q35.sql
48
+ src/resources/queries/q36.sql
49
+ src/resources/queries/q37.sql
50
+ src/resources/queries/q38.sql
51
+ src/resources/queries/q39.sql
52
+ src/resources/queries/q39a.sql
53
+ src/resources/queries/q39b.sql
54
+ src/resources/queries/q4.sql
55
+ src/resources/queries/q40.sql
56
+ src/resources/queries/q41.sql
57
+ src/resources/queries/q42.sql
58
+ src/resources/queries/q43.sql
59
+ src/resources/queries/q44.sql
60
+ src/resources/queries/q45.sql
61
+ src/resources/queries/q46.sql
62
+ src/resources/queries/q47.sql
63
+ src/resources/queries/q48.sql
64
+ src/resources/queries/q49.sql
65
+ src/resources/queries/q5.sql
66
+ src/resources/queries/q50.sql
67
+ src/resources/queries/q51.sql
68
+ src/resources/queries/q52.sql
69
+ src/resources/queries/q53.sql
70
+ src/resources/queries/q54.sql
71
+ src/resources/queries/q55.sql
72
+ src/resources/queries/q56.sql
73
+ src/resources/queries/q57.sql
74
+ src/resources/queries/q58.sql
75
+ src/resources/queries/q59.sql
76
+ src/resources/queries/q6.sql
77
+ src/resources/queries/q60.sql
78
+ src/resources/queries/q61.sql
79
+ src/resources/queries/q62.sql
80
+ src/resources/queries/q63.sql
81
+ src/resources/queries/q64.sql
82
+ src/resources/queries/q65.sql
83
+ src/resources/queries/q66.sql
84
+ src/resources/queries/q67.sql
85
+ src/resources/queries/q68.sql
86
+ src/resources/queries/q69.sql
87
+ src/resources/queries/q7.sql
88
+ src/resources/queries/q70.sql
89
+ src/resources/queries/q71.sql
90
+ src/resources/queries/q72.sql
91
+ src/resources/queries/q73.sql
92
+ src/resources/queries/q74.sql
93
+ src/resources/queries/q75.sql
94
+ src/resources/queries/q76.sql
95
+ src/resources/queries/q77.sql
96
+ src/resources/queries/q78.sql
97
+ src/resources/queries/q79.sql
98
+ src/resources/queries/q8.sql
99
+ src/resources/queries/q80.sql
100
+ src/resources/queries/q81.sql
101
+ src/resources/queries/q82.sql
102
+ src/resources/queries/q83.sql
103
+ src/resources/queries/q84.sql
104
+ src/resources/queries/q85.sql
105
+ src/resources/queries/q86.sql
106
+ src/resources/queries/q87.sql
107
+ src/resources/queries/q88.sql
108
+ src/resources/queries/q89.sql
109
+ src/resources/queries/q9.sql
110
+ src/resources/queries/q90.sql
111
+ src/resources/queries/q91.sql
112
+ src/resources/queries/q92.sql
113
+ src/resources/queries/q93.sql
114
+ src/resources/queries/q94.sql
115
+ src/resources/queries/q95.sql
116
+ src/resources/queries/q96.sql
117
+ src/resources/queries/q97.sql
118
+ src/resources/queries/q98.sql
119
+ src/resources/queries/q99.sql
@@ -0,0 +1,2 @@
1
+ databricks_tpcds
2
+ resources
File without changes
@@ -0,0 +1,2 @@
1
+ --TPC-DS Q0: reads 10 rows from call_center; outside the 1..99 range that run_all_queries executes
2
+ select * from call_center limit 10;
@@ -0,0 +1,24 @@
1
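+ --TPC-DS Q1: customers whose total 1999 store returns exceed 1.2x the average total for the same store, for stores in TN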
2
+ with customer_total_return as
3
+ (select sr_customer_sk as ctr_customer_sk
4
+ ,sr_store_sk as ctr_store_sk
5
+ ,sum(SR_RETURN_AMT_INC_TAX) as ctr_total_return
6
+ from store_returns
7
+ ,date_dim
8
+ where sr_returned_date_sk = d_date_sk
9
+ and d_year =1999
10
+ group by sr_customer_sk
11
+ ,sr_store_sk)
12
+ select c_customer_id
13
+ from customer_total_return ctr1
14
+ ,store
15
+ ,customer
16
+ where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2
17
+ from customer_total_return ctr2
18
+ where ctr1.ctr_store_sk = ctr2.ctr_store_sk)
19
+ and s_store_sk = ctr1.ctr_store_sk
20
+ and s_state = 'TN'
21
+ and ctr1.ctr_customer_sk = c_customer_sk
22
+ order by c_customer_id
23
+ limit 100;
24
+