pydataframer_databricks-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- /dev/null
+++ pydataframer_databricks/__init__.py
@@ -0,0 +1,11 @@
+"""
+pydataframer-databricks: Databricks connector for Dataframer
+"""
+
+from .connectors import DatabricksConnector, FileType, DatasetType
+
+__all__ = [
+    "FileType",
+    "DatasetType",
+    "DatabricksConnector",
+]
--- /dev/null
+++ pydataframer_databricks/connectors.py
@@ -0,0 +1,251 @@
+from enum import Enum
+
+
+class DatasetType(Enum):
+    """Dataset type enumeration matching the Dataframer backend."""
+    SINGLE_FILE = "SINGLE_FILE"
+    MULTI_FILE = "MULTI_FILE"
+    MULTI_FOLDER = "MULTI_FOLDER"
+
+
+class FileType(Enum):
+    """File type enumeration matching the Dataframer backend."""
+    MD = "md"
+    TXT = "txt"
+    CSV = "csv"
+    PDF = "pdf"
+    JSON = "json"
+    JSONL = "jsonl"
+
+
+class DatabricksConnector:
+    """
+    Databricks connector for Dataframer workflows.
+
+    This class provides methods to interact with Databricks SQL, fetch sample data,
+    and load generated data into Databricks tables.
+
+    Parameters
+    ----------
+    dbutils : DBUtils
+        The dbutils object from your Databricks notebook context.
+        This is automatically available in Databricks notebooks.
+
+    Examples
+    --------
+    >>> databricks_connector = DatabricksConnector(dbutils)
+    >>> df = databricks_connector.fetch_sample_data(
+    ...     num_items_to_select=25,
+    ...     table_name="samples.bakehouse.media_customer_reviews"
+    ... )
+    >>> df.head()
+    """
+
+    def __init__(self, dbutils):
+        """
+        Initialize the Databricks connector.
+
+        Parameters
+        ----------
+        dbutils : DBUtils
+            The dbutils object from your Databricks notebook context.
+        """
+        self.dbutils = dbutils
+
+    def get_connection(self):
+        """
+        Return an authenticated Databricks SQL connection.
+
+        Returns
+        -------
+        Connection
+            A Databricks SQL connection object.
+        """
+        from databricks import sql
+        from databricks.sdk.core import Config, oauth_service_principal
+
+        # Connection settings and service-principal credentials are read
+        # from the "dataframer" secret scope, never from notebook code.
+        server_hostname = self.dbutils.secrets.get("dataframer", "DATABRICKS_SERVER_HOSTNAME")
+        http_path = self.dbutils.secrets.get("dataframer", "DATABRICKS_HTTP_PATH")
+
+        def credential_provider():
+            config = Config(
+                host=f"https://{server_hostname}",
+                client_id=self.dbutils.secrets.get("dataframer", "DATABRICKS_CLIENT_ID"),
+                client_secret=self.dbutils.secrets.get("dataframer", "DATABRICKS_CLIENT_SECRET"),
+            )
+            return oauth_service_principal(config)
+
+        return sql.connect(
+            server_hostname=server_hostname,
+            http_path=http_path,
+            credentials_provider=credential_provider,
+            user_agent_entry="dataframer_user_agent",
+        )
+
+    def fetch_sample_data(self, num_items_to_select, table_name):
+        """
+        Fetch sample data from a Databricks table and return it as a Pandas DataFrame.
+
+        Parameters
+        ----------
+        num_items_to_select : int
+            Number of rows to fetch from the table.
+        table_name : str
+            Fully qualified table name (e.g., "catalog.schema.table").
+
+        Returns
+        -------
+        pd.DataFrame
+            A Pandas DataFrame containing the sample data.
+
+        Examples
+        --------
+        >>> databricks_connector = DatabricksConnector(dbutils)
+        >>> df = databricks_connector.fetch_sample_data(
+        ...     num_items_to_select=25,
+        ...     table_name="samples.bakehouse.media_customer_reviews"
+        ... )
+        >>> df.head()
+        """
+        import pandas as pd
+
+        query = f"""
+            SELECT *
+            FROM {table_name}
+            LIMIT {num_items_to_select}
+        """
+
+        try:
+            with self.get_connection() as connection:
+                with connection.cursor() as cursor:
+                    cursor.execute(query)
+                    rows = cursor.fetchall()
+                    columns = [desc[0] for desc in cursor.description]
+        except Exception as e:
+            error_msg = f"Failed to fetch data from table `{table_name}`"
+            print(f"{error_msg}: {str(e)}")
+            print("Verify table exists, is accessible, and you have SELECT permissions")
+            raise RuntimeError(f"{error_msg}: {str(e)}") from e
+
+        return pd.DataFrame(rows, columns=columns)
+
+    def load_generated_data(self, table_name, downloaded_zip, dataset_type, file_type):
+        """
+        Load generated samples from a ZIP file into a Databricks table using Databricks SQL.
+
+        Parameters
+        ----------
+        table_name : str
+            Target table name (e.g., "catalog.schema.table").
+        downloaded_zip : file-like
+            ZIP file object containing the generated data file.
+        dataset_type : DatasetType
+            Type of dataset structure (DatasetType.SINGLE_FILE, DatasetType.MULTI_FILE,
+            or DatasetType.MULTI_FOLDER).
+        file_type : FileType
+            Type of file in the ZIP (FileType.CSV, FileType.JSON, FileType.JSONL, etc.).
+
+        Examples
+        --------
+        >>> databricks_connector = DatabricksConnector(dbutils)
+        >>> with open("samples.zip", "rb") as f:
+        ...     databricks_connector.load_generated_data(
+        ...         table_name="my_catalog.my_schema.my_table",
+        ...         downloaded_zip=f,
+        ...         dataset_type=DatasetType.SINGLE_FILE,
+        ...         file_type=FileType.CSV
+        ...     )
+        """
+        import zipfile
+        import pandas as pd
+        from io import BytesIO
+
+        zip_buffer = BytesIO(downloaded_zip.read())
+
+        if dataset_type == DatasetType.SINGLE_FILE:
+            try:
+                with zipfile.ZipFile(zip_buffer) as z:
+                    file_list = z.namelist()
+
+                    generated_data_files = [f for f in file_list if f.lower().endswith(f'.{file_type.value}')]
+
+                    if len(generated_data_files) != 1:
+                        error_msg = f"Expected exactly one .{file_type.value} file in ZIP"
+                        print(f"{error_msg}. Available files: {file_list}")
+                        raise ValueError(error_msg)
+
+                    data_filename = generated_data_files[0]
+                    data_bytes = z.read(data_filename)
+                    print(f"Found {file_type.value} file: {data_filename}")
+
+            except zipfile.BadZipFile as e:
+                error_msg = "Invalid or corrupted ZIP file"
+                print(f"{error_msg}: {str(e)}")
+                raise ValueError(f"{error_msg}: {str(e)}") from e
+            except ValueError:
+                raise
+            except Exception as e:
+                error_msg = "Failed to extract file from ZIP"
+                print(f"{error_msg}: {str(e)}")
+                raise RuntimeError(f"{error_msg}: {str(e)}") from e
+
+            if file_type == FileType.CSV:
+                pandas_df = pd.read_csv(BytesIO(data_bytes))
+            elif file_type == FileType.JSON:
+                # TODO: Implement JSON file handling
+                raise NotImplementedError("JSON file handling is not implemented yet")
+            elif file_type == FileType.JSONL:
+                # TODO: Implement JSONL file handling
+                raise NotImplementedError("JSONL file handling is not implemented yet")
+            else:
+                raise ValueError(f"Unsupported file_type: {file_type}. Supported: CSV, JSON, JSONL for SINGLE_FILE datasets")
+
+            with self.get_connection() as connection:
+                with connection.cursor() as cursor:
+                    columns_sql = ", ".join(
+                        f"`{col}` STRING" for col in pandas_df.columns
+                    )
+
+                    try:
+                        cursor.execute(f"""
+                            CREATE OR REPLACE TABLE {table_name} (
+                                {columns_sql}
+                            )
+                        """)
+                    except Exception as e:
+                        error_msg = f"Failed to create table `{table_name}`"
+                        print(f"{error_msg}: {str(e)}")
+                        print("Verify table name format (catalog.schema.table), permissions, and warehouse is running")
+                        raise RuntimeError(f"{error_msg}: {str(e)}") from e
+
+                    insert_sql = f"""
+                        INSERT INTO {table_name}
+                        VALUES ({", ".join(["?"] * len(pandas_df.columns))})
+                    """
+
+                    try:
+                        cursor.executemany(
+                            insert_sql,
+                            pandas_df.values.tolist()
+                        )
+                    except Exception as e:
+                        error_msg = f"Failed to insert data into table `{table_name}`"
+                        print(f"{error_msg}: {str(e)} | Rows attempted: {len(pandas_df)}")
+                        raise RuntimeError(f"{error_msg}: {str(e)}") from e
+
+            print(f"✅ Table `{table_name}` saved successfully using Databricks SQL")
+
+        elif dataset_type == DatasetType.MULTI_FILE:
+            # TODO: Implement MULTI_FILE handling
+            raise NotImplementedError("MULTI_FILE dataset handling is not implemented yet")
+
+        elif dataset_type == DatasetType.MULTI_FOLDER:
+            # TODO: Implement MULTI_FOLDER handling
+            raise NotImplementedError("MULTI_FOLDER dataset handling is not implemented yet")
+
+        else:
+            raise ValueError(f"Invalid dataset_type: {dataset_type}. Expected a DatasetType enum value")
--- /dev/null
+++ pydataframer_databricks-0.1.0.dist-info/METADATA
@@ -0,0 +1,43 @@
+Metadata-Version: 2.4
+Name: pydataframer-databricks
+Version: 0.1.0
+Summary: Databricks connector for Dataframer
+Author-email: Dataframer <info@dataframer.ai>
+License: MIT
+Requires-Python: >=3.9
+Requires-Dist: databricks-sdk>=0.81.0
+Requires-Dist: databricks-sql-connector>=4.2.4
+Requires-Dist: pandas>=2.0.0
+Provides-Extra: dev
+Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
+Requires-Dist: pytest>=7.4.0; extra == 'dev'
+Description-Content-Type: text/markdown
+
+# pydataframer-databricks
+
+Databricks connector package for Dataframer. It integrates with Databricks SQL to fetch sample data from tables and to load generated datasets back into Databricks tables, as shown in the usage example below.
+
+## Installation
+
+```bash
+pip install pydataframer-databricks
+```
+
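+## Usage
+
+A minimal sketch mirroring the connector's docstrings. It assumes you are inside a Databricks notebook (where `dbutils` is available) and that the `dataframer` secret scope used by `get_connection` is configured; the table names are placeholders.
+
+```python
+from pydataframer_databricks import DatabricksConnector, DatasetType, FileType
+
+# dbutils is provided automatically in a Databricks notebook
+connector = DatabricksConnector(dbutils)
+
+# Fetch 25 sample rows into a pandas DataFrame
+df = connector.fetch_sample_data(
+    num_items_to_select=25,
+    table_name="samples.bakehouse.media_customer_reviews",
+)
+
+# Load generated CSV samples from a ZIP into a target table
+with open("samples.zip", "rb") as f:
+    connector.load_generated_data(
+        table_name="my_catalog.my_schema.my_table",
+        downloaded_zip=f,
+        dataset_type=DatasetType.SINGLE_FILE,
+        file_type=FileType.CSV,
+    )
+```
+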
+## Building
+
+Requires [uv](https://docs.astral.sh/uv/) installed in your environment.
+
+```bash
+uv build
+```
+
+## Development
+
+```bash
+# Install with dev dependencies
+uv pip install -e ".[dev]"
+
+# Run tests
+pytest
+```
+
--- /dev/null
+++ pydataframer_databricks-0.1.0.dist-info/RECORD
@@ -0,0 +1,5 @@
+pydataframer_databricks/__init__.py,sha256=piRrFtKpGGc2ctFDnMNblp5Whp6froRKXNeYkHnrw_o,214
+pydataframer_databricks/connectors.py,sha256=E4RlU30ADp0V27tuHWOai-7CM1YvmTInS_YonUpWMds,9191
+pydataframer_databricks-0.1.0.dist-info/METADATA,sha256=L-5a9ThsJYq_CTeAI22Zlo269NoXH3jRtyFOW_dPxAQ,891
+pydataframer_databricks-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+pydataframer_databricks-0.1.0.dist-info/RECORD,,
--- /dev/null
+++ pydataframer_databricks-0.1.0.dist-info/WHEEL
@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.28.0
+Root-Is-Purelib: true
+Tag: py3-none-any