pydataframer-databricks 0.1.0 (py3-none-any wheel)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydataframer_databricks/connectors.py
@@ -0,0 +1,251 @@
from enum import Enum


class DatasetType(Enum):
    """Dataset type enumeration matching Dataframer backend."""
    SINGLE_FILE = "SINGLE_FILE"
    MULTI_FILE = "MULTI_FILE"
    MULTI_FOLDER = "MULTI_FOLDER"


class FileType(Enum):
    """File type enumeration matching Dataframer backend."""
    MD = "md"
    TXT = "txt"
    CSV = "csv"
    PDF = "pdf"
    JSON = "json"
    JSONL = "jsonl"


class DatabricksConnector:
    """
    Databricks connector for Dataframer workflows.

    This class provides methods to interact with Databricks SQL, fetch sample data,
    and load generated data into Databricks tables.

    Parameters
    ----------
    dbutils : DBUtils
        The dbutils object from your Databricks notebook context.
        This is automatically available in Databricks notebooks.

    Examples
    --------
    >>> databricks_connector = DatabricksConnector(dbutils)
    >>> df = databricks_connector.fetch_sample_data(
    ...     num_items_to_select=25,
    ...     table_name="samples.bakehouse.media_customer_reviews"
    ... )
    >>> df.head()
    """

    def __init__(self, dbutils):
        """
        Initialize the Databricks connector.

        Parameters
        ----------
        dbutils : DBUtils
            The dbutils object from your Databricks notebook context.
        """
        self.dbutils = dbutils

    def get_connection(self):
        """
        Return an authenticated Databricks SQL connection.

        Returns
        -------
        Connection
            A Databricks SQL connection object.
        """
        from databricks import sql
        from databricks.sdk.core import Config, oauth_service_principal

        server_hostname = self.dbutils.secrets.get("dataframer", "DATABRICKS_SERVER_HOSTNAME")
        http_path = self.dbutils.secrets.get("dataframer", "DATABRICKS_HTTP_PATH")

        def credential_provider():
            config = Config(
                host=f"https://{server_hostname}",
                client_id=self.dbutils.secrets.get("dataframer", "DATABRICKS_CLIENT_ID"),
                client_secret=self.dbutils.secrets.get("dataframer", "DATABRICKS_CLIENT_SECRET"),
            )
            return oauth_service_principal(config)

        return sql.connect(
            server_hostname=server_hostname,
            http_path=http_path,
            credentials_provider=credential_provider,
            user_agent_entry="dataframer_user_agent",
        )

    def fetch_sample_data(self, num_items_to_select, table_name):
        """
        Fetch sample data from a Databricks table and return it as a Pandas DataFrame.

        Parameters
        ----------
        num_items_to_select : int
            Number of rows to fetch from the table.
        table_name : str
            Fully qualified table name (e.g., "catalog.schema.table").

        Returns
        -------
        pd.DataFrame
            A Pandas DataFrame containing the sample data.

        Examples
        --------
        >>> databricks_connector = DatabricksConnector(dbutils)
        >>> df = databricks_connector.fetch_sample_data(
        ...     num_items_to_select=25,
        ...     table_name="samples.bakehouse.media_customer_reviews"
        ... )
        >>> df.head()
        """
        import pandas as pd

        query = f"""
            SELECT *
            FROM {table_name}
            LIMIT {num_items_to_select}
        """

        try:
            with self.get_connection() as connection:
                with connection.cursor() as cursor:
                    cursor.execute(query)
                    rows = cursor.fetchall()
                    columns = [desc[0] for desc in cursor.description]
        except Exception as e:
            error_msg = f"Failed to fetch data from table `{table_name}`"
            print(f"{error_msg}: {str(e)}")
            print("Verify table exists, is accessible, and you have SELECT permissions")
            raise RuntimeError(f"{error_msg}: {str(e)}") from e

        return pd.DataFrame(rows, columns=columns)

    def load_generated_data(self, table_name, downloaded_zip, dataset_type, file_type):
        """
        Load generated samples from a ZIP file into a Databricks table using Databricks SQL.

        Parameters
        ----------
        table_name : str
            Target table name (e.g., "catalog.schema.table")
        downloaded_zip : file-like
            ZIP file object containing the generated data file
        dataset_type : DatasetType
            Type of dataset structure (DatasetType.SINGLE_FILE, DatasetType.MULTI_FILE, or DatasetType.MULTI_FOLDER)
        file_type : FileType
            Type of file in the ZIP (FileType.CSV, FileType.JSON, FileType.JSONL, etc.)

        Examples
        --------
        >>> databricks_connector = DatabricksConnector(dbutils)
        >>> with open("samples.zip", "rb") as f:
        ...     databricks_connector.load_generated_data(
        ...         table_name="my_catalog.my_schema.my_table",
        ...         downloaded_zip=f,
        ...         dataset_type=DatasetType.SINGLE_FILE,
        ...         file_type=FileType.CSV
        ...     )
        """
        import zipfile
        import pandas as pd
        from io import BytesIO

        zip_buffer = BytesIO(downloaded_zip.read())

        if dataset_type == DatasetType.SINGLE_FILE:
            try:
                with zipfile.ZipFile(zip_buffer) as z:
                    file_list = z.namelist()

                    generated_data_files = [f for f in file_list if f.lower().endswith(f'.{file_type.value}')]

                    if len(generated_data_files) != 1:
                        error_msg = f"Expected exactly one .{file_type.value} file in ZIP"
                        print(f"{error_msg}. Available files: {file_list}")
                        raise ValueError(error_msg)

                    data_filename = generated_data_files[0]
                    data_bytes = z.read(data_filename)
                    print(f"Found {file_type.value} file: {data_filename}")

            except zipfile.BadZipFile as e:
                error_msg = "Invalid or corrupted ZIP file"
                print(f"{error_msg}: {str(e)}")
                raise ValueError(f"{error_msg}: {str(e)}") from e
            except ValueError:
                raise
            except Exception as e:
                error_msg = "Failed to extract file from ZIP"
                print(f"{error_msg}: {str(e)}")
                raise RuntimeError(f"{error_msg}: {str(e)}") from e

            if file_type == FileType.CSV:
                pandas_df = pd.read_csv(BytesIO(data_bytes))
            elif file_type == FileType.JSON:
                # TODO: Implement JSON file handling
                pass
            elif file_type == FileType.JSONL:
                # TODO: Implement JSONL file handling
                pass
            else:
                raise ValueError(f"Unsupported file_type: {file_type}. Supported: CSV, JSON, JSONL for SINGLE_FILE datasets")

            with self.get_connection() as connection:
                cursor = connection.cursor()

                columns_sql = ", ".join(
                    f"`{col}` STRING" for col in pandas_df.columns
                )

                try:
                    cursor.execute(f"""
                        CREATE OR REPLACE TABLE {table_name} (
                            {columns_sql}
                        )
                    """)
                except Exception as e:
                    error_msg = f"Failed to create table `{table_name}`"
                    print(f"{error_msg}: {str(e)}")
                    print("Verify table name format (catalog.schema.table), permissions, and warehouse is running")
                    cursor.close()
                    raise RuntimeError(f"{error_msg}: {str(e)}") from e

                insert_sql = f"""
                    INSERT INTO {table_name}
                    VALUES ({", ".join(["?"] * len(pandas_df.columns))})
                """

                try:
                    cursor.executemany(
                        insert_sql,
                        pandas_df.values.tolist()
                    )
                except Exception as e:
                    error_msg = f"Failed to insert data into table `{table_name}`"
                    print(f"{error_msg}: {str(e)} | Rows attempted: {len(pandas_df)}")
                    cursor.close()
                    raise RuntimeError(f"{error_msg}: {str(e)}") from e

                cursor.close()

                print(f"✅ Table `{table_name}` saved successfully using Databricks SQL")

        elif dataset_type == DatasetType.MULTI_FILE:
            # TODO: Implement MULTI_FILE handling
            pass

        elif dataset_type == DatasetType.MULTI_FOLDER:
            # TODO: Implement MULTI_FOLDER handling
            pass

        else:
            raise ValueError(f"Invalid dataset_type: {dataset_type}. Expected DatasetType enum")
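Note that `get_connection` resolves the SQL warehouse hostname, HTTP path, and OAuth service-principal credentials from a Databricks secret scope named `dataframer`. The sketch below shows one way that scope could be provisioned with the Databricks Python SDK; the scope and key names come from the code above, while the SDK calls and placeholder values are assumptions and not part of this package.

```python
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()  # assumes workspace authentication is already configured

# Scope and key names match what DatabricksConnector.get_connection reads.
w.secrets.create_scope(scope="dataframer")
for key, value in {
    "DATABRICKS_SERVER_HOSTNAME": "dbc-xxxxxxxx.cloud.databricks.com",  # placeholder
    "DATABRICKS_HTTP_PATH": "/sql/1.0/warehouses/xxxxxxxx",             # placeholder
    "DATABRICKS_CLIENT_ID": "<service-principal-client-id>",            # placeholder
    "DATABRICKS_CLIENT_SECRET": "<service-principal-secret>",           # placeholder
}.items():
    w.secrets.put_secret(scope="dataframer", key=key, string_value=value)
```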
pydataframer_databricks-0.1.0.dist-info/METADATA
@@ -0,0 +1,43 @@
Metadata-Version: 2.4
Name: pydataframer-databricks
Version: 0.1.0
Summary: Databricks connector for Dataframer
Author-email: Dataframer <info@dataframer.ai>
License: MIT
Requires-Python: >=3.9
Requires-Dist: databricks-sdk>=0.81.0
Requires-Dist: databricks-sql-connector>=4.2.4
Requires-Dist: pandas>=2.0.0
Provides-Extra: dev
Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
Requires-Dist: pytest>=7.4.0; extra == 'dev'
Description-Content-Type: text/markdown

# pydataframer-databricks

Databricks connector package for Dataframer, providing seamless integration with Databricks SQL and data operations.

## Installation

```bash
pip install pydataframer-databricks
```
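
Once installed, a typical notebook workflow looks roughly like the sketch below. It mirrors the docstring examples in `connectors.py`; the explicit `pydataframer_databricks.connectors` import path is taken from the wheel's RECORD (the top-level package may also re-export these names), and the ZIP path and table names are placeholders.

```python
from pydataframer_databricks.connectors import (
    DatabricksConnector,
    DatasetType,
    FileType,
)

# `dbutils` is available automatically inside a Databricks notebook.
connector = DatabricksConnector(dbutils)

# Pull a small sample of an existing table into a pandas DataFrame.
df = connector.fetch_sample_data(
    num_items_to_select=25,
    table_name="samples.bakehouse.media_customer_reviews",
)

# Load generated samples (a ZIP containing a single CSV) into a table.
with open("samples.zip", "rb") as f:
    connector.load_generated_data(
        table_name="my_catalog.my_schema.my_table",
        downloaded_zip=f,
        dataset_type=DatasetType.SINGLE_FILE,
        file_type=FileType.CSV,
    )
```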

## Building

Requires [uv](https://docs.astral.sh/uv/) to be installed in your environment.

```bash
uv build
```

## Development

```bash
# Install with dev dependencies
uv pip install -e ".[dev]"

# Run tests
pytest
```
pydataframer_databricks-0.1.0.dist-info/RECORD
@@ -0,0 +1,5 @@
pydataframer_databricks/__init__.py,sha256=piRrFtKpGGc2ctFDnMNblp5Whp6froRKXNeYkHnrw_o,214
pydataframer_databricks/connectors.py,sha256=E4RlU30ADp0V27tuHWOai-7CM1YvmTInS_YonUpWMds,9191
pydataframer_databricks-0.1.0.dist-info/METADATA,sha256=L-5a9ThsJYq_CTeAI22Zlo269NoXH3jRtyFOW_dPxAQ,891
pydataframer_databricks-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
pydataframer_databricks-0.1.0.dist-info/RECORD,,