ml-analytics-tools 0.4.0__tar.gz → 0.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ml_analytics_tools-0.4.0/ml_analytics_tools.egg-info → ml_analytics_tools-0.4.2}/PKG-INFO +19 -12
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/README.md +18 -11
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/sf_connector.py +80 -20
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2/ml_analytics_tools.egg-info}/PKG-INFO +19 -12
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/pyproject.toml +1 -1
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_gsheet_connector.py +1 -1
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_sf_connector.py +70 -17
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/LICENSE +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/__init__.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/aws_auth.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/cli.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/data_connector.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/gsheet_connector.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/model_manager.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/model_tools.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/s3_connector.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/slack_connector.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/tunnel_manager.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/utils.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics_tools.egg-info/SOURCES.txt +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics_tools.egg-info/dependency_links.txt +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics_tools.egg-info/entry_points.txt +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics_tools.egg-info/requires.txt +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics_tools.egg-info/top_level.txt +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/setup.cfg +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_aws_auth.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_db_s3.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_identity_column.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_model_manager.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_model_tools.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_s3_redshift_validation.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_tunnel_manager.py +0 -0
- {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ml-analytics-tools
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: Tools for ML projects and data management
|
|
5
5
|
Requires-Python: >=3.11
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -51,7 +51,7 @@ arguments.
|
|
|
51
51
|
## What Is Included
|
|
52
52
|
|
|
53
53
|
- `DataConnector`: run Redshift or Snowflake SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
|
|
54
|
-
- `SFConnector`: read
|
|
54
|
+
- `SFConnector`: read Snowflake through Spark and save results to Unity Catalog tables (Databricks). PySpark is imported lazily, so the rest of the package works without it.
|
|
55
55
|
- `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
|
|
56
56
|
- `GSheet`: read, write, share, and export Google Sheets data.
|
|
57
57
|
- `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
|
|
@@ -168,7 +168,8 @@ df = dc.sql("SELECT 1 AS col_1")
|
|
|
168
168
|
|
|
169
169
|
For local interactive work, `SNOWFLAKE_AUTHENTICATOR=externalbrowser` is supported.
|
|
170
170
|
SSO tokens are cached in the OS keychain, so the browser login only happens once
|
|
171
|
-
per token lifetime.
|
|
171
|
+
per token lifetime. (Note: `externalbrowser` works with `DataConnector` only;
|
|
172
|
+
`SFConnector` rejects it, since Spark jobs block on the interactive browser SSO.)
|
|
172
173
|
For Databricks and Spark jobs, use key-pair auth instead. The connector reads
|
|
173
174
|
default Databricks personal-scope secrets automatically:
|
|
174
175
|
|
|
@@ -195,9 +196,10 @@ df = (
|
|
|
195
196
|
|
|
196
197
|
### Query Snowflake With Spark (`SFConnector`)
|
|
197
198
|
|
|
198
|
-
On Databricks, `SFConnector` reads
|
|
199
|
-
|
|
200
|
-
`DataConnector`, and only imports PySpark when a
|
|
199
|
+
On Databricks, `SFConnector` reads Snowflake directly as Spark DataFrames and can
|
|
200
|
+
persist results into Unity Catalog tables. It reuses the same `SNOWFLAKE_*`
|
|
201
|
+
settings and key-pair secrets as `DataConnector`, and only imports PySpark when a
|
|
202
|
+
query/write method runs.
|
|
201
203
|
|
|
202
204
|
```python
|
|
203
205
|
from ml_analytics import SFConnector
|
|
@@ -210,8 +212,14 @@ df = sf.sql("SELECT * FROM cds.dim_tutor LIMIT 1000")
|
|
|
210
212
|
# pandas DataFrame
|
|
211
213
|
pdf = sf.sql("SELECT 1 AS col_1", return_pandas=True)
|
|
212
214
|
|
|
213
|
-
#
|
|
214
|
-
sf.
|
|
215
|
+
# run a query from a .sql file (relative to project root), with templating
|
|
216
|
+
df = sf.sql("queries/experiment.sql", days=14)
|
|
217
|
+
|
|
218
|
+
# pull and save the result to a Unity Catalog table in one call
|
|
219
|
+
sf.sql("queries/experiment.sql", save_table=True, schema="analytics", table="exp")
|
|
220
|
+
|
|
221
|
+
# or save any Spark DataFrame to Unity Catalog
|
|
222
|
+
sf.save_to_uc(df, table="exp", schema="analytics", catalog="prod")
|
|
215
223
|
```
|
|
216
224
|
|
|
217
225
|
Credentials resolve per field as: explicit argument → `SNOWFLAKE_*` environment
|
|
@@ -262,15 +270,14 @@ gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
|
|
|
262
270
|
#### OAuth authentication (alternative to a service account)
|
|
263
271
|
|
|
264
272
|
`GSheet` can authenticate as your own Google account using OAuth installed-app
|
|
265
|
-
credentials
|
|
266
|
-
|
|
267
|
-
are found:
|
|
273
|
+
credentials. Set these env vars and the connector uses OAuth automatically when
|
|
274
|
+
no service-account credentials are found:
|
|
268
275
|
|
|
269
276
|
| Variable | Required | Description |
|
|
270
277
|
|----------|----------|-------------|
|
|
271
278
|
| `GOOGLE_OAUTH_CLIENT_ID` | yes | OAuth client id (`...apps.googleusercontent.com`) |
|
|
272
279
|
| `GOOGLE_OAUTH_CLIENT_SECRET` | yes | OAuth client secret (`GOCSPX-...`) |
|
|
273
|
-
| `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `
|
|
280
|
+
| `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `my-gcp-project`) |
|
|
274
281
|
| `GSHEET_TOKEN_PATH` | optional | Token cache path (default `~/.config/ml-analytics/gsheet_token.json`) |
|
|
275
282
|
|
|
276
283
|
The first run opens a browser for one-time consent; the cached refresh token
|
|
@@ -16,7 +16,7 @@ arguments.
|
|
|
16
16
|
## What Is Included
|
|
17
17
|
|
|
18
18
|
- `DataConnector`: run Redshift or Snowflake SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
|
|
19
|
-
- `SFConnector`: read
|
|
19
|
+
- `SFConnector`: read Snowflake through Spark and save results to Unity Catalog tables (Databricks). PySpark is imported lazily, so the rest of the package works without it.
|
|
20
20
|
- `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
|
|
21
21
|
- `GSheet`: read, write, share, and export Google Sheets data.
|
|
22
22
|
- `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
|
|
@@ -133,7 +133,8 @@ df = dc.sql("SELECT 1 AS col_1")
|
|
|
133
133
|
|
|
134
134
|
For local interactive work, `SNOWFLAKE_AUTHENTICATOR=externalbrowser` is supported.
|
|
135
135
|
SSO tokens are cached in the OS keychain, so the browser login only happens once
|
|
136
|
-
per token lifetime.
|
|
136
|
+
per token lifetime. (Note: `externalbrowser` works with `DataConnector` only;
|
|
137
|
+
`SFConnector` rejects it, since Spark jobs block on the interactive browser SSO.)
|
|
137
138
|
For Databricks and Spark jobs, use key-pair auth instead. The connector reads
|
|
138
139
|
default Databricks personal-scope secrets automatically:
|
|
139
140
|
|
|
@@ -160,9 +161,10 @@ df = (
|
|
|
160
161
|
|
|
161
162
|
### Query Snowflake With Spark (`SFConnector`)
|
|
162
163
|
|
|
163
|
-
On Databricks, `SFConnector` reads
|
|
164
|
-
|
|
165
|
-
`DataConnector`, and only imports PySpark when a
|
|
164
|
+
On Databricks, `SFConnector` reads Snowflake directly as Spark DataFrames and can
|
|
165
|
+
persist results into Unity Catalog tables. It reuses the same `SNOWFLAKE_*`
|
|
166
|
+
settings and key-pair secrets as `DataConnector`, and only imports PySpark when a
|
|
167
|
+
query/write method runs.
|
|
166
168
|
|
|
167
169
|
```python
|
|
168
170
|
from ml_analytics import SFConnector
|
|
@@ -175,8 +177,14 @@ df = sf.sql("SELECT * FROM cds.dim_tutor LIMIT 1000")
|
|
|
175
177
|
# pandas DataFrame
|
|
176
178
|
pdf = sf.sql("SELECT 1 AS col_1", return_pandas=True)
|
|
177
179
|
|
|
178
|
-
#
|
|
179
|
-
sf.
|
|
180
|
+
# run a query from a .sql file (relative to project root), with templating
|
|
181
|
+
df = sf.sql("queries/experiment.sql", days=14)
|
|
182
|
+
|
|
183
|
+
# pull and save the result to a Unity Catalog table in one call
|
|
184
|
+
sf.sql("queries/experiment.sql", save_table=True, schema="analytics", table="exp")
|
|
185
|
+
|
|
186
|
+
# or save any Spark DataFrame to Unity Catalog
|
|
187
|
+
sf.save_to_uc(df, table="exp", schema="analytics", catalog="prod")
|
|
180
188
|
```
|
|
181
189
|
|
|
182
190
|
Credentials resolve per field as: explicit argument → `SNOWFLAKE_*` environment
|
|
@@ -227,15 +235,14 @@ gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
|
|
|
227
235
|
#### OAuth authentication (alternative to a service account)
|
|
228
236
|
|
|
229
237
|
`GSheet` can authenticate as your own Google account using OAuth installed-app
|
|
230
|
-
credentials
|
|
231
|
-
|
|
232
|
-
are found:
|
|
238
|
+
credentials. Set these env vars and the connector uses OAuth automatically when
|
|
239
|
+
no service-account credentials are found:
|
|
233
240
|
|
|
234
241
|
| Variable | Required | Description |
|
|
235
242
|
|----------|----------|-------------|
|
|
236
243
|
| `GOOGLE_OAUTH_CLIENT_ID` | yes | OAuth client id (`...apps.googleusercontent.com`) |
|
|
237
244
|
| `GOOGLE_OAUTH_CLIENT_SECRET` | yes | OAuth client secret (`GOCSPX-...`) |
|
|
238
|
-
| `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `
|
|
245
|
+
| `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `my-gcp-project`) |
|
|
239
246
|
| `GSHEET_TOKEN_PATH` | optional | Token cache path (default `~/.config/ml-analytics/gsheet_token.json`) |
|
|
240
247
|
|
|
241
248
|
The first run opens a browser for one-time consent; the cached refresh token
|
|
@@ -18,7 +18,7 @@ from .data_connector import (
|
|
|
18
18
|
_load_private_key_pem_for_spark,
|
|
19
19
|
_snowflake_secret_scope,
|
|
20
20
|
)
|
|
21
|
-
from .utils import get_logger, log_and_raise_error
|
|
21
|
+
from .utils import get_logger, load_sql_query, log_and_raise_error
|
|
22
22
|
|
|
23
23
|
# Cached Spark session shared across SFConnector instances. Populated lazily by
|
|
24
24
|
# get_spark(); never created at import time so the package stays importable
|
|
@@ -221,65 +221,125 @@ class SFConnector:
|
|
|
221
221
|
if self.authenticator:
|
|
222
222
|
options["sfAuthenticator"] = self.authenticator
|
|
223
223
|
elif self.authenticator:
|
|
224
|
-
options["sfAuthenticator"] = self.authenticator
|
|
225
224
|
if self.authenticator.lower() == "externalbrowser":
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
"
|
|
225
|
+
log_and_raise_error(
|
|
226
|
+
self._logger,
|
|
227
|
+
"Snowflake externalbrowser authentication is interactive and cannot be used by "
|
|
228
|
+
"SFConnector (Spark jobs block on the browser SSO handshake). Use key-pair "
|
|
229
|
+
"(SNOWFLAKE_PRIVATE_KEY/_PATH) or OAuth (SNOWFLAKE_TOKEN) for Spark workloads, "
|
|
230
|
+
"or use DataConnector for interactive local queries.",
|
|
229
231
|
)
|
|
232
|
+
options["sfAuthenticator"] = self.authenticator
|
|
230
233
|
|
|
231
234
|
# Caller-provided options win over resolved defaults.
|
|
232
235
|
options.update({k: v for k, v in self.extra_options.items() if _clean_env_value(v) is not None})
|
|
233
236
|
return options
|
|
234
237
|
|
|
235
|
-
def
|
|
238
|
+
def _resolve_query(self, query: str, **kwargs) -> str:
|
|
239
|
+
"""Resolve a query string: if it looks like a SQL file path, load it; otherwise return as-is."""
|
|
240
|
+
if query and query.strip().endswith(".sql"):
|
|
241
|
+
loaded = load_sql_query(query.strip(), **kwargs)
|
|
242
|
+
if loaded is None:
|
|
243
|
+
log_and_raise_error(self._logger, f"Could not load SQL file: {query}")
|
|
244
|
+
self._logger.info(f"Loaded SQL from file: {query}")
|
|
245
|
+
return loaded
|
|
246
|
+
return query
|
|
247
|
+
|
|
248
|
+
def sql(
|
|
249
|
+
self,
|
|
250
|
+
query: str,
|
|
251
|
+
return_pandas: bool = False,
|
|
252
|
+
save_table: bool = False,
|
|
253
|
+
table: str = None,
|
|
254
|
+
schema: str = None,
|
|
255
|
+
catalog: str = None,
|
|
256
|
+
mode: str = "overwrite",
|
|
257
|
+
**kwargs,
|
|
258
|
+
):
|
|
236
259
|
"""
|
|
237
260
|
Execute a SQL query against Snowflake and return the result.
|
|
238
261
|
|
|
262
|
+
Optionally persist the result straight into a Databricks Unity Catalog
|
|
263
|
+
table while pulling the data, by passing ``save_table=True`` along with a
|
|
264
|
+
destination ``table`` (and optionally ``schema`` / ``catalog``).
|
|
265
|
+
|
|
239
266
|
Parameters
|
|
240
267
|
----------
|
|
241
268
|
query : str
|
|
242
|
-
SQL query to execute
|
|
269
|
+
SQL query to execute, or a path to a ``.sql`` file (relative to the
|
|
270
|
+
project root). When a ``.sql`` path is given, its contents are loaded
|
|
271
|
+
automatically.
|
|
243
272
|
return_pandas : bool, optional
|
|
244
273
|
If True, return a pandas DataFrame; otherwise return a Spark
|
|
245
274
|
DataFrame. Defaults to False.
|
|
275
|
+
save_table : bool, optional
|
|
276
|
+
If True, write the result to a Unity Catalog table via
|
|
277
|
+
:meth:`save_to_uc` before returning. Defaults to False.
|
|
278
|
+
table : str, optional
|
|
279
|
+
Destination table name when ``save_table`` is True. May be fully
|
|
280
|
+
qualified (``catalog.schema.table``), in which case ``schema`` /
|
|
281
|
+
``catalog`` are ignored.
|
|
282
|
+
schema, catalog : str, optional
|
|
283
|
+
Unity Catalog schema and catalog to qualify ``table`` with.
|
|
284
|
+
mode : str, optional
|
|
285
|
+
Spark write mode for the saved table ('overwrite', 'append',
|
|
286
|
+
'ignore', 'error'). Defaults to 'overwrite'.
|
|
287
|
+
**kwargs
|
|
288
|
+
Template variables substituted into the SQL file using ``str.format()``.
|
|
246
289
|
"""
|
|
290
|
+
query = self._resolve_query(query, **kwargs)
|
|
247
291
|
spark = self._get_spark()
|
|
248
292
|
try:
|
|
249
293
|
df = spark.read.format(self.source_format).options(**self.spark_options()).option("query", query).load()
|
|
250
294
|
except Exception as e:
|
|
251
295
|
log_and_raise_error(self._logger, f"Error reading from Snowflake: {e}")
|
|
252
296
|
|
|
297
|
+
if save_table:
|
|
298
|
+
self.save_to_uc(df, table=table, schema=schema, catalog=catalog, mode=mode)
|
|
299
|
+
|
|
253
300
|
if return_pandas:
|
|
254
301
|
return df.toPandas()
|
|
255
302
|
return df
|
|
256
303
|
|
|
257
|
-
|
|
304
|
+
@staticmethod
|
|
305
|
+
def _qualified_uc_name(table: str, schema: str = None, catalog: str = None) -> str:
|
|
306
|
+
"""Build a Unity Catalog table identifier from its parts.
|
|
307
|
+
|
|
308
|
+
A ``table`` that already contains dots is treated as fully qualified and
|
|
309
|
+
returned as-is; otherwise ``catalog`` / ``schema`` are prepended when given.
|
|
258
310
|
"""
|
|
259
|
-
|
|
311
|
+
if "." in table:
|
|
312
|
+
return table
|
|
313
|
+
parts = [part for part in (catalog, schema, table) if part]
|
|
314
|
+
return ".".join(parts)
|
|
315
|
+
|
|
316
|
+
def save_to_uc(self, df, table: str, schema: str = None, catalog: str = None, mode: str = "overwrite"):
|
|
317
|
+
"""
|
|
318
|
+
Write a Spark DataFrame to a Databricks Unity Catalog table.
|
|
319
|
+
|
|
320
|
+
Uses Spark's native ``df.write.saveAsTable(...)`` (a managed UC table),
|
|
321
|
+
not the Snowflake connector.
|
|
260
322
|
|
|
261
323
|
Parameters
|
|
262
324
|
----------
|
|
263
325
|
df : pyspark.sql.DataFrame
|
|
264
326
|
DataFrame to write.
|
|
265
327
|
table : str
|
|
266
|
-
Destination table name
|
|
267
|
-
|
|
328
|
+
Destination table name. May be fully qualified
|
|
329
|
+
(``catalog.schema.table``), in which case ``schema`` / ``catalog``
|
|
330
|
+
are ignored.
|
|
331
|
+
schema, catalog : str, optional
|
|
332
|
+
Unity Catalog schema and catalog to qualify ``table`` with.
|
|
268
333
|
mode : str, optional
|
|
269
334
|
Spark write mode: 'overwrite', 'append', 'ignore', or 'error'.
|
|
270
335
|
Defaults to 'overwrite'.
|
|
271
|
-
column_mapping : str, optional
|
|
272
|
-
Snowflake ``column_mapping`` option ('name' or 'order').
|
|
273
|
-
Defaults to 'name' so columns are matched by name.
|
|
274
336
|
"""
|
|
275
337
|
if not table:
|
|
276
338
|
log_and_raise_error(self._logger, "A destination table name is required.")
|
|
277
339
|
|
|
278
|
-
|
|
279
|
-
options["dbtable"] = table
|
|
280
|
-
options["column_mapping"] = column_mapping
|
|
340
|
+
full_name = self._qualified_uc_name(table, schema=schema, catalog=catalog)
|
|
281
341
|
try:
|
|
282
|
-
df.write.
|
|
342
|
+
df.write.mode(mode).saveAsTable(full_name)
|
|
283
343
|
except Exception as e:
|
|
284
|
-
log_and_raise_error(self._logger, f"Error writing to
|
|
285
|
-
self._logger.info(f"Table '{
|
|
344
|
+
log_and_raise_error(self._logger, f"Error writing to Unity Catalog table '{full_name}': {e}")
|
|
345
|
+
self._logger.info(f"Table '{full_name}' written to Unity Catalog (mode={mode}).")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ml-analytics-tools
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: Tools for ML projects and data management
|
|
5
5
|
Requires-Python: >=3.11
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -51,7 +51,7 @@ arguments.
|
|
|
51
51
|
## What Is Included
|
|
52
52
|
|
|
53
53
|
- `DataConnector`: run Redshift or Snowflake SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
|
|
54
|
-
- `SFConnector`: read
|
|
54
|
+
- `SFConnector`: read Snowflake through Spark and save results to Unity Catalog tables (Databricks). PySpark is imported lazily, so the rest of the package works without it.
|
|
55
55
|
- `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
|
|
56
56
|
- `GSheet`: read, write, share, and export Google Sheets data.
|
|
57
57
|
- `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
|
|
@@ -168,7 +168,8 @@ df = dc.sql("SELECT 1 AS col_1")
|
|
|
168
168
|
|
|
169
169
|
For local interactive work, `SNOWFLAKE_AUTHENTICATOR=externalbrowser` is supported.
|
|
170
170
|
SSO tokens are cached in the OS keychain, so the browser login only happens once
|
|
171
|
-
per token lifetime.
|
|
171
|
+
per token lifetime. (Note: `externalbrowser` works with `DataConnector` only;
|
|
172
|
+
`SFConnector` rejects it, since Spark jobs block on the interactive browser SSO.)
|
|
172
173
|
For Databricks and Spark jobs, use key-pair auth instead. The connector reads
|
|
173
174
|
default Databricks personal-scope secrets automatically:
|
|
174
175
|
|
|
@@ -195,9 +196,10 @@ df = (
|
|
|
195
196
|
|
|
196
197
|
### Query Snowflake With Spark (`SFConnector`)
|
|
197
198
|
|
|
198
|
-
On Databricks, `SFConnector` reads
|
|
199
|
-
|
|
200
|
-
`DataConnector`, and only imports PySpark when a
|
|
199
|
+
On Databricks, `SFConnector` reads Snowflake directly as Spark DataFrames and can
|
|
200
|
+
persist results into Unity Catalog tables. It reuses the same `SNOWFLAKE_*`
|
|
201
|
+
settings and key-pair secrets as `DataConnector`, and only imports PySpark when a
|
|
202
|
+
query/write method runs.
|
|
201
203
|
|
|
202
204
|
```python
|
|
203
205
|
from ml_analytics import SFConnector
|
|
@@ -210,8 +212,14 @@ df = sf.sql("SELECT * FROM cds.dim_tutor LIMIT 1000")
|
|
|
210
212
|
# pandas DataFrame
|
|
211
213
|
pdf = sf.sql("SELECT 1 AS col_1", return_pandas=True)
|
|
212
214
|
|
|
213
|
-
#
|
|
214
|
-
sf.
|
|
215
|
+
# run a query from a .sql file (relative to project root), with templating
|
|
216
|
+
df = sf.sql("queries/experiment.sql", days=14)
|
|
217
|
+
|
|
218
|
+
# pull and save the result to a Unity Catalog table in one call
|
|
219
|
+
sf.sql("queries/experiment.sql", save_table=True, schema="analytics", table="exp")
|
|
220
|
+
|
|
221
|
+
# or save any Spark DataFrame to Unity Catalog
|
|
222
|
+
sf.save_to_uc(df, table="exp", schema="analytics", catalog="prod")
|
|
215
223
|
```
|
|
216
224
|
|
|
217
225
|
Credentials resolve per field as: explicit argument → `SNOWFLAKE_*` environment
|
|
@@ -262,15 +270,14 @@ gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
|
|
|
262
270
|
#### OAuth authentication (alternative to a service account)
|
|
263
271
|
|
|
264
272
|
`GSheet` can authenticate as your own Google account using OAuth installed-app
|
|
265
|
-
credentials
|
|
266
|
-
|
|
267
|
-
are found:
|
|
273
|
+
credentials. Set these env vars and the connector uses OAuth automatically when
|
|
274
|
+
no service-account credentials are found:
|
|
268
275
|
|
|
269
276
|
| Variable | Required | Description |
|
|
270
277
|
|----------|----------|-------------|
|
|
271
278
|
| `GOOGLE_OAUTH_CLIENT_ID` | yes | OAuth client id (`...apps.googleusercontent.com`) |
|
|
272
279
|
| `GOOGLE_OAUTH_CLIENT_SECRET` | yes | OAuth client secret (`GOCSPX-...`) |
|
|
273
|
-
| `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `
|
|
280
|
+
| `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `my-gcp-project`) |
|
|
274
281
|
| `GSHEET_TOKEN_PATH` | optional | Token cache path (default `~/.config/ml-analytics/gsheet_token.json`) |
|
|
275
282
|
|
|
276
283
|
The first run opens a browser for one-time consent; the cached refresh token
|
|
@@ -949,7 +949,7 @@ class TestGSheetOAuth:
|
|
|
949
949
|
monkeypatch.delenv(var, raising=False)
|
|
950
950
|
monkeypatch.setenv("GOOGLE_OAUTH_CLIENT_ID", "cid.apps.googleusercontent.com")
|
|
951
951
|
monkeypatch.setenv("GOOGLE_OAUTH_CLIENT_SECRET", "GOCSPX-secret")
|
|
952
|
-
monkeypatch.setenv("GOOGLE_CLOUD_PROJECT", "
|
|
952
|
+
monkeypatch.setenv("GOOGLE_CLOUD_PROJECT", "my-gcp-project")
|
|
953
953
|
monkeypatch.setenv("GSHEET_TOKEN_PATH", str(token_path))
|
|
954
954
|
|
|
955
955
|
def test_oauth_runs_flow_when_no_token(self, monkeypatch, tmp_path, mock_google_api_services):
|
|
@@ -176,6 +176,76 @@ def test_extra_options_override(monkeypatch):
|
|
|
176
176
|
assert options["sfTimezone"] == "UTC"
|
|
177
177
|
|
|
178
178
|
|
|
179
|
+
def test_externalbrowser_authenticator_raises(monkeypatch):
|
|
180
|
+
_clear_snowflake_env(monkeypatch)
|
|
181
|
+
sf = SFConnector(account="acct", user="u", authenticator="externalbrowser")
|
|
182
|
+
with pytest.raises(ValueError, match="externalbrowser"):
|
|
183
|
+
sf.spark_options()
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def test_resolve_query_inline_passthrough(monkeypatch):
|
|
187
|
+
_clear_snowflake_env(monkeypatch)
|
|
188
|
+
sf = SFConnector(account="acct", user="u")
|
|
189
|
+
assert sf._resolve_query("SELECT 1") == "SELECT 1"
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def test_resolve_query_loads_sql_file(monkeypatch, tmp_path):
|
|
193
|
+
_clear_snowflake_env(monkeypatch)
|
|
194
|
+
sql_file = tmp_path / "q.sql"
|
|
195
|
+
sql_file.write_text("SELECT {n} AS n")
|
|
196
|
+
monkeypatch.setattr("ml_analytics.sf_connector.find_project_root", lambda *a, **k: tmp_path, raising=False)
|
|
197
|
+
monkeypatch.setattr("ml_analytics.utils.find_project_root", lambda *a, **k: tmp_path)
|
|
198
|
+
sf = SFConnector(account="acct", user="u")
|
|
199
|
+
assert sf._resolve_query("q.sql", n=5) == "SELECT 5 AS n"
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def test_resolve_query_missing_file_raises(monkeypatch, tmp_path):
|
|
203
|
+
_clear_snowflake_env(monkeypatch)
|
|
204
|
+
monkeypatch.setattr("ml_analytics.utils.find_project_root", lambda *a, **k: tmp_path)
|
|
205
|
+
sf = SFConnector(account="acct", user="u")
|
|
206
|
+
with pytest.raises(ValueError, match="Could not load SQL file"):
|
|
207
|
+
sf._resolve_query("missing.sql")
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def test_qualified_uc_name_parts():
|
|
211
|
+
assert SFConnector._qualified_uc_name("t", schema="s", catalog="c") == "c.s.t"
|
|
212
|
+
assert SFConnector._qualified_uc_name("t", schema="s") == "s.t"
|
|
213
|
+
assert SFConnector._qualified_uc_name("t") == "t"
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def test_qualified_uc_name_already_qualified():
|
|
217
|
+
# A dotted table name is treated as fully qualified; schema/catalog ignored.
|
|
218
|
+
assert SFConnector._qualified_uc_name("cat.sch.tbl", schema="x", catalog="y") == "cat.sch.tbl"
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def test_save_to_uc_uses_saveastable(monkeypatch):
|
|
222
|
+
_clear_snowflake_env(monkeypatch)
|
|
223
|
+
sf = SFConnector(account="acct", user="u")
|
|
224
|
+
|
|
225
|
+
calls = {}
|
|
226
|
+
|
|
227
|
+
class _Writer:
|
|
228
|
+
def mode(self, m):
|
|
229
|
+
calls["mode"] = m
|
|
230
|
+
return self
|
|
231
|
+
|
|
232
|
+
def saveAsTable(self, name):
|
|
233
|
+
calls["name"] = name
|
|
234
|
+
|
|
235
|
+
class _DF:
|
|
236
|
+
write = _Writer()
|
|
237
|
+
|
|
238
|
+
sf.save_to_uc(_DF(), table="tbl", schema="sch", catalog="cat", mode="append")
|
|
239
|
+
assert calls == {"mode": "append", "name": "cat.sch.tbl"}
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def test_save_to_uc_requires_table(monkeypatch):
|
|
243
|
+
_clear_snowflake_env(monkeypatch)
|
|
244
|
+
sf = SFConnector(account="acct", user="u")
|
|
245
|
+
with pytest.raises(ValueError, match="table name is required"):
|
|
246
|
+
sf.save_to_uc(object(), table="")
|
|
247
|
+
|
|
248
|
+
|
|
179
249
|
def test_missing_account_raises(monkeypatch):
|
|
180
250
|
_clear_snowflake_env(monkeypatch)
|
|
181
251
|
with pytest.raises(ValueError):
|
|
@@ -247,20 +317,3 @@ def test_sql_return_pandas(monkeypatch):
|
|
|
247
317
|
sf.sql("select 1", return_pandas=True)
|
|
248
318
|
|
|
249
319
|
df.toPandas.assert_called_once()
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
def test_save_table(monkeypatch):
|
|
253
|
-
_clear_snowflake_env(monkeypatch)
|
|
254
|
-
spark, _ = _mock_spark()
|
|
255
|
-
sf = SFConnector(account="acct", user="u", password="p", spark=spark)
|
|
256
|
-
|
|
257
|
-
df = MagicMock()
|
|
258
|
-
sf.save_table(df, "cds.my_table", mode="append")
|
|
259
|
-
|
|
260
|
-
df.write.format.assert_called_once_with("net.snowflake.spark.snowflake")
|
|
261
|
-
writer = df.write.format.return_value
|
|
262
|
-
options_passed = writer.options.call_args.kwargs
|
|
263
|
-
assert options_passed["dbtable"] == "cds.my_table"
|
|
264
|
-
assert options_passed["column_mapping"] == "name"
|
|
265
|
-
writer.options.return_value.mode.assert_called_once_with("append")
|
|
266
|
-
writer.options.return_value.mode.return_value.save.assert_called_once()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics_tools.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics_tools.egg-info/entry_points.txt
RENAMED
|
File without changes
|
{ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics_tools.egg-info/requires.txt
RENAMED
|
File without changes
|
{ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics_tools.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|