ml-analytics-tools 0.4.0__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {ml_analytics_tools-0.4.0/ml_analytics_tools.egg-info → ml_analytics_tools-0.4.2}/PKG-INFO +19 -12
  2. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/README.md +18 -11
  3. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/sf_connector.py +80 -20
  4. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2/ml_analytics_tools.egg-info}/PKG-INFO +19 -12
  5. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/pyproject.toml +1 -1
  6. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_gsheet_connector.py +1 -1
  7. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_sf_connector.py +70 -17
  8. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/LICENSE +0 -0
  9. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/__init__.py +0 -0
  10. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/aws_auth.py +0 -0
  11. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/cli.py +0 -0
  12. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/data_connector.py +0 -0
  13. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/gsheet_connector.py +0 -0
  14. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/model_manager.py +0 -0
  15. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/model_tools.py +0 -0
  16. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/s3_connector.py +0 -0
  17. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/slack_connector.py +0 -0
  18. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/tunnel_manager.py +0 -0
  19. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics/utils.py +0 -0
  20. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics_tools.egg-info/SOURCES.txt +0 -0
  21. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics_tools.egg-info/dependency_links.txt +0 -0
  22. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics_tools.egg-info/entry_points.txt +0 -0
  23. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics_tools.egg-info/requires.txt +0 -0
  24. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/ml_analytics_tools.egg-info/top_level.txt +0 -0
  25. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/setup.cfg +0 -0
  26. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_aws_auth.py +0 -0
  27. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_db_s3.py +0 -0
  28. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_identity_column.py +0 -0
  29. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_model_manager.py +0 -0
  30. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_model_tools.py +0 -0
  31. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_s3_redshift_validation.py +0 -0
  32. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_tunnel_manager.py +0 -0
  33. {ml_analytics_tools-0.4.0 → ml_analytics_tools-0.4.2}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ml-analytics-tools
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: Tools for ML projects and data management
5
5
  Requires-Python: >=3.11
6
6
  Description-Content-Type: text/markdown
@@ -51,7 +51,7 @@ arguments.
51
51
  ## What Is Included
52
52
 
53
53
  - `DataConnector`: run Redshift or Snowflake SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
54
- - `SFConnector`: read and write Snowflake through Spark (Databricks). PySpark is imported lazily, so the rest of the package works without it.
54
+ - `SFConnector`: read Snowflake through Spark and save results to Unity Catalog tables (Databricks). PySpark is imported lazily, so the rest of the package works without it.
55
55
  - `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
56
56
  - `GSheet`: read, write, share, and export Google Sheets data.
57
57
  - `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
@@ -168,7 +168,8 @@ df = dc.sql("SELECT 1 AS col_1")
168
168
 
169
169
  For local interactive work, `SNOWFLAKE_AUTHENTICATOR=externalbrowser` is supported.
170
170
  SSO tokens are cached in the OS keychain, so the browser login only happens once
171
- per token lifetime.
171
+ per token lifetime. (Note: `externalbrowser` works with `DataConnector` only;
172
+ `SFConnector` rejects it, since Spark jobs block on the interactive browser SSO.)
172
173
  For Databricks and Spark jobs, use key-pair auth instead. The connector reads
173
174
  default Databricks personal-scope secrets automatically:
174
175
 
@@ -195,9 +196,10 @@ df = (
195
196
 
196
197
  ### Query Snowflake With Spark (`SFConnector`)
197
198
 
198
- On Databricks, `SFConnector` reads and writes Snowflake directly as Spark
199
- DataFrames. It reuses the same `SNOWFLAKE_*` settings and key-pair secrets as
200
- `DataConnector`, and only imports PySpark when a query/write method runs.
199
+ On Databricks, `SFConnector` reads Snowflake directly as Spark DataFrames and can
200
+ persist results into Unity Catalog tables. It reuses the same `SNOWFLAKE_*`
201
+ settings and key-pair secrets as `DataConnector`, and only imports PySpark when a
202
+ query/write method runs.
201
203
 
202
204
  ```python
203
205
  from ml_analytics import SFConnector
@@ -210,8 +212,14 @@ df = sf.sql("SELECT * FROM cds.dim_tutor LIMIT 1000")
210
212
  # pandas DataFrame
211
213
  pdf = sf.sql("SELECT 1 AS col_1", return_pandas=True)
212
214
 
213
- # Write a Spark DataFrame back to Snowflake
214
- sf.save_table(df, "cds.my_table", mode="overwrite")
215
+ # run a query from a .sql file (relative to project root), with templating
216
+ df = sf.sql("queries/experiment.sql", days=14)
217
+
218
+ # pull and save the result to a Unity Catalog table in one call
219
+ sf.sql("queries/experiment.sql", save_table=True, schema="analytics", table="exp")
220
+
221
+ # or save any Spark DataFrame to Unity Catalog
222
+ sf.save_to_uc(df, table="exp", schema="analytics", catalog="prod")
215
223
  ```
216
224
 
217
225
  Credentials resolve per field as: explicit argument → `SNOWFLAKE_*` environment
@@ -262,15 +270,14 @@ gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
262
270
  #### OAuth authentication (alternative to a service account)
263
271
 
264
272
  `GSheet` can authenticate as your own Google account using OAuth installed-app
265
- credentials (e.g. Preply's Google Workspace CLI credentials). Set these env vars
266
- and the connector uses OAuth automatically when no service-account credentials
267
- are found:
273
+ credentials. Set these env vars and the connector uses OAuth automatically when
274
+ no service-account credentials are found:
268
275
 
269
276
  | Variable | Required | Description |
270
277
  |----------|----------|-------------|
271
278
  | `GOOGLE_OAUTH_CLIENT_ID` | yes | OAuth client id (`...apps.googleusercontent.com`) |
272
279
  | `GOOGLE_OAUTH_CLIENT_SECRET` | yes | OAuth client secret (`GOCSPX-...`) |
273
- | `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `preply-gworkspace-cli`) |
280
+ | `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `my-gcp-project`) |
274
281
  | `GSHEET_TOKEN_PATH` | optional | Token cache path (default `~/.config/ml-analytics/gsheet_token.json`) |
275
282
 
276
283
  The first run opens a browser for one-time consent; the cached refresh token
@@ -16,7 +16,7 @@ arguments.
16
16
  ## What Is Included
17
17
 
18
18
  - `DataConnector`: run Redshift or Snowflake SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
19
- - `SFConnector`: read and write Snowflake through Spark (Databricks). PySpark is imported lazily, so the rest of the package works without it.
19
+ - `SFConnector`: read Snowflake through Spark and save results to Unity Catalog tables (Databricks). PySpark is imported lazily, so the rest of the package works without it.
20
20
  - `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
21
21
  - `GSheet`: read, write, share, and export Google Sheets data.
22
22
  - `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
@@ -133,7 +133,8 @@ df = dc.sql("SELECT 1 AS col_1")
133
133
 
134
134
  For local interactive work, `SNOWFLAKE_AUTHENTICATOR=externalbrowser` is supported.
135
135
  SSO tokens are cached in the OS keychain, so the browser login only happens once
136
- per token lifetime.
136
+ per token lifetime. (Note: `externalbrowser` works with `DataConnector` only;
137
+ `SFConnector` rejects it, since Spark jobs block on the interactive browser SSO.)
137
138
  For Databricks and Spark jobs, use key-pair auth instead. The connector reads
138
139
  default Databricks personal-scope secrets automatically:
139
140
 
@@ -160,9 +161,10 @@ df = (
160
161
 
161
162
  ### Query Snowflake With Spark (`SFConnector`)
162
163
 
163
- On Databricks, `SFConnector` reads and writes Snowflake directly as Spark
164
- DataFrames. It reuses the same `SNOWFLAKE_*` settings and key-pair secrets as
165
- `DataConnector`, and only imports PySpark when a query/write method runs.
164
+ On Databricks, `SFConnector` reads Snowflake directly as Spark DataFrames and can
165
+ persist results into Unity Catalog tables. It reuses the same `SNOWFLAKE_*`
166
+ settings and key-pair secrets as `DataConnector`, and only imports PySpark when a
167
+ query/write method runs.
166
168
 
167
169
  ```python
168
170
  from ml_analytics import SFConnector
@@ -175,8 +177,14 @@ df = sf.sql("SELECT * FROM cds.dim_tutor LIMIT 1000")
175
177
  # pandas DataFrame
176
178
  pdf = sf.sql("SELECT 1 AS col_1", return_pandas=True)
177
179
 
178
- # Write a Spark DataFrame back to Snowflake
179
- sf.save_table(df, "cds.my_table", mode="overwrite")
180
+ # run a query from a .sql file (relative to project root), with templating
181
+ df = sf.sql("queries/experiment.sql", days=14)
182
+
183
+ # pull and save the result to a Unity Catalog table in one call
184
+ sf.sql("queries/experiment.sql", save_table=True, schema="analytics", table="exp")
185
+
186
+ # or save any Spark DataFrame to Unity Catalog
187
+ sf.save_to_uc(df, table="exp", schema="analytics", catalog="prod")
180
188
  ```
181
189
 
182
190
  Credentials resolve per field as: explicit argument → `SNOWFLAKE_*` environment
@@ -227,15 +235,14 @@ gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
227
235
  #### OAuth authentication (alternative to a service account)
228
236
 
229
237
  `GSheet` can authenticate as your own Google account using OAuth installed-app
230
- credentials (e.g. Preply's Google Workspace CLI credentials). Set these env vars
231
- and the connector uses OAuth automatically when no service-account credentials
232
- are found:
238
+ credentials. Set these env vars and the connector uses OAuth automatically when
239
+ no service-account credentials are found:
233
240
 
234
241
  | Variable | Required | Description |
235
242
  |----------|----------|-------------|
236
243
  | `GOOGLE_OAUTH_CLIENT_ID` | yes | OAuth client id (`...apps.googleusercontent.com`) |
237
244
  | `GOOGLE_OAUTH_CLIENT_SECRET` | yes | OAuth client secret (`GOCSPX-...`) |
238
- | `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `preply-gworkspace-cli`) |
245
+ | `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `my-gcp-project`) |
239
246
  | `GSHEET_TOKEN_PATH` | optional | Token cache path (default `~/.config/ml-analytics/gsheet_token.json`) |
240
247
 
241
248
  The first run opens a browser for one-time consent; the cached refresh token
@@ -18,7 +18,7 @@ from .data_connector import (
18
18
  _load_private_key_pem_for_spark,
19
19
  _snowflake_secret_scope,
20
20
  )
21
- from .utils import get_logger, log_and_raise_error
21
+ from .utils import get_logger, load_sql_query, log_and_raise_error
22
22
 
23
23
  # Cached Spark session shared across SFConnector instances. Populated lazily by
24
24
  # get_spark(); never created at import time so the package stays importable
@@ -221,65 +221,125 @@ class SFConnector:
221
221
  if self.authenticator:
222
222
  options["sfAuthenticator"] = self.authenticator
223
223
  elif self.authenticator:
224
- options["sfAuthenticator"] = self.authenticator
225
224
  if self.authenticator.lower() == "externalbrowser":
226
- self._logger.warning(
227
- "Snowflake externalbrowser authentication is interactive and is not suitable for "
228
- "Databricks/Spark jobs. Use key-pair or OAuth for Spark workloads."
225
+ log_and_raise_error(
226
+ self._logger,
227
+ "Snowflake externalbrowser authentication is interactive and cannot be used by "
228
+ "SFConnector (Spark jobs block on the browser SSO handshake). Use key-pair "
229
+ "(SNOWFLAKE_PRIVATE_KEY/_PATH) or OAuth (SNOWFLAKE_TOKEN) for Spark workloads, "
230
+ "or use DataConnector for interactive local queries.",
229
231
  )
232
+ options["sfAuthenticator"] = self.authenticator
230
233
 
231
234
  # Caller-provided options win over resolved defaults.
232
235
  options.update({k: v for k, v in self.extra_options.items() if _clean_env_value(v) is not None})
233
236
  return options
234
237
 
235
- def sql(self, query: str, return_pandas: bool = False):
238
+ def _resolve_query(self, query: str, **kwargs) -> str:
239
+ """Resolve a query string: if it ends with ``.sql``, load it as a SQL file (raising if it cannot be loaded); otherwise return it as-is."""
240
+ if query and query.strip().endswith(".sql"):
241
+ loaded = load_sql_query(query.strip(), **kwargs)
242
+ if loaded is None:
243
+ log_and_raise_error(self._logger, f"Could not load SQL file: {query}")
244
+ self._logger.info(f"Loaded SQL from file: {query}")
245
+ return loaded
246
+ return query
247
+
248
+ def sql(
249
+ self,
250
+ query: str,
251
+ return_pandas: bool = False,
252
+ save_table: bool = False,
253
+ table: str = None,
254
+ schema: str = None,
255
+ catalog: str = None,
256
+ mode: str = "overwrite",
257
+ **kwargs,
258
+ ):
236
259
  """
237
260
  Execute a SQL query against Snowflake and return the result.
238
261
 
262
+ Optionally persist the result straight into a Databricks Unity Catalog
263
+ table while pulling the data, by passing ``save_table=True`` along with a
264
+ destination ``table`` (and optionally ``schema`` / ``catalog``).
265
+
239
266
  Parameters
240
267
  ----------
241
268
  query : str
242
- SQL query to execute.
269
+ SQL query to execute, or a path to a ``.sql`` file (relative to the
270
+ project root). When a ``.sql`` path is given, its contents are loaded
271
+ automatically.
243
272
  return_pandas : bool, optional
244
273
  If True, return a pandas DataFrame; otherwise return a Spark
245
274
  DataFrame. Defaults to False.
275
+ save_table : bool, optional
276
+ If True, write the result to a Unity Catalog table via
277
+ :meth:`save_to_uc` before returning. Defaults to False.
278
+ table : str, optional
279
+ Destination table name when ``save_table`` is True. A name that
280
+ already contains a dot (e.g. ``catalog.schema.table``) is used as-is
281
+ and ``schema`` / ``catalog`` are ignored.
282
+ schema, catalog : str, optional
283
+ Unity Catalog schema and catalog to qualify ``table`` with.
284
+ mode : str, optional
285
+ Spark write mode for the saved table ('overwrite', 'append',
286
+ 'ignore', 'error'). Defaults to 'overwrite'.
287
+ **kwargs
288
+ Template variables substituted into the SQL file using ``str.format()``.
246
289
  """
290
+ query = self._resolve_query(query, **kwargs)
247
291
  spark = self._get_spark()
248
292
  try:
249
293
  df = spark.read.format(self.source_format).options(**self.spark_options()).option("query", query).load()
250
294
  except Exception as e:
251
295
  log_and_raise_error(self._logger, f"Error reading from Snowflake: {e}")
252
296
 
297
+ if save_table:
298
+ self.save_to_uc(df, table=table, schema=schema, catalog=catalog, mode=mode)
299
+
253
300
  if return_pandas:
254
301
  return df.toPandas()
255
302
  return df
256
303
 
257
- def save_table(self, df, table: str, mode: str = "overwrite", column_mapping: str = "name"):
304
+ @staticmethod
305
+ def _qualified_uc_name(table: str, schema: str = None, catalog: str = None) -> str:
306
+ """Build a Unity Catalog table identifier from its parts.
307
+
308
+ A ``table`` that already contains dots is treated as fully qualified and
309
+ returned as-is; otherwise ``catalog`` / ``schema`` are prepended when given.
258
310
  """
259
- Write a Spark DataFrame to a Snowflake table.
311
+ if "." in table:
312
+ return table
313
+ parts = [part for part in (catalog, schema, table) if part]
314
+ return ".".join(parts)
315
+
316
+ def save_to_uc(self, df, table: str, schema: str = None, catalog: str = None, mode: str = "overwrite"):
317
+ """
318
+ Write a Spark DataFrame to a Databricks Unity Catalog table.
319
+
320
+ Uses Spark's native ``df.write.saveAsTable(...)`` (a managed UC table),
321
+ not the Snowflake connector.
260
322
 
261
323
  Parameters
262
324
  ----------
263
325
  df : pyspark.sql.DataFrame
264
326
  DataFrame to write.
265
327
  table : str
266
- Destination table name (``sfDatabase`` / ``sfSchema`` from the
267
- connector are used unless the name is fully qualified).
328
+ Destination table name. A name that already contains a dot
329
+ (e.g. ``catalog.schema.table``) is used as-is and ``schema`` /
330
+ ``catalog`` are ignored.
331
+ schema, catalog : str, optional
332
+ Unity Catalog schema and catalog to qualify ``table`` with.
268
333
  mode : str, optional
269
334
  Spark write mode: 'overwrite', 'append', 'ignore', or 'error'.
270
335
  Defaults to 'overwrite'.
271
- column_mapping : str, optional
272
- Snowflake ``column_mapping`` option ('name' or 'order').
273
- Defaults to 'name' so columns are matched by name.
274
336
  """
275
337
  if not table:
276
338
  log_and_raise_error(self._logger, "A destination table name is required.")
277
339
 
278
- options = self.spark_options()
279
- options["dbtable"] = table
280
- options["column_mapping"] = column_mapping
340
+ full_name = self._qualified_uc_name(table, schema=schema, catalog=catalog)
281
341
  try:
282
- df.write.format(self.source_format).options(**options).mode(mode).save()
342
+ df.write.mode(mode).saveAsTable(full_name)
283
343
  except Exception as e:
284
- log_and_raise_error(self._logger, f"Error writing to Snowflake table '{table}': {e}")
285
- self._logger.info(f"Table '{table}' written successfully (mode={mode}).")
344
+ log_and_raise_error(self._logger, f"Error writing to Unity Catalog table '{full_name}': {e}")
345
+ self._logger.info(f"Table '{full_name}' written to Unity Catalog (mode={mode}).")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ml-analytics-tools
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: Tools for ML projects and data management
5
5
  Requires-Python: >=3.11
6
6
  Description-Content-Type: text/markdown
@@ -51,7 +51,7 @@ arguments.
51
51
  ## What Is Included
52
52
 
53
53
  - `DataConnector`: run Redshift or Snowflake SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
54
- - `SFConnector`: read and write Snowflake through Spark (Databricks). PySpark is imported lazily, so the rest of the package works without it.
54
+ - `SFConnector`: read Snowflake through Spark and save results to Unity Catalog tables (Databricks). PySpark is imported lazily, so the rest of the package works without it.
55
55
  - `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
56
56
  - `GSheet`: read, write, share, and export Google Sheets data.
57
57
  - `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
@@ -168,7 +168,8 @@ df = dc.sql("SELECT 1 AS col_1")
168
168
 
169
169
  For local interactive work, `SNOWFLAKE_AUTHENTICATOR=externalbrowser` is supported.
170
170
  SSO tokens are cached in the OS keychain, so the browser login only happens once
171
- per token lifetime.
171
+ per token lifetime. (Note: `externalbrowser` works with `DataConnector` only;
172
+ `SFConnector` rejects it, since Spark jobs block on the interactive browser SSO.)
172
173
  For Databricks and Spark jobs, use key-pair auth instead. The connector reads
173
174
  default Databricks personal-scope secrets automatically:
174
175
 
@@ -195,9 +196,10 @@ df = (
195
196
 
196
197
  ### Query Snowflake With Spark (`SFConnector`)
197
198
 
198
- On Databricks, `SFConnector` reads and writes Snowflake directly as Spark
199
- DataFrames. It reuses the same `SNOWFLAKE_*` settings and key-pair secrets as
200
- `DataConnector`, and only imports PySpark when a query/write method runs.
199
+ On Databricks, `SFConnector` reads Snowflake directly as Spark DataFrames and can
200
+ persist results into Unity Catalog tables. It reuses the same `SNOWFLAKE_*`
201
+ settings and key-pair secrets as `DataConnector`, and only imports PySpark when a
202
+ query/write method runs.
201
203
 
202
204
  ```python
203
205
  from ml_analytics import SFConnector
@@ -210,8 +212,14 @@ df = sf.sql("SELECT * FROM cds.dim_tutor LIMIT 1000")
210
212
  # pandas DataFrame
211
213
  pdf = sf.sql("SELECT 1 AS col_1", return_pandas=True)
212
214
 
213
- # Write a Spark DataFrame back to Snowflake
214
- sf.save_table(df, "cds.my_table", mode="overwrite")
215
+ # run a query from a .sql file (relative to project root), with templating
216
+ df = sf.sql("queries/experiment.sql", days=14)
217
+
218
+ # pull and save the result to a Unity Catalog table in one call
219
+ sf.sql("queries/experiment.sql", save_table=True, schema="analytics", table="exp")
220
+
221
+ # or save any Spark DataFrame to Unity Catalog
222
+ sf.save_to_uc(df, table="exp", schema="analytics", catalog="prod")
215
223
  ```
216
224
 
217
225
  Credentials resolve per field as: explicit argument → `SNOWFLAKE_*` environment
@@ -262,15 +270,14 @@ gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
262
270
  #### OAuth authentication (alternative to a service account)
263
271
 
264
272
  `GSheet` can authenticate as your own Google account using OAuth installed-app
265
- credentials (e.g. Preply's Google Workspace CLI credentials). Set these env vars
266
- and the connector uses OAuth automatically when no service-account credentials
267
- are found:
273
+ credentials. Set these env vars and the connector uses OAuth automatically when
274
+ no service-account credentials are found:
268
275
 
269
276
  | Variable | Required | Description |
270
277
  |----------|----------|-------------|
271
278
  | `GOOGLE_OAUTH_CLIENT_ID` | yes | OAuth client id (`...apps.googleusercontent.com`) |
272
279
  | `GOOGLE_OAUTH_CLIENT_SECRET` | yes | OAuth client secret (`GOCSPX-...`) |
273
- | `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `preply-gworkspace-cli`) |
280
+ | `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `my-gcp-project`) |
274
281
  | `GSHEET_TOKEN_PATH` | optional | Token cache path (default `~/.config/ml-analytics/gsheet_token.json`) |
275
282
 
276
283
  The first run opens a browser for one-time consent; the cached refresh token
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ml-analytics-tools"
3
- version = "0.4.0"
3
+ version = "0.4.2"
4
4
  description = "Tools for ML projects and data management"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -949,7 +949,7 @@ class TestGSheetOAuth:
949
949
  monkeypatch.delenv(var, raising=False)
950
950
  monkeypatch.setenv("GOOGLE_OAUTH_CLIENT_ID", "cid.apps.googleusercontent.com")
951
951
  monkeypatch.setenv("GOOGLE_OAUTH_CLIENT_SECRET", "GOCSPX-secret")
952
- monkeypatch.setenv("GOOGLE_CLOUD_PROJECT", "preply-gworkspace-cli")
952
+ monkeypatch.setenv("GOOGLE_CLOUD_PROJECT", "my-gcp-project")
953
953
  monkeypatch.setenv("GSHEET_TOKEN_PATH", str(token_path))
954
954
 
955
955
  def test_oauth_runs_flow_when_no_token(self, monkeypatch, tmp_path, mock_google_api_services):
@@ -176,6 +176,76 @@ def test_extra_options_override(monkeypatch):
176
176
  assert options["sfTimezone"] == "UTC"
177
177
 
178
178
 
179
+ def test_externalbrowser_authenticator_raises(monkeypatch):
180
+ _clear_snowflake_env(monkeypatch)
181
+ sf = SFConnector(account="acct", user="u", authenticator="externalbrowser")
182
+ with pytest.raises(ValueError, match="externalbrowser"):
183
+ sf.spark_options()
184
+
185
+
186
+ def test_resolve_query_inline_passthrough(monkeypatch):
187
+ _clear_snowflake_env(monkeypatch)
188
+ sf = SFConnector(account="acct", user="u")
189
+ assert sf._resolve_query("SELECT 1") == "SELECT 1"
190
+
191
+
192
+ def test_resolve_query_loads_sql_file(monkeypatch, tmp_path):
193
+ _clear_snowflake_env(monkeypatch)
194
+ sql_file = tmp_path / "q.sql"
195
+ sql_file.write_text("SELECT {n} AS n")
196
+ monkeypatch.setattr("ml_analytics.sf_connector.find_project_root", lambda *a, **k: tmp_path, raising=False)
197
+ monkeypatch.setattr("ml_analytics.utils.find_project_root", lambda *a, **k: tmp_path)
198
+ sf = SFConnector(account="acct", user="u")
199
+ assert sf._resolve_query("q.sql", n=5) == "SELECT 5 AS n"
200
+
201
+
202
+ def test_resolve_query_missing_file_raises(monkeypatch, tmp_path):
203
+ _clear_snowflake_env(monkeypatch)
204
+ monkeypatch.setattr("ml_analytics.utils.find_project_root", lambda *a, **k: tmp_path)
205
+ sf = SFConnector(account="acct", user="u")
206
+ with pytest.raises(ValueError, match="Could not load SQL file"):
207
+ sf._resolve_query("missing.sql")
208
+
209
+
210
+ def test_qualified_uc_name_parts():
211
+ assert SFConnector._qualified_uc_name("t", schema="s", catalog="c") == "c.s.t"
212
+ assert SFConnector._qualified_uc_name("t", schema="s") == "s.t"
213
+ assert SFConnector._qualified_uc_name("t") == "t"
214
+
215
+
216
+ def test_qualified_uc_name_already_qualified():
217
+ # A dotted table name is treated as fully qualified; schema/catalog ignored.
218
+ assert SFConnector._qualified_uc_name("cat.sch.tbl", schema="x", catalog="y") == "cat.sch.tbl"
219
+
220
+
221
+ def test_save_to_uc_uses_saveastable(monkeypatch):
222
+ _clear_snowflake_env(monkeypatch)
223
+ sf = SFConnector(account="acct", user="u")
224
+
225
+ calls = {}
226
+
227
+ class _Writer:
228
+ def mode(self, m):
229
+ calls["mode"] = m
230
+ return self
231
+
232
+ def saveAsTable(self, name):
233
+ calls["name"] = name
234
+
235
+ class _DF:
236
+ write = _Writer()
237
+
238
+ sf.save_to_uc(_DF(), table="tbl", schema="sch", catalog="cat", mode="append")
239
+ assert calls == {"mode": "append", "name": "cat.sch.tbl"}
240
+
241
+
242
+ def test_save_to_uc_requires_table(monkeypatch):
243
+ _clear_snowflake_env(monkeypatch)
244
+ sf = SFConnector(account="acct", user="u")
245
+ with pytest.raises(ValueError, match="table name is required"):
246
+ sf.save_to_uc(object(), table="")
247
+
248
+
179
249
  def test_missing_account_raises(monkeypatch):
180
250
  _clear_snowflake_env(monkeypatch)
181
251
  with pytest.raises(ValueError):
@@ -247,20 +317,3 @@ def test_sql_return_pandas(monkeypatch):
247
317
  sf.sql("select 1", return_pandas=True)
248
318
 
249
319
  df.toPandas.assert_called_once()
250
-
251
-
252
- def test_save_table(monkeypatch):
253
- _clear_snowflake_env(monkeypatch)
254
- spark, _ = _mock_spark()
255
- sf = SFConnector(account="acct", user="u", password="p", spark=spark)
256
-
257
- df = MagicMock()
258
- sf.save_table(df, "cds.my_table", mode="append")
259
-
260
- df.write.format.assert_called_once_with("net.snowflake.spark.snowflake")
261
- writer = df.write.format.return_value
262
- options_passed = writer.options.call_args.kwargs
263
- assert options_passed["dbtable"] == "cds.my_table"
264
- assert options_passed["column_mapping"] == "name"
265
- writer.options.return_value.mode.assert_called_once_with("append")
266
- writer.options.return_value.mode.return_value.save.assert_called_once()