ml-analytics-tools 0.4.3__tar.gz → 0.4.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ml_analytics_tools-0.4.3/ml_analytics_tools.egg-info → ml_analytics_tools-0.4.4}/PKG-INFO +9 -1
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/README.md +8 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/__init__.py +2 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/data_connector.py +4 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/sf_connector.py +223 -5
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/utils.py +56 -36
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4/ml_analytics_tools.egg-info}/PKG-INFO +9 -1
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/pyproject.toml +1 -1
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_sf_connector.py +124 -2
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/LICENSE +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/aws_auth.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/cli.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/gsheet_connector.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/model_manager.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/model_tools.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/s3_connector.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/slack_connector.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/tunnel_manager.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics_tools.egg-info/SOURCES.txt +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics_tools.egg-info/dependency_links.txt +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics_tools.egg-info/entry_points.txt +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics_tools.egg-info/requires.txt +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics_tools.egg-info/top_level.txt +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/setup.cfg +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_aws_auth.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_db_s3.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_gsheet_connector.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_identity_column.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_model_manager.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_model_tools.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_s3_redshift_validation.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_tunnel_manager.py +0 -0
- {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ml-analytics-tools
|
|
3
|
-
Version: 0.4.3
|
|
3
|
+
Version: 0.4.4
|
|
4
4
|
Summary: Tools for ML projects and data management
|
|
5
5
|
Requires-Python: >=3.11
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -220,6 +220,14 @@ sf.sql("queries/experiment.sql", save_table=True, schema="analytics", table="exp
|
|
|
220
220
|
|
|
221
221
|
# or save any Spark DataFrame to Unity Catalog
|
|
222
222
|
sf.save_to_uc(df, table="exp", schema="analytics", catalog="prod")
|
|
223
|
+
|
|
224
|
+
# save a YAML-ordered folder of SQL queries as Unity Catalog tables
|
|
225
|
+
df = sf.save_pipeline_to_uc(
|
|
226
|
+
"queries/churn_pipeline",
|
|
227
|
+
pipeline="daily",
|
|
228
|
+
catalog="prod",
|
|
229
|
+
schema="analytics",
|
|
230
|
+
)
|
|
223
231
|
```
|
|
224
232
|
|
|
225
233
|
Credentials resolve per field as: explicit argument → `SNOWFLAKE_*` environment
|
|
@@ -185,6 +185,14 @@ sf.sql("queries/experiment.sql", save_table=True, schema="analytics", table="exp
|
|
|
185
185
|
|
|
186
186
|
# or save any Spark DataFrame to Unity Catalog
|
|
187
187
|
sf.save_to_uc(df, table="exp", schema="analytics", catalog="prod")
|
|
188
|
+
|
|
189
|
+
# save a YAML-ordered folder of SQL queries as Unity Catalog tables
|
|
190
|
+
df = sf.save_pipeline_to_uc(
|
|
191
|
+
"queries/churn_pipeline",
|
|
192
|
+
pipeline="daily",
|
|
193
|
+
catalog="prod",
|
|
194
|
+
schema="analytics",
|
|
195
|
+
)
|
|
188
196
|
```
|
|
189
197
|
|
|
190
198
|
Credentials resolve per field as: explicit argument → `SNOWFLAKE_*` environment
|
|
@@ -19,6 +19,7 @@ from .utils import (
|
|
|
19
19
|
get_sql_files,
|
|
20
20
|
load_sql_query,
|
|
21
21
|
log_and_raise_error,
|
|
22
|
+
resolve_sql_query_paths,
|
|
22
23
|
)
|
|
23
24
|
|
|
24
25
|
# Automatically load .env file when the package is imported
|
|
@@ -50,6 +51,7 @@ __all__ = [
|
|
|
50
51
|
"load_sql_query",
|
|
51
52
|
"log_and_raise_error",
|
|
52
53
|
"ModelManager",
|
|
54
|
+
"resolve_sql_query_paths",
|
|
53
55
|
"S3Connector",
|
|
54
56
|
"SFConnector",
|
|
55
57
|
"SlackConnector",
|
|
@@ -232,6 +232,7 @@ class DataConnector:
|
|
|
232
232
|
schema=None,
|
|
233
233
|
role=None,
|
|
234
234
|
authenticator=None,
|
|
235
|
+
token=None,
|
|
235
236
|
private_key=None,
|
|
236
237
|
private_key_path=None,
|
|
237
238
|
private_key_passphrase=None,
|
|
@@ -262,6 +263,7 @@ class DataConnector:
|
|
|
262
263
|
schema=schema,
|
|
263
264
|
role=role,
|
|
264
265
|
authenticator=authenticator,
|
|
266
|
+
token=token,
|
|
265
267
|
private_key=private_key,
|
|
266
268
|
private_key_path=private_key_path,
|
|
267
269
|
private_key_passphrase=private_key_passphrase,
|
|
@@ -335,6 +337,7 @@ class DataConnector:
|
|
|
335
337
|
schema=None,
|
|
336
338
|
role=None,
|
|
337
339
|
authenticator=None,
|
|
340
|
+
token=None,
|
|
338
341
|
private_key=None,
|
|
339
342
|
private_key_path=None,
|
|
340
343
|
private_key_passphrase=None,
|
|
@@ -361,6 +364,7 @@ class DataConnector:
|
|
|
361
364
|
)
|
|
362
365
|
token = _get_snowflake_config_value(
|
|
363
366
|
"SNOWFLAKE_TOKEN",
|
|
367
|
+
explicit=token,
|
|
364
368
|
secret_scope=secret_scope,
|
|
365
369
|
aliases=("SNOWFLAKE_OAUTH_TOKEN", "SNOWFLAKE_ACCESS_TOKEN"),
|
|
366
370
|
)
|
|
@@ -18,7 +18,7 @@ from .data_connector import (
|
|
|
18
18
|
_load_private_key_pem_for_spark,
|
|
19
19
|
_snowflake_secret_scope,
|
|
20
20
|
)
|
|
21
|
-
from .utils import get_logger, load_sql_query, log_and_raise_error
|
|
21
|
+
from .utils import get_logger, load_sql_query, log_and_raise_error, resolve_sql_query_paths
|
|
22
22
|
|
|
23
23
|
# Cached Spark session shared across SFConnector instances. Populated lazily by
|
|
24
24
|
# get_spark(); never created at import time so the package stays importable
|
|
@@ -254,6 +254,10 @@ class SFConnector:
|
|
|
254
254
|
schema: str = None,
|
|
255
255
|
catalog: str = None,
|
|
256
256
|
mode: str = "overwrite",
|
|
257
|
+
optimize: bool = True,
|
|
258
|
+
zorder_by=None,
|
|
259
|
+
merge_schema: bool = True,
|
|
260
|
+
comment: str = None,
|
|
257
261
|
**kwargs,
|
|
258
262
|
):
|
|
259
263
|
"""
|
|
@@ -284,6 +288,15 @@ class SFConnector:
|
|
|
284
288
|
mode : str, optional
|
|
285
289
|
Spark write mode for the saved table ('overwrite', 'append',
|
|
286
290
|
'ignore', 'error'). Defaults to 'overwrite'.
|
|
291
|
+
optimize : bool, optional
|
|
292
|
+
If saving to Unity Catalog, run ``OPTIMIZE`` after the write.
|
|
293
|
+
Defaults to True.
|
|
294
|
+
zorder_by : str or list[str], optional
|
|
295
|
+
Optional columns for Delta ``ZORDER BY`` during optimize.
|
|
296
|
+
merge_schema : bool, optional
|
|
297
|
+
If saving to Unity Catalog, set Delta ``mergeSchema=true``. Defaults to True.
|
|
298
|
+
comment : str, optional
|
|
299
|
+
Optional table comment stored as a Unity Catalog table property.
|
|
287
300
|
**kwargs
|
|
288
301
|
Template variables substituted into the SQL file using ``str.format()``.
|
|
289
302
|
"""
|
|
@@ -295,12 +308,120 @@ class SFConnector:
|
|
|
295
308
|
log_and_raise_error(self._logger, f"Error reading from Snowflake: {e}")
|
|
296
309
|
|
|
297
310
|
if save_table:
|
|
298
|
-
self.save_to_uc(df, table=table, schema=schema, catalog=catalog, mode=mode)
|
|
311
|
+
self.save_to_uc(
|
|
312
|
+
df,
|
|
313
|
+
table=table,
|
|
314
|
+
schema=schema,
|
|
315
|
+
catalog=catalog,
|
|
316
|
+
mode=mode,
|
|
317
|
+
optimize=optimize,
|
|
318
|
+
zorder_by=zorder_by,
|
|
319
|
+
merge_schema=merge_schema,
|
|
320
|
+
comment=comment,
|
|
321
|
+
)
|
|
299
322
|
|
|
300
323
|
if return_pandas:
|
|
301
324
|
return df.toPandas()
|
|
302
325
|
return df
|
|
303
326
|
|
|
327
|
+
def save_pipeline_to_uc(
|
|
328
|
+
self,
|
|
329
|
+
query_paths,
|
|
330
|
+
*,
|
|
331
|
+
pipeline: str | None = None,
|
|
332
|
+
catalog: str = None,
|
|
333
|
+
schema: str = None,
|
|
334
|
+
tables: dict[str, str] = None,
|
|
335
|
+
table_prefix: str = "",
|
|
336
|
+
table_suffix: str = "",
|
|
337
|
+
mode: str = "overwrite",
|
|
338
|
+
modes: dict[str, str] = None,
|
|
339
|
+
optimize: bool = True,
|
|
340
|
+
zorder_by=None,
|
|
341
|
+
merge_schema: bool = True,
|
|
342
|
+
comment: str = None,
|
|
343
|
+
comments: dict[str, str] = None,
|
|
344
|
+
return_all: bool = False,
|
|
345
|
+
**kwargs,
|
|
346
|
+
):
|
|
347
|
+
"""
|
|
348
|
+
Run YAML-ordered Snowflake queries and save each result as a Unity Catalog table.
|
|
349
|
+
|
|
350
|
+
This is a convenience wrapper around ``sql(..., save_table=True)``. It
|
|
351
|
+
uses the same folder/YAML resolution as ``execute_sql_scripts``:
|
|
352
|
+
``steps`` define the SQL files to run and their order.
|
|
353
|
+
|
|
354
|
+
Parameters
|
|
355
|
+
----------
|
|
356
|
+
query_paths
|
|
357
|
+
Folder, file, list, or ordered dict of SQL files.
|
|
358
|
+
pipeline
|
|
359
|
+
Optional YAML pipeline name.
|
|
360
|
+
catalog, schema
|
|
361
|
+
Default Unity Catalog destination for unqualified table names.
|
|
362
|
+
tables
|
|
363
|
+
Optional mapping of step name to destination table. Values may be
|
|
364
|
+
unqualified (using ``catalog`` / ``schema``) or fully qualified.
|
|
365
|
+
table_prefix, table_suffix
|
|
366
|
+
Applied to step names when ``tables`` does not define a destination.
|
|
367
|
+
mode
|
|
368
|
+
Default Spark write mode for every table.
|
|
369
|
+
modes
|
|
370
|
+
Optional mapping of step name to Spark write mode.
|
|
371
|
+
optimize
|
|
372
|
+
If True, run ``OPTIMIZE`` after saving each Unity Catalog table.
|
|
373
|
+
zorder_by
|
|
374
|
+
Optional columns for Delta ``ZORDER BY``. Pass a dict to configure
|
|
375
|
+
columns per step, or a string/list to use the same columns for every
|
|
376
|
+
saved table.
|
|
377
|
+
merge_schema
|
|
378
|
+
If True, set Delta ``mergeSchema=true`` for every saved table.
|
|
379
|
+
comment
|
|
380
|
+
Optional table comment applied to every saved table.
|
|
381
|
+
comments
|
|
382
|
+
Optional mapping of step name to table comment.
|
|
383
|
+
return_all
|
|
384
|
+
If True, return a dict of step name to Spark DataFrame. Otherwise
|
|
385
|
+
return the last step's Spark DataFrame.
|
|
386
|
+
**kwargs
|
|
387
|
+
Template variables substituted into SQL files via ``str.format()``.
|
|
388
|
+
"""
|
|
389
|
+
resolved_paths = resolve_sql_query_paths(query_paths, pipeline=pipeline)
|
|
390
|
+
if not resolved_paths:
|
|
391
|
+
log_and_raise_error(self._logger, "No SQL files found for pipeline.")
|
|
392
|
+
|
|
393
|
+
tables = tables or {}
|
|
394
|
+
modes = modes or {}
|
|
395
|
+
comments = comments or {}
|
|
396
|
+
results = {}
|
|
397
|
+
last_df = None
|
|
398
|
+
|
|
399
|
+
for name, query_path in resolved_paths.items():
|
|
400
|
+
destination = tables.get(name) or f"{table_prefix}{name}{table_suffix}"
|
|
401
|
+
if not destination:
|
|
402
|
+
log_and_raise_error(self._logger, f"No Unity Catalog table configured for step '{name}'.")
|
|
403
|
+
|
|
404
|
+
step_mode = modes.get(name, mode)
|
|
405
|
+
step_zorder_by = zorder_by.get(name) if isinstance(zorder_by, dict) else zorder_by
|
|
406
|
+
step_comment = comments.get(name, comment)
|
|
407
|
+
self._logger.info(f"[{name}] saving to Unity Catalog table {destination} (mode={step_mode}) ...")
|
|
408
|
+
last_df = self.sql(
|
|
409
|
+
str(query_path),
|
|
410
|
+
save_table=True,
|
|
411
|
+
table=destination,
|
|
412
|
+
schema=schema,
|
|
413
|
+
catalog=catalog,
|
|
414
|
+
mode=step_mode,
|
|
415
|
+
optimize=optimize,
|
|
416
|
+
zorder_by=step_zorder_by,
|
|
417
|
+
merge_schema=merge_schema,
|
|
418
|
+
comment=step_comment,
|
|
419
|
+
**kwargs,
|
|
420
|
+
)
|
|
421
|
+
results[name] = last_df
|
|
422
|
+
|
|
423
|
+
return results if return_all else last_df
|
|
424
|
+
|
|
304
425
|
@staticmethod
|
|
305
426
|
def _qualified_uc_name(table: str, schema: str = None, catalog: str = None) -> str:
|
|
306
427
|
"""Build a Unity Catalog table identifier from its parts.
|
|
@@ -313,12 +434,91 @@ class SFConnector:
|
|
|
313
434
|
parts = [part for part in (catalog, schema, table) if part]
|
|
314
435
|
return ".".join(parts)
|
|
315
436
|
|
|
316
|
-
|
|
437
|
+
@staticmethod
|
|
438
|
+
def _zorder_clause(zorder_by=None) -> str:
|
|
439
|
+
"""Build the optional Delta ZORDER BY clause."""
|
|
440
|
+
if not zorder_by:
|
|
441
|
+
return ""
|
|
442
|
+
if isinstance(zorder_by, str):
|
|
443
|
+
columns = [column.strip() for column in zorder_by.split(",")]
|
|
444
|
+
else:
|
|
445
|
+
columns = [str(column).strip() for column in zorder_by]
|
|
446
|
+
columns = [column for column in columns if column]
|
|
447
|
+
if not columns:
|
|
448
|
+
return ""
|
|
449
|
+
return f" ZORDER BY ({', '.join(columns)})"
|
|
450
|
+
|
|
451
|
+
@staticmethod
|
|
452
|
+
def _sql_string_literal(value: str) -> str:
|
|
453
|
+
"""Escape a value for use inside a single-quoted SQL string literal."""
|
|
454
|
+
return str(value).replace("'", "''")
|
|
455
|
+
|
|
456
|
+
def set_uc_table_comment(self, table: str, comment: str, schema: str = None, catalog: str = None, spark=None):
|
|
457
|
+
"""
|
|
458
|
+
Set a Unity Catalog table comment using Databricks table properties.
|
|
459
|
+
|
|
460
|
+
Parameters
|
|
461
|
+
----------
|
|
462
|
+
table
|
|
463
|
+
Table name. May be fully qualified.
|
|
464
|
+
comment
|
|
465
|
+
Comment text to store.
|
|
466
|
+
schema, catalog
|
|
467
|
+
Optional qualifiers when ``table`` is not fully qualified.
|
|
468
|
+
spark
|
|
469
|
+
Optional SparkSession to use. Defaults to this connector's Spark session.
|
|
470
|
+
"""
|
|
471
|
+
full_name = self._qualified_uc_name(table, schema=schema, catalog=catalog)
|
|
472
|
+
spark = spark or self._get_spark()
|
|
473
|
+
escaped_comment = self._sql_string_literal(comment)
|
|
474
|
+
try:
|
|
475
|
+
spark.sql(f"ALTER TABLE {full_name} SET TBLPROPERTIES ('comment' = '{escaped_comment}')")
|
|
476
|
+
except Exception as e:
|
|
477
|
+
log_and_raise_error(self._logger, f"Error setting comment for Unity Catalog table '{full_name}': {e}")
|
|
478
|
+
self._logger.info(f"Comment set for Unity Catalog table '{full_name}'.")
|
|
479
|
+
|
|
480
|
+
def optimize_uc_table(self, table: str, schema: str = None, catalog: str = None, zorder_by=None, spark=None):
|
|
481
|
+
"""
|
|
482
|
+
Run Databricks Delta ``OPTIMIZE`` on a Unity Catalog table.
|
|
483
|
+
|
|
484
|
+
Parameters
|
|
485
|
+
----------
|
|
486
|
+
table
|
|
487
|
+
Table name. May be fully qualified.
|
|
488
|
+
schema, catalog
|
|
489
|
+
Optional qualifiers when ``table`` is not fully qualified.
|
|
490
|
+
zorder_by
|
|
491
|
+
Optional column or columns for ``ZORDER BY``.
|
|
492
|
+
spark
|
|
493
|
+
Optional SparkSession to use. Defaults to this connector's Spark session.
|
|
494
|
+
"""
|
|
495
|
+
full_name = self._qualified_uc_name(table, schema=schema, catalog=catalog)
|
|
496
|
+
spark = spark or self._get_spark()
|
|
497
|
+
optimize_sql = f"OPTIMIZE {full_name}{self._zorder_clause(zorder_by)}"
|
|
498
|
+
try:
|
|
499
|
+
spark.sql(optimize_sql)
|
|
500
|
+
except Exception as e:
|
|
501
|
+
log_and_raise_error(self._logger, f"Error optimizing Unity Catalog table '{full_name}': {e}")
|
|
502
|
+
self._logger.info(f"Table '{full_name}' optimized.")
|
|
503
|
+
|
|
504
|
+
def save_to_uc(
|
|
505
|
+
self,
|
|
506
|
+
df,
|
|
507
|
+
table: str,
|
|
508
|
+
schema: str = None,
|
|
509
|
+
catalog: str = None,
|
|
510
|
+
mode: str = "overwrite",
|
|
511
|
+
optimize: bool = True,
|
|
512
|
+
zorder_by=None,
|
|
513
|
+
merge_schema: bool = True,
|
|
514
|
+
comment: str = None,
|
|
515
|
+
):
|
|
317
516
|
"""
|
|
318
517
|
Write a Spark DataFrame to a Databricks Unity Catalog table.
|
|
319
518
|
|
|
320
519
|
Uses Spark's native ``df.write.saveAsTable(...)`` (a managed UC table),
|
|
321
|
-
not the Snowflake connector.
|
|
520
|
+
not the Snowflake connector. By default, runs Delta ``OPTIMIZE`` after
|
|
521
|
+
the write.
|
|
322
522
|
|
|
323
523
|
Parameters
|
|
324
524
|
----------
|
|
@@ -333,13 +533,31 @@ class SFConnector:
|
|
|
333
533
|
mode : str, optional
|
|
334
534
|
Spark write mode: 'overwrite', 'append', 'ignore', or 'error'.
|
|
335
535
|
Defaults to 'overwrite'.
|
|
536
|
+
optimize : bool, optional
|
|
537
|
+
If True, run ``OPTIMIZE`` after saving. Defaults to True.
|
|
538
|
+
zorder_by : str or list[str], optional
|
|
539
|
+
Optional columns for Delta ``ZORDER BY`` during optimize.
|
|
540
|
+
merge_schema : bool, optional
|
|
541
|
+
If True, writes as Delta with ``mergeSchema=true``. Defaults to True.
|
|
542
|
+
comment : str, optional
|
|
543
|
+
Optional table comment stored as a Unity Catalog table property.
|
|
336
544
|
"""
|
|
337
545
|
if not table:
|
|
338
546
|
log_and_raise_error(self._logger, "A destination table name is required.")
|
|
339
547
|
|
|
340
548
|
full_name = self._qualified_uc_name(table, schema=schema, catalog=catalog)
|
|
549
|
+
spark = getattr(df, "sparkSession", None) or self._spark
|
|
341
550
|
try:
|
|
342
|
-
df.write.mode(mode).saveAsTable(full_name)
|
|
551
|
+
writer = df.write.format("delta")
|
|
552
|
+
if merge_schema:
|
|
553
|
+
writer = writer.option("mergeSchema", "true")
|
|
554
|
+
writer.mode(mode).saveAsTable(full_name)
|
|
343
555
|
except Exception as e:
|
|
344
556
|
log_and_raise_error(self._logger, f"Error writing to Unity Catalog table '{full_name}': {e}")
|
|
345
557
|
self._logger.info(f"Table '{full_name}' written to Unity Catalog (mode={mode}).")
|
|
558
|
+
|
|
559
|
+
if comment is not None:
|
|
560
|
+
self.set_uc_table_comment(full_name, comment, spark=spark)
|
|
561
|
+
|
|
562
|
+
if optimize:
|
|
563
|
+
self.optimize_uc_table(full_name, zorder_by=zorder_by, spark=spark)
|
|
@@ -624,6 +624,61 @@ def _is_select_statement(statement: str) -> bool:
|
|
|
624
624
|
return True
|
|
625
625
|
|
|
626
626
|
|
|
627
|
+
def resolve_sql_query_paths(query_paths, pipeline: str | None = None) -> dict[str, Path]:
|
|
628
|
+
"""
|
|
629
|
+
Normalize SQL pipeline input into an ordered mapping of query name to file path.
|
|
630
|
+
|
|
631
|
+
Args:
|
|
632
|
+
query_paths: one of:
|
|
633
|
+
- str: relative folder path from project root; SQL files are discovered
|
|
634
|
+
via get_sql_files() (respects pipeline.yaml if present).
|
|
635
|
+
- Path pointing to a directory: same as str, resolved relative to project root.
|
|
636
|
+
- Path pointing to a single .sql file: executes that file only.
|
|
637
|
+
- list[str | Path]: ordered list of individual SQL file paths.
|
|
638
|
+
- dict[str, str | Path]: explicit ordered mapping of name -> path; preserves insertion order.
|
|
639
|
+
pipeline: Optional pipeline name passed to get_sql_files() when query_paths is a folder.
|
|
640
|
+
|
|
641
|
+
Returns:
|
|
642
|
+
Ordered dict[str, Path].
|
|
643
|
+
"""
|
|
644
|
+
logger = get_logger("ml_analytics.utils.resolve_sql_query_paths")
|
|
645
|
+
|
|
646
|
+
if isinstance(query_paths, str):
|
|
647
|
+
try:
|
|
648
|
+
project_root = find_project_root()
|
|
649
|
+
candidate = project_root / query_paths
|
|
650
|
+
except FileNotFoundError:
|
|
651
|
+
candidate = Path(query_paths)
|
|
652
|
+
if candidate.is_file():
|
|
653
|
+
return {candidate.stem: candidate}
|
|
654
|
+
resolved = get_sql_files(query_paths, pipeline=pipeline)
|
|
655
|
+
if not resolved:
|
|
656
|
+
log_and_raise_error(logger, f"No SQL files found for folder '{query_paths}'.")
|
|
657
|
+
return resolved
|
|
658
|
+
|
|
659
|
+
if isinstance(query_paths, Path):
|
|
660
|
+
if query_paths.is_dir():
|
|
661
|
+
try:
|
|
662
|
+
project_root = find_project_root()
|
|
663
|
+
relative = query_paths.relative_to(project_root)
|
|
664
|
+
except ValueError:
|
|
665
|
+
relative = query_paths
|
|
666
|
+
resolved = get_sql_files(str(relative), pipeline=pipeline)
|
|
667
|
+
if not resolved:
|
|
668
|
+
log_and_raise_error(logger, f"No SQL files found in directory '{query_paths}'.")
|
|
669
|
+
return resolved
|
|
670
|
+
return {query_paths.stem: query_paths}
|
|
671
|
+
|
|
672
|
+
if isinstance(query_paths, list):
|
|
673
|
+
return {Path(p).stem: Path(p) for p in query_paths}
|
|
674
|
+
|
|
675
|
+
if isinstance(query_paths, dict):
|
|
676
|
+
return {k: Path(v) if isinstance(v, str) else v for k, v in query_paths.items()}
|
|
677
|
+
|
|
678
|
+
log_and_raise_error(logger, f"Expected a folder path, list, or dict, got: {type(query_paths)}")
|
|
679
|
+
return {}
|
|
680
|
+
|
|
681
|
+
|
|
627
682
|
def execute_sql_scripts(
|
|
628
683
|
query_paths,
|
|
629
684
|
data_connector=None,
|
|
@@ -662,42 +717,7 @@ def execute_sql_scripts(
|
|
|
662
717
|
|
|
663
718
|
from ml_analytics.data_connector import DataConnector
|
|
664
719
|
|
|
665
|
-
|
|
666
|
-
if isinstance(query_paths, str):
|
|
667
|
-
try:
|
|
668
|
-
project_root = find_project_root()
|
|
669
|
-
candidate = project_root / query_paths
|
|
670
|
-
except FileNotFoundError:
|
|
671
|
-
candidate = Path(query_paths)
|
|
672
|
-
if candidate.is_file():
|
|
673
|
-
query_paths = {candidate.stem: candidate}
|
|
674
|
-
else:
|
|
675
|
-
resolved = get_sql_files(query_paths, pipeline=pipeline)
|
|
676
|
-
if not resolved:
|
|
677
|
-
log_and_raise_error(logger, f"No SQL files found for folder '{query_paths}'.")
|
|
678
|
-
return
|
|
679
|
-
query_paths = resolved
|
|
680
|
-
elif isinstance(query_paths, Path):
|
|
681
|
-
if query_paths.is_dir():
|
|
682
|
-
try:
|
|
683
|
-
project_root = find_project_root()
|
|
684
|
-
relative = query_paths.relative_to(project_root)
|
|
685
|
-
except ValueError:
|
|
686
|
-
relative = query_paths
|
|
687
|
-
resolved = get_sql_files(str(relative), pipeline=pipeline)
|
|
688
|
-
if not resolved:
|
|
689
|
-
log_and_raise_error(logger, f"No SQL files found in directory '{query_paths}'.")
|
|
690
|
-
return
|
|
691
|
-
query_paths = resolved
|
|
692
|
-
else:
|
|
693
|
-
query_paths = {query_paths.stem: query_paths}
|
|
694
|
-
elif isinstance(query_paths, list):
|
|
695
|
-
query_paths = {Path(p).stem: Path(p) for p in query_paths}
|
|
696
|
-
elif isinstance(query_paths, dict):
|
|
697
|
-
query_paths = {k: Path(v) if isinstance(v, str) else v for k, v in query_paths.items()}
|
|
698
|
-
else:
|
|
699
|
-
log_and_raise_error(logger, f"Expected a folder path, list, or dict, got: {type(query_paths)}")
|
|
700
|
-
return
|
|
720
|
+
query_paths = resolve_sql_query_paths(query_paths, pipeline=pipeline)
|
|
701
721
|
|
|
702
722
|
def _run_scripts(dc):
|
|
703
723
|
"""Execute all scripts on the given DataConnector instance.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ml-analytics-tools
|
|
3
|
-
Version: 0.4.3
|
|
3
|
+
Version: 0.4.4
|
|
4
4
|
Summary: Tools for ML projects and data management
|
|
5
5
|
Requires-Python: >=3.11
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -220,6 +220,14 @@ sf.sql("queries/experiment.sql", save_table=True, schema="analytics", table="exp
|
|
|
220
220
|
|
|
221
221
|
# or save any Spark DataFrame to Unity Catalog
|
|
222
222
|
sf.save_to_uc(df, table="exp", schema="analytics", catalog="prod")
|
|
223
|
+
|
|
224
|
+
# save a YAML-ordered folder of SQL queries as Unity Catalog tables
|
|
225
|
+
df = sf.save_pipeline_to_uc(
|
|
226
|
+
"queries/churn_pipeline",
|
|
227
|
+
pipeline="daily",
|
|
228
|
+
catalog="prod",
|
|
229
|
+
schema="analytics",
|
|
230
|
+
)
|
|
223
231
|
```
|
|
224
232
|
|
|
225
233
|
Credentials resolve per field as: explicit argument → `SNOWFLAKE_*` environment
|
|
@@ -58,6 +58,7 @@ def _mock_spark():
|
|
|
58
58
|
"""Spark double whose read chain returns a DataFrame mock."""
|
|
59
59
|
spark = MagicMock()
|
|
60
60
|
df = MagicMock()
|
|
61
|
+
df.sparkSession = spark
|
|
61
62
|
reader = spark.read.format.return_value
|
|
62
63
|
reader.options.return_value.option.return_value.load.return_value = df
|
|
63
64
|
reader.options.return_value.load.return_value = df
|
|
@@ -220,11 +221,20 @@ def test_qualified_uc_name_already_qualified():
|
|
|
220
221
|
|
|
221
222
|
def test_save_to_uc_uses_saveastable(monkeypatch):
|
|
222
223
|
_clear_snowflake_env(monkeypatch)
|
|
223
|
-
|
|
224
|
+
spark = MagicMock()
|
|
225
|
+
sf = SFConnector(account="acct", user="u", spark=spark)
|
|
224
226
|
|
|
225
227
|
calls = {}
|
|
226
228
|
|
|
227
229
|
class _Writer:
|
|
230
|
+
def format(self, fmt):
|
|
231
|
+
calls["format"] = fmt
|
|
232
|
+
return self
|
|
233
|
+
|
|
234
|
+
def option(self, key, value):
|
|
235
|
+
calls["option"] = (key, value)
|
|
236
|
+
return self
|
|
237
|
+
|
|
228
238
|
def mode(self, m):
|
|
229
239
|
calls["mode"] = m
|
|
230
240
|
return self
|
|
@@ -234,9 +244,42 @@ def test_save_to_uc_uses_saveastable(monkeypatch):
|
|
|
234
244
|
|
|
235
245
|
class _DF:
|
|
236
246
|
write = _Writer()
|
|
247
|
+
sparkSession = spark
|
|
237
248
|
|
|
238
249
|
sf.save_to_uc(_DF(), table="tbl", schema="sch", catalog="cat", mode="append")
|
|
239
|
-
assert calls == {"mode": "append", "name": "cat.sch.tbl"}
|
|
250
|
+
assert calls == {
|
|
251
|
+
"format": "delta",
|
|
252
|
+
"option": ("mergeSchema", "true"),
|
|
253
|
+
"mode": "append",
|
|
254
|
+
"name": "cat.sch.tbl",
|
|
255
|
+
}
|
|
256
|
+
spark.sql.assert_called_once_with("OPTIMIZE cat.sch.tbl")
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def test_save_to_uc_can_zorder_comment_or_skip_optimize(monkeypatch):
|
|
260
|
+
_clear_snowflake_env(monkeypatch)
|
|
261
|
+
spark = MagicMock()
|
|
262
|
+
sf = SFConnector(account="acct", user="u", spark=spark)
|
|
263
|
+
|
|
264
|
+
df = MagicMock()
|
|
265
|
+
df.sparkSession = spark
|
|
266
|
+
|
|
267
|
+
sf.save_to_uc(
|
|
268
|
+
df,
|
|
269
|
+
table="tbl",
|
|
270
|
+
schema="sch",
|
|
271
|
+
catalog="cat",
|
|
272
|
+
zorder_by=["customer_id", "event_date"],
|
|
273
|
+
comment="Tutor's metrics",
|
|
274
|
+
)
|
|
275
|
+
assert [call.args[0] for call in spark.sql.call_args_list] == [
|
|
276
|
+
"ALTER TABLE cat.sch.tbl SET TBLPROPERTIES ('comment' = 'Tutor''s metrics')",
|
|
277
|
+
"OPTIMIZE cat.sch.tbl ZORDER BY (customer_id, event_date)",
|
|
278
|
+
]
|
|
279
|
+
|
|
280
|
+
spark.reset_mock()
|
|
281
|
+
sf.save_to_uc(df, table="tbl", schema="sch", catalog="cat", optimize=False)
|
|
282
|
+
spark.sql.assert_not_called()
|
|
240
283
|
|
|
241
284
|
|
|
242
285
|
def test_save_to_uc_requires_table(monkeypatch):
|
|
@@ -317,3 +360,82 @@ def test_sql_return_pandas(monkeypatch):
|
|
|
317
360
|
sf.sql("select 1", return_pandas=True)
|
|
318
361
|
|
|
319
362
|
df.toPandas.assert_called_once()
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def test_save_pipeline_to_uc_uses_yaml_order_and_file_stem_tables(monkeypatch, tmp_path):
|
|
366
|
+
_clear_snowflake_env(monkeypatch)
|
|
367
|
+
folder = tmp_path / "queries"
|
|
368
|
+
folder.mkdir()
|
|
369
|
+
(folder / "base.sql").write_text("SELECT '{run_date}' AS run_date;")
|
|
370
|
+
(folder / "features.sql").write_text("SELECT 1 AS feature;")
|
|
371
|
+
(folder / "daily.yaml").write_text(
|
|
372
|
+
"""
|
|
373
|
+
steps:
|
|
374
|
+
- features
|
|
375
|
+
- base
|
|
376
|
+
"""
|
|
377
|
+
)
|
|
378
|
+
monkeypatch.setattr("ml_analytics.utils.find_project_root", lambda *args, **kwargs: tmp_path)
|
|
379
|
+
|
|
380
|
+
spark, df = _mock_spark()
|
|
381
|
+
sf = SFConnector(account="acct", user="u", password="p", spark=spark)
|
|
382
|
+
|
|
383
|
+
result = sf.save_pipeline_to_uc(
|
|
384
|
+
"queries",
|
|
385
|
+
pipeline="daily",
|
|
386
|
+
catalog="prod",
|
|
387
|
+
schema="analytics",
|
|
388
|
+
run_date="2026-06-17",
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
assert result is df
|
|
392
|
+
query_calls = spark.read.format.return_value.options.return_value.option.call_args_list
|
|
393
|
+
assert [call.args for call in query_calls] == [
|
|
394
|
+
("query", "SELECT 1 AS feature;"),
|
|
395
|
+
("query", "SELECT '2026-06-17' AS run_date;"),
|
|
396
|
+
]
|
|
397
|
+
save_calls = df.write.format.return_value.option.return_value.mode.return_value.saveAsTable.call_args_list
|
|
398
|
+
assert [call.args[0] for call in save_calls] == [
|
|
399
|
+
"prod.analytics.features",
|
|
400
|
+
"prod.analytics.base",
|
|
401
|
+
]
|
|
402
|
+
assert [call.args[0] for call in spark.sql.call_args_list] == [
|
|
403
|
+
"OPTIMIZE prod.analytics.features",
|
|
404
|
+
"OPTIMIZE prod.analytics.base",
|
|
405
|
+
]
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def test_save_pipeline_to_uc_allows_table_and_mode_overrides(monkeypatch, tmp_path):
|
|
409
|
+
_clear_snowflake_env(monkeypatch)
|
|
410
|
+
folder = tmp_path / "queries"
|
|
411
|
+
folder.mkdir()
|
|
412
|
+
(folder / "base.sql").write_text("SELECT 1 AS col_1;")
|
|
413
|
+
(folder / "final.sql").write_text("SELECT 2 AS col_2;")
|
|
414
|
+
monkeypatch.setattr("ml_analytics.utils.find_project_root", lambda *args, **kwargs: tmp_path)
|
|
415
|
+
|
|
416
|
+
spark, df = _mock_spark()
|
|
417
|
+
sf = SFConnector(account="acct", user="u", password="p", spark=spark)
|
|
418
|
+
|
|
419
|
+
result = sf.save_pipeline_to_uc(
|
|
420
|
+
"queries",
|
|
421
|
+
schema="analytics",
|
|
422
|
+
catalog="prod",
|
|
423
|
+
tables={"final": "churn_daily"},
|
|
424
|
+
table_prefix="stg_",
|
|
425
|
+
modes={"final": "append"},
|
|
426
|
+
zorder_by={"final": "customer_id"},
|
|
427
|
+
return_all=True,
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
assert result == {"base": df, "final": df}
|
|
431
|
+
mode_calls = df.write.format.return_value.option.return_value.mode.call_args_list
|
|
432
|
+
assert [call.args[0] for call in mode_calls] == ["overwrite", "append"]
|
|
433
|
+
save_calls = df.write.format.return_value.option.return_value.mode.return_value.saveAsTable.call_args_list
|
|
434
|
+
assert [call.args[0] for call in save_calls] == [
|
|
435
|
+
"prod.analytics.stg_base",
|
|
436
|
+
"prod.analytics.churn_daily",
|
|
437
|
+
]
|
|
438
|
+
assert [call.args[0] for call in spark.sql.call_args_list] == [
|
|
439
|
+
"OPTIMIZE prod.analytics.stg_base",
|
|
440
|
+
"OPTIMIZE prod.analytics.churn_daily ZORDER BY (customer_id)",
|
|
441
|
+
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics_tools.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics_tools.egg-info/entry_points.txt
RENAMED
|
File without changes
|
{ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics_tools.egg-info/requires.txt
RENAMED
|
File without changes
|
{ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics_tools.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|