ml-analytics-tools 0.4.3__tar.gz → 0.4.4__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (33)
  1. {ml_analytics_tools-0.4.3/ml_analytics_tools.egg-info → ml_analytics_tools-0.4.4}/PKG-INFO +9 -1
  2. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/README.md +8 -0
  3. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/__init__.py +2 -0
  4. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/data_connector.py +4 -0
  5. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/sf_connector.py +223 -5
  6. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/utils.py +56 -36
  7. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4/ml_analytics_tools.egg-info}/PKG-INFO +9 -1
  8. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/pyproject.toml +1 -1
  9. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_sf_connector.py +124 -2
  10. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/LICENSE +0 -0
  11. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/aws_auth.py +0 -0
  12. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/cli.py +0 -0
  13. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/gsheet_connector.py +0 -0
  14. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/model_manager.py +0 -0
  15. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/model_tools.py +0 -0
  16. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/s3_connector.py +0 -0
  17. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/slack_connector.py +0 -0
  18. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics/tunnel_manager.py +0 -0
  19. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics_tools.egg-info/SOURCES.txt +0 -0
  20. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics_tools.egg-info/dependency_links.txt +0 -0
  21. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics_tools.egg-info/entry_points.txt +0 -0
  22. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics_tools.egg-info/requires.txt +0 -0
  23. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/ml_analytics_tools.egg-info/top_level.txt +0 -0
  24. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/setup.cfg +0 -0
  25. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_aws_auth.py +0 -0
  26. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_db_s3.py +0 -0
  27. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_gsheet_connector.py +0 -0
  28. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_identity_column.py +0 -0
  29. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_model_manager.py +0 -0
  30. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_model_tools.py +0 -0
  31. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_s3_redshift_validation.py +0 -0
  32. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_tunnel_manager.py +0 -0
  33. {ml_analytics_tools-0.4.3 → ml_analytics_tools-0.4.4}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ml-analytics-tools
3
- Version: 0.4.3
3
+ Version: 0.4.4
4
4
  Summary: Tools for ML projects and data management
5
5
  Requires-Python: >=3.11
6
6
  Description-Content-Type: text/markdown
@@ -220,6 +220,14 @@ sf.sql("queries/experiment.sql", save_table=True, schema="analytics", table="exp
220
220
 
221
221
  # or save any Spark DataFrame to Unity Catalog
222
222
  sf.save_to_uc(df, table="exp", schema="analytics", catalog="prod")
223
+
224
+ # save a YAML-ordered folder of SQL queries as Unity Catalog tables
225
+ df = sf.save_pipeline_to_uc(
226
+ "queries/churn_pipeline",
227
+ pipeline="daily",
228
+ catalog="prod",
229
+ schema="analytics",
230
+ )
223
231
  ```
224
232
 
225
233
  Credentials resolve per field as: explicit argument → `SNOWFLAKE_*` environment
@@ -185,6 +185,14 @@ sf.sql("queries/experiment.sql", save_table=True, schema="analytics", table="exp
185
185
 
186
186
  # or save any Spark DataFrame to Unity Catalog
187
187
  sf.save_to_uc(df, table="exp", schema="analytics", catalog="prod")
188
+
189
+ # save a YAML-ordered folder of SQL queries as Unity Catalog tables
190
+ df = sf.save_pipeline_to_uc(
191
+ "queries/churn_pipeline",
192
+ pipeline="daily",
193
+ catalog="prod",
194
+ schema="analytics",
195
+ )
188
196
  ```
189
197
 
190
198
  Credentials resolve per field as: explicit argument → `SNOWFLAKE_*` environment
@@ -19,6 +19,7 @@ from .utils import (
19
19
  get_sql_files,
20
20
  load_sql_query,
21
21
  log_and_raise_error,
22
+ resolve_sql_query_paths,
22
23
  )
23
24
 
24
25
  # Automatically load .env file when the package is imported
@@ -50,6 +51,7 @@ __all__ = [
50
51
  "load_sql_query",
51
52
  "log_and_raise_error",
52
53
  "ModelManager",
54
+ "resolve_sql_query_paths",
53
55
  "S3Connector",
54
56
  "SFConnector",
55
57
  "SlackConnector",
@@ -232,6 +232,7 @@ class DataConnector:
232
232
  schema=None,
233
233
  role=None,
234
234
  authenticator=None,
235
+ token=None,
235
236
  private_key=None,
236
237
  private_key_path=None,
237
238
  private_key_passphrase=None,
@@ -262,6 +263,7 @@ class DataConnector:
262
263
  schema=schema,
263
264
  role=role,
264
265
  authenticator=authenticator,
266
+ token=token,
265
267
  private_key=private_key,
266
268
  private_key_path=private_key_path,
267
269
  private_key_passphrase=private_key_passphrase,
@@ -335,6 +337,7 @@ class DataConnector:
335
337
  schema=None,
336
338
  role=None,
337
339
  authenticator=None,
340
+ token=None,
338
341
  private_key=None,
339
342
  private_key_path=None,
340
343
  private_key_passphrase=None,
@@ -361,6 +364,7 @@ class DataConnector:
361
364
  )
362
365
  token = _get_snowflake_config_value(
363
366
  "SNOWFLAKE_TOKEN",
367
+ explicit=token,
364
368
  secret_scope=secret_scope,
365
369
  aliases=("SNOWFLAKE_OAUTH_TOKEN", "SNOWFLAKE_ACCESS_TOKEN"),
366
370
  )
@@ -18,7 +18,7 @@ from .data_connector import (
18
18
  _load_private_key_pem_for_spark,
19
19
  _snowflake_secret_scope,
20
20
  )
21
- from .utils import get_logger, load_sql_query, log_and_raise_error
21
+ from .utils import get_logger, load_sql_query, log_and_raise_error, resolve_sql_query_paths
22
22
 
23
23
  # Cached Spark session shared across SFConnector instances. Populated lazily by
24
24
  # get_spark(); never created at import time so the package stays importable
@@ -254,6 +254,10 @@ class SFConnector:
254
254
  schema: str = None,
255
255
  catalog: str = None,
256
256
  mode: str = "overwrite",
257
+ optimize: bool = True,
258
+ zorder_by=None,
259
+ merge_schema: bool = True,
260
+ comment: str = None,
257
261
  **kwargs,
258
262
  ):
259
263
  """
@@ -284,6 +288,15 @@ class SFConnector:
284
288
  mode : str, optional
285
289
  Spark write mode for the saved table ('overwrite', 'append',
286
290
  'ignore', 'error'). Defaults to 'overwrite'.
291
+ optimize : bool, optional
292
+ If saving to Unity Catalog, run ``OPTIMIZE`` after the write.
293
+ Defaults to True.
294
+ zorder_by : str or list[str], optional
295
+ Optional columns for Delta ``ZORDER BY`` during optimize.
296
+ merge_schema : bool, optional
297
+ If saving to Unity Catalog, set Delta ``mergeSchema=true``. Defaults to True.
298
+ comment : str, optional
299
+ Optional table comment stored as a Unity Catalog table property.
287
300
  **kwargs
288
301
  Template variables substituted into the SQL file using ``str.format()``.
289
302
  """
@@ -295,12 +308,120 @@ class SFConnector:
295
308
  log_and_raise_error(self._logger, f"Error reading from Snowflake: {e}")
296
309
 
297
310
  if save_table:
298
- self.save_to_uc(df, table=table, schema=schema, catalog=catalog, mode=mode)
311
+ self.save_to_uc(
312
+ df,
313
+ table=table,
314
+ schema=schema,
315
+ catalog=catalog,
316
+ mode=mode,
317
+ optimize=optimize,
318
+ zorder_by=zorder_by,
319
+ merge_schema=merge_schema,
320
+ comment=comment,
321
+ )
299
322
 
300
323
  if return_pandas:
301
324
  return df.toPandas()
302
325
  return df
303
326
 
327
+ def save_pipeline_to_uc(
328
+ self,
329
+ query_paths,
330
+ *,
331
+ pipeline: str | None = None,
332
+ catalog: str = None,
333
+ schema: str = None,
334
+ tables: dict[str, str] = None,
335
+ table_prefix: str = "",
336
+ table_suffix: str = "",
337
+ mode: str = "overwrite",
338
+ modes: dict[str, str] = None,
339
+ optimize: bool = True,
340
+ zorder_by=None,
341
+ merge_schema: bool = True,
342
+ comment: str = None,
343
+ comments: dict[str, str] = None,
344
+ return_all: bool = False,
345
+ **kwargs,
346
+ ):
347
+ """
348
+ Run YAML-ordered Snowflake queries and save each result as a Unity Catalog table.
349
+
350
+ This is a convenience wrapper around ``sql(..., save_table=True)``. It
351
+ uses the same folder/YAML resolution as ``execute_sql_scripts``:
352
+ ``steps`` define the SQL files to run and their order.
353
+
354
+ Parameters
355
+ ----------
356
+ query_paths
357
+ Folder, file, list, or ordered dict of SQL files.
358
+ pipeline
359
+ Optional YAML pipeline name.
360
+ catalog, schema
361
+ Default Unity Catalog destination for unqualified table names.
362
+ tables
363
+ Optional mapping of step name to destination table. Values may be
364
+ unqualified (using ``catalog`` / ``schema``) or fully qualified.
365
+ table_prefix, table_suffix
366
+ Applied to step names when ``tables`` does not define a destination.
367
+ mode
368
+ Default Spark write mode for every table.
369
+ modes
370
+ Optional mapping of step name to Spark write mode.
371
+ optimize
372
+ If True, run ``OPTIMIZE`` after saving each Unity Catalog table.
373
+ zorder_by
374
+ Optional columns for Delta ``ZORDER BY``. Pass a dict to configure
375
+ columns per step, or a string/list to use the same columns for every
376
+ saved table.
377
+ merge_schema
378
+ If True, set Delta ``mergeSchema=true`` for every saved table.
379
+ comment
380
+ Optional table comment applied to every saved table.
381
+ comments
382
+ Optional mapping of step name to table comment.
383
+ return_all
384
+ If True, return a dict of step name to Spark DataFrame. Otherwise
385
+ return the last step's Spark DataFrame.
386
+ **kwargs
387
+ Template variables substituted into SQL files via ``str.format()``.
388
+ """
389
+ resolved_paths = resolve_sql_query_paths(query_paths, pipeline=pipeline)
390
+ if not resolved_paths:
391
+ log_and_raise_error(self._logger, "No SQL files found for pipeline.")
392
+
393
+ tables = tables or {}
394
+ modes = modes or {}
395
+ comments = comments or {}
396
+ results = {}
397
+ last_df = None
398
+
399
+ for name, query_path in resolved_paths.items():
400
+ destination = tables.get(name) or f"{table_prefix}{name}{table_suffix}"
401
+ if not destination:
402
+ log_and_raise_error(self._logger, f"No Unity Catalog table configured for step '{name}'.")
403
+
404
+ step_mode = modes.get(name, mode)
405
+ step_zorder_by = zorder_by.get(name) if isinstance(zorder_by, dict) else zorder_by
406
+ step_comment = comments.get(name, comment)
407
+ self._logger.info(f"[{name}] saving to Unity Catalog table {destination} (mode={step_mode}) ...")
408
+ last_df = self.sql(
409
+ str(query_path),
410
+ save_table=True,
411
+ table=destination,
412
+ schema=schema,
413
+ catalog=catalog,
414
+ mode=step_mode,
415
+ optimize=optimize,
416
+ zorder_by=step_zorder_by,
417
+ merge_schema=merge_schema,
418
+ comment=step_comment,
419
+ **kwargs,
420
+ )
421
+ results[name] = last_df
422
+
423
+ return results if return_all else last_df
424
+
304
425
  @staticmethod
305
426
  def _qualified_uc_name(table: str, schema: str = None, catalog: str = None) -> str:
306
427
  """Build a Unity Catalog table identifier from its parts.
@@ -313,12 +434,91 @@ class SFConnector:
313
434
  parts = [part for part in (catalog, schema, table) if part]
314
435
  return ".".join(parts)
315
436
 
316
- def save_to_uc(self, df, table: str, schema: str = None, catalog: str = None, mode: str = "overwrite"):
437
+ @staticmethod
438
+ def _zorder_clause(zorder_by=None) -> str:
439
+ """Build the optional Delta ZORDER BY clause."""
440
+ if not zorder_by:
441
+ return ""
442
+ if isinstance(zorder_by, str):
443
+ columns = [column.strip() for column in zorder_by.split(",")]
444
+ else:
445
+ columns = [str(column).strip() for column in zorder_by]
446
+ columns = [column for column in columns if column]
447
+ if not columns:
448
+ return ""
449
+ return f" ZORDER BY ({', '.join(columns)})"
450
+
451
+ @staticmethod
452
+ def _sql_string_literal(value: str) -> str:
453
+ """Escape a value for use inside a single-quoted SQL string literal."""
454
+ return str(value).replace("'", "''")
455
+
456
+ def set_uc_table_comment(self, table: str, comment: str, schema: str = None, catalog: str = None, spark=None):
457
+ """
458
+ Set a Unity Catalog table comment using Databricks table properties.
459
+
460
+ Parameters
461
+ ----------
462
+ table
463
+ Table name. May be fully qualified.
464
+ comment
465
+ Comment text to store.
466
+ schema, catalog
467
+ Optional qualifiers when ``table`` is not fully qualified.
468
+ spark
469
+ Optional SparkSession to use. Defaults to this connector's Spark session.
470
+ """
471
+ full_name = self._qualified_uc_name(table, schema=schema, catalog=catalog)
472
+ spark = spark or self._get_spark()
473
+ escaped_comment = self._sql_string_literal(comment)
474
+ try:
475
+ spark.sql(f"ALTER TABLE {full_name} SET TBLPROPERTIES ('comment' = '{escaped_comment}')")
476
+ except Exception as e:
477
+ log_and_raise_error(self._logger, f"Error setting comment for Unity Catalog table '{full_name}': {e}")
478
+ self._logger.info(f"Comment set for Unity Catalog table '{full_name}'.")
479
+
480
+ def optimize_uc_table(self, table: str, schema: str = None, catalog: str = None, zorder_by=None, spark=None):
481
+ """
482
+ Run Databricks Delta ``OPTIMIZE`` on a Unity Catalog table.
483
+
484
+ Parameters
485
+ ----------
486
+ table
487
+ Table name. May be fully qualified.
488
+ schema, catalog
489
+ Optional qualifiers when ``table`` is not fully qualified.
490
+ zorder_by
491
+ Optional column or columns for ``ZORDER BY``.
492
+ spark
493
+ Optional SparkSession to use. Defaults to this connector's Spark session.
494
+ """
495
+ full_name = self._qualified_uc_name(table, schema=schema, catalog=catalog)
496
+ spark = spark or self._get_spark()
497
+ optimize_sql = f"OPTIMIZE {full_name}{self._zorder_clause(zorder_by)}"
498
+ try:
499
+ spark.sql(optimize_sql)
500
+ except Exception as e:
501
+ log_and_raise_error(self._logger, f"Error optimizing Unity Catalog table '{full_name}': {e}")
502
+ self._logger.info(f"Table '{full_name}' optimized.")
503
+
504
+ def save_to_uc(
505
+ self,
506
+ df,
507
+ table: str,
508
+ schema: str = None,
509
+ catalog: str = None,
510
+ mode: str = "overwrite",
511
+ optimize: bool = True,
512
+ zorder_by=None,
513
+ merge_schema: bool = True,
514
+ comment: str = None,
515
+ ):
317
516
  """
318
517
  Write a Spark DataFrame to a Databricks Unity Catalog table.
319
518
 
320
519
  Uses Spark's native ``df.write.saveAsTable(...)`` (a managed UC table),
321
- not the Snowflake connector.
520
+ not the Snowflake connector. By default, runs Delta ``OPTIMIZE`` after
521
+ the write.
322
522
 
323
523
  Parameters
324
524
  ----------
@@ -333,13 +533,31 @@ class SFConnector:
333
533
  mode : str, optional
334
534
  Spark write mode: 'overwrite', 'append', 'ignore', or 'error'.
335
535
  Defaults to 'overwrite'.
536
+ optimize : bool, optional
537
+ If True, run ``OPTIMIZE`` after saving. Defaults to True.
538
+ zorder_by : str or list[str], optional
539
+ Optional columns for Delta ``ZORDER BY`` during optimize.
540
+ merge_schema : bool, optional
541
+ If True, writes as Delta with ``mergeSchema=true``. Defaults to True.
542
+ comment : str, optional
543
+ Optional table comment stored as a Unity Catalog table property.
336
544
  """
337
545
  if not table:
338
546
  log_and_raise_error(self._logger, "A destination table name is required.")
339
547
 
340
548
  full_name = self._qualified_uc_name(table, schema=schema, catalog=catalog)
549
+ spark = getattr(df, "sparkSession", None) or self._spark
341
550
  try:
342
- df.write.mode(mode).saveAsTable(full_name)
551
+ writer = df.write.format("delta")
552
+ if merge_schema:
553
+ writer = writer.option("mergeSchema", "true")
554
+ writer.mode(mode).saveAsTable(full_name)
343
555
  except Exception as e:
344
556
  log_and_raise_error(self._logger, f"Error writing to Unity Catalog table '{full_name}': {e}")
345
557
  self._logger.info(f"Table '{full_name}' written to Unity Catalog (mode={mode}).")
558
+
559
+ if comment is not None:
560
+ self.set_uc_table_comment(full_name, comment, spark=spark)
561
+
562
+ if optimize:
563
+ self.optimize_uc_table(full_name, zorder_by=zorder_by, spark=spark)
@@ -624,6 +624,61 @@ def _is_select_statement(statement: str) -> bool:
624
624
  return True
625
625
 
626
626
 
627
+ def resolve_sql_query_paths(query_paths, pipeline: str | None = None) -> dict[str, Path]:
628
+ """
629
+ Normalize SQL pipeline input into an ordered mapping of query name to file path.
630
+
631
+ Args:
632
+ query_paths: one of:
633
+ - str: relative folder path from project root; SQL files are discovered
634
+ via get_sql_files() (respects pipeline.yaml if present).
635
+ - Path pointing to a directory: same as str, resolved relative to project root.
636
+ - Path pointing to a single .sql file: executes that file only.
637
+ - list[str | Path]: ordered list of individual SQL file paths.
638
+ - dict[str, str | Path]: explicit ordered mapping of name -> path; preserves insertion order.
639
+ pipeline: Optional pipeline name passed to get_sql_files() when query_paths is a folder.
640
+
641
+ Returns:
642
+ Ordered dict[str, Path].
643
+ """
644
+ logger = get_logger("ml_analytics.utils.resolve_sql_query_paths")
645
+
646
+ if isinstance(query_paths, str):
647
+ try:
648
+ project_root = find_project_root()
649
+ candidate = project_root / query_paths
650
+ except FileNotFoundError:
651
+ candidate = Path(query_paths)
652
+ if candidate.is_file():
653
+ return {candidate.stem: candidate}
654
+ resolved = get_sql_files(query_paths, pipeline=pipeline)
655
+ if not resolved:
656
+ log_and_raise_error(logger, f"No SQL files found for folder '{query_paths}'.")
657
+ return resolved
658
+
659
+ if isinstance(query_paths, Path):
660
+ if query_paths.is_dir():
661
+ try:
662
+ project_root = find_project_root()
663
+ relative = query_paths.relative_to(project_root)
664
+ except ValueError:
665
+ relative = query_paths
666
+ resolved = get_sql_files(str(relative), pipeline=pipeline)
667
+ if not resolved:
668
+ log_and_raise_error(logger, f"No SQL files found in directory '{query_paths}'.")
669
+ return resolved
670
+ return {query_paths.stem: query_paths}
671
+
672
+ if isinstance(query_paths, list):
673
+ return {Path(p).stem: Path(p) for p in query_paths}
674
+
675
+ if isinstance(query_paths, dict):
676
+ return {k: Path(v) if isinstance(v, str) else v for k, v in query_paths.items()}
677
+
678
+ log_and_raise_error(logger, f"Expected a folder path, list, or dict, got: {type(query_paths)}")
679
+ return {}
680
+
681
+
627
682
  def execute_sql_scripts(
628
683
  query_paths,
629
684
  data_connector=None,
@@ -662,42 +717,7 @@ def execute_sql_scripts(
662
717
 
663
718
  from ml_analytics.data_connector import DataConnector
664
719
 
665
- # Normalize input to an ordered dict[str, Path]
666
- if isinstance(query_paths, str):
667
- try:
668
- project_root = find_project_root()
669
- candidate = project_root / query_paths
670
- except FileNotFoundError:
671
- candidate = Path(query_paths)
672
- if candidate.is_file():
673
- query_paths = {candidate.stem: candidate}
674
- else:
675
- resolved = get_sql_files(query_paths, pipeline=pipeline)
676
- if not resolved:
677
- log_and_raise_error(logger, f"No SQL files found for folder '{query_paths}'.")
678
- return
679
- query_paths = resolved
680
- elif isinstance(query_paths, Path):
681
- if query_paths.is_dir():
682
- try:
683
- project_root = find_project_root()
684
- relative = query_paths.relative_to(project_root)
685
- except ValueError:
686
- relative = query_paths
687
- resolved = get_sql_files(str(relative), pipeline=pipeline)
688
- if not resolved:
689
- log_and_raise_error(logger, f"No SQL files found in directory '{query_paths}'.")
690
- return
691
- query_paths = resolved
692
- else:
693
- query_paths = {query_paths.stem: query_paths}
694
- elif isinstance(query_paths, list):
695
- query_paths = {Path(p).stem: Path(p) for p in query_paths}
696
- elif isinstance(query_paths, dict):
697
- query_paths = {k: Path(v) if isinstance(v, str) else v for k, v in query_paths.items()}
698
- else:
699
- log_and_raise_error(logger, f"Expected a folder path, list, or dict, got: {type(query_paths)}")
700
- return
720
+ query_paths = resolve_sql_query_paths(query_paths, pipeline=pipeline)
701
721
 
702
722
  def _run_scripts(dc):
703
723
  """Execute all scripts on the given DataConnector instance.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ml-analytics-tools
3
- Version: 0.4.3
3
+ Version: 0.4.4
4
4
  Summary: Tools for ML projects and data management
5
5
  Requires-Python: >=3.11
6
6
  Description-Content-Type: text/markdown
@@ -220,6 +220,14 @@ sf.sql("queries/experiment.sql", save_table=True, schema="analytics", table="exp
220
220
 
221
221
  # or save any Spark DataFrame to Unity Catalog
222
222
  sf.save_to_uc(df, table="exp", schema="analytics", catalog="prod")
223
+
224
+ # save a YAML-ordered folder of SQL queries as Unity Catalog tables
225
+ df = sf.save_pipeline_to_uc(
226
+ "queries/churn_pipeline",
227
+ pipeline="daily",
228
+ catalog="prod",
229
+ schema="analytics",
230
+ )
223
231
  ```
224
232
 
225
233
  Credentials resolve per field as: explicit argument → `SNOWFLAKE_*` environment
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ml-analytics-tools"
3
- version = "0.4.3"
3
+ version = "0.4.4"
4
4
  description = "Tools for ML projects and data management"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -58,6 +58,7 @@ def _mock_spark():
58
58
  """Spark double whose read chain returns a DataFrame mock."""
59
59
  spark = MagicMock()
60
60
  df = MagicMock()
61
+ df.sparkSession = spark
61
62
  reader = spark.read.format.return_value
62
63
  reader.options.return_value.option.return_value.load.return_value = df
63
64
  reader.options.return_value.load.return_value = df
@@ -220,11 +221,20 @@ def test_qualified_uc_name_already_qualified():
220
221
 
221
222
  def test_save_to_uc_uses_saveastable(monkeypatch):
222
223
  _clear_snowflake_env(monkeypatch)
223
- sf = SFConnector(account="acct", user="u")
224
+ spark = MagicMock()
225
+ sf = SFConnector(account="acct", user="u", spark=spark)
224
226
 
225
227
  calls = {}
226
228
 
227
229
  class _Writer:
230
+ def format(self, fmt):
231
+ calls["format"] = fmt
232
+ return self
233
+
234
+ def option(self, key, value):
235
+ calls["option"] = (key, value)
236
+ return self
237
+
228
238
  def mode(self, m):
229
239
  calls["mode"] = m
230
240
  return self
@@ -234,9 +244,42 @@ def test_save_to_uc_uses_saveastable(monkeypatch):
234
244
 
235
245
  class _DF:
236
246
  write = _Writer()
247
+ sparkSession = spark
237
248
 
238
249
  sf.save_to_uc(_DF(), table="tbl", schema="sch", catalog="cat", mode="append")
239
- assert calls == {"mode": "append", "name": "cat.sch.tbl"}
250
+ assert calls == {
251
+ "format": "delta",
252
+ "option": ("mergeSchema", "true"),
253
+ "mode": "append",
254
+ "name": "cat.sch.tbl",
255
+ }
256
+ spark.sql.assert_called_once_with("OPTIMIZE cat.sch.tbl")
257
+
258
+
259
+ def test_save_to_uc_can_zorder_comment_or_skip_optimize(monkeypatch):
260
+ _clear_snowflake_env(monkeypatch)
261
+ spark = MagicMock()
262
+ sf = SFConnector(account="acct", user="u", spark=spark)
263
+
264
+ df = MagicMock()
265
+ df.sparkSession = spark
266
+
267
+ sf.save_to_uc(
268
+ df,
269
+ table="tbl",
270
+ schema="sch",
271
+ catalog="cat",
272
+ zorder_by=["customer_id", "event_date"],
273
+ comment="Tutor's metrics",
274
+ )
275
+ assert [call.args[0] for call in spark.sql.call_args_list] == [
276
+ "ALTER TABLE cat.sch.tbl SET TBLPROPERTIES ('comment' = 'Tutor''s metrics')",
277
+ "OPTIMIZE cat.sch.tbl ZORDER BY (customer_id, event_date)",
278
+ ]
279
+
280
+ spark.reset_mock()
281
+ sf.save_to_uc(df, table="tbl", schema="sch", catalog="cat", optimize=False)
282
+ spark.sql.assert_not_called()
240
283
 
241
284
 
242
285
  def test_save_to_uc_requires_table(monkeypatch):
@@ -317,3 +360,82 @@ def test_sql_return_pandas(monkeypatch):
317
360
  sf.sql("select 1", return_pandas=True)
318
361
 
319
362
  df.toPandas.assert_called_once()
363
+
364
+
365
+ def test_save_pipeline_to_uc_uses_yaml_order_and_file_stem_tables(monkeypatch, tmp_path):
366
+ _clear_snowflake_env(monkeypatch)
367
+ folder = tmp_path / "queries"
368
+ folder.mkdir()
369
+ (folder / "base.sql").write_text("SELECT '{run_date}' AS run_date;")
370
+ (folder / "features.sql").write_text("SELECT 1 AS feature;")
371
+ (folder / "daily.yaml").write_text(
372
+ """
373
+ steps:
374
+ - features
375
+ - base
376
+ """
377
+ )
378
+ monkeypatch.setattr("ml_analytics.utils.find_project_root", lambda *args, **kwargs: tmp_path)
379
+
380
+ spark, df = _mock_spark()
381
+ sf = SFConnector(account="acct", user="u", password="p", spark=spark)
382
+
383
+ result = sf.save_pipeline_to_uc(
384
+ "queries",
385
+ pipeline="daily",
386
+ catalog="prod",
387
+ schema="analytics",
388
+ run_date="2026-06-17",
389
+ )
390
+
391
+ assert result is df
392
+ query_calls = spark.read.format.return_value.options.return_value.option.call_args_list
393
+ assert [call.args for call in query_calls] == [
394
+ ("query", "SELECT 1 AS feature;"),
395
+ ("query", "SELECT '2026-06-17' AS run_date;"),
396
+ ]
397
+ save_calls = df.write.format.return_value.option.return_value.mode.return_value.saveAsTable.call_args_list
398
+ assert [call.args[0] for call in save_calls] == [
399
+ "prod.analytics.features",
400
+ "prod.analytics.base",
401
+ ]
402
+ assert [call.args[0] for call in spark.sql.call_args_list] == [
403
+ "OPTIMIZE prod.analytics.features",
404
+ "OPTIMIZE prod.analytics.base",
405
+ ]
406
+
407
+
408
+ def test_save_pipeline_to_uc_allows_table_and_mode_overrides(monkeypatch, tmp_path):
409
+ _clear_snowflake_env(monkeypatch)
410
+ folder = tmp_path / "queries"
411
+ folder.mkdir()
412
+ (folder / "base.sql").write_text("SELECT 1 AS col_1;")
413
+ (folder / "final.sql").write_text("SELECT 2 AS col_2;")
414
+ monkeypatch.setattr("ml_analytics.utils.find_project_root", lambda *args, **kwargs: tmp_path)
415
+
416
+ spark, df = _mock_spark()
417
+ sf = SFConnector(account="acct", user="u", password="p", spark=spark)
418
+
419
+ result = sf.save_pipeline_to_uc(
420
+ "queries",
421
+ schema="analytics",
422
+ catalog="prod",
423
+ tables={"final": "churn_daily"},
424
+ table_prefix="stg_",
425
+ modes={"final": "append"},
426
+ zorder_by={"final": "customer_id"},
427
+ return_all=True,
428
+ )
429
+
430
+ assert result == {"base": df, "final": df}
431
+ mode_calls = df.write.format.return_value.option.return_value.mode.call_args_list
432
+ assert [call.args[0] for call in mode_calls] == ["overwrite", "append"]
433
+ save_calls = df.write.format.return_value.option.return_value.mode.return_value.saveAsTable.call_args_list
434
+ assert [call.args[0] for call in save_calls] == [
435
+ "prod.analytics.stg_base",
436
+ "prod.analytics.churn_daily",
437
+ ]
438
+ assert [call.args[0] for call in spark.sql.call_args_list] == [
439
+ "OPTIMIZE prod.analytics.stg_base",
440
+ "OPTIMIZE prod.analytics.churn_daily ZORDER BY (customer_id)",
441
+ ]