PyPI - snowflake-ml-python - Versions diffs - 1.5.2__py3-none-any.whl → 1.5.4__py3-none-any.whl - Mend

snowflake-ml-python 1.5.2__py3-none-any.whl → 1.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (250) hide show

snowflake/ml/_internal/lineage/lineage_utils.py CHANGED Viewed

@@ -1,21 +1,11 @@
 import copy
 import functools
-from typing import Any, Callable, List
+from typing import Any, Callable, List, Optional
 from snowflake import snowpark
 from snowflake.ml._internal.lineage import data_source
-DATA_SOURCES_ATTR = "_data_sources"
-def _get_datasources(*args: Any) -> List[data_source.DataSource]:
-    """Helper method for extracting data sources attribute from DataFrames in an argument list"""
-    result = []
-    for arg in args:
-        srcs = getattr(arg, DATA_SOURCES_ATTR, None)
-        if isinstance(srcs, list) and all(isinstance(s, data_source.DataSource) for s in srcs):
-            result += srcs
-    return result
+_DATA_SOURCES_ATTR = "_data_sources"
 def _wrap_func(
@@ -32,6 +22,37 @@ def _wrap_func(
     return wrapped
+def _wrap_class_func(fn: Callable[..., snowpark.DataFrame]) -> Callable[..., snowpark.DataFrame]:
+    @functools.wraps(fn)
+    def wrapped(*args: Any, **kwargs: Any) -> snowpark.DataFrame:
+        df = fn(*args, **kwargs)
+        data_sources = get_data_sources(*args, *kwargs.values())
+        if data_sources:
+            patch_dataframe(df, data_sources, inplace=True)
+        return df
+    return wrapped
+def get_data_sources(*args: Any) -> Optional[List[data_source.DataSource]]:
+    """Helper method for extracting data sources attribute from DataFrames in an argument list"""
+    result: Optional[List[data_source.DataSource]] = None
+    for arg in args:
+        srcs = getattr(arg, _DATA_SOURCES_ATTR, None)
+        if isinstance(srcs, list) and all(isinstance(s, data_source.DataSource) for s in srcs):
+            if result is None:
+                result = []
+            result += srcs
+    return result
+def set_data_sources(obj: Any, data_sources: Optional[List[data_source.DataSource]]) -> None:
+    """Helper method for attaching data sources to an object"""
+    if data_sources:
+        assert all(isinstance(ds, data_source.DataSource) for ds in data_sources)
+    setattr(obj, _DATA_SOURCES_ATTR, data_sources)
 def patch_dataframe(
     df: snowpark.DataFrame, data_sources: List[data_source.DataSource], inplace: bool = False
 ) -> snowpark.DataFrame:
@@ -62,7 +83,7 @@ def patch_dataframe(
     ]
     if not inplace:
         df = copy.copy(df)
-    setattr(df, DATA_SOURCES_ATTR, data_sources)
+    set_data_sources(df, data_sources)
     for func in funcs:
         fn = getattr(df, func, None)
         if fn is not None:
@@ -70,18 +91,6 @@ def patch_dataframe(
     return df
-def _wrap_class_func(fn: Callable[..., snowpark.DataFrame]) -> Callable[..., snowpark.DataFrame]:
-    @functools.wraps(fn)
-    def wrapped(*args: Any, **kwargs: Any) -> snowpark.DataFrame:
-        df = fn(*args, **kwargs)
-        data_sources = _get_datasources(*args) + _get_datasources(*kwargs.values())
-        if data_sources:
-            patch_dataframe(df, data_sources, inplace=True)
-        return df
-    return wrapped
 # Class-level monkey-patches
 for klass, func_list in {
     snowpark.DataFrame: [

snowflake/ml/_internal/telemetry.py CHANGED Viewed

@@ -10,6 +10,7 @@ from typing import (
     Dict,
     Iterable,
     List,
+    Mapping,
     Optional,
     Tuple,
     TypeVar,
@@ -92,6 +93,31 @@ def get_statement_params(
     )
+def add_statement_params_custom_tags(
+    statement_params: Optional[Dict[str, Any]], custom_tags: Mapping[str, Any]
+) -> Dict[str, Any]:
+    """
+    Add custom_tags to existing statement_params.  Overwrite keys in custom_tags dict that already exist.
+    If existing statement_params are not provided, do nothing as the information cannot be effectively tracked.
+    Args:
+        statement_params: Existing statement_params dictionary.
+        custom_tags: Dictionary of existing k/v pairs to add as custom_tags
+    Returns:
+        new statement_params dictionary with all keys and an updated custom_tags field.
+    """
+    if not statement_params:
+        return {}
+    existing_custom_tags: Dict[str, Any] = statement_params.pop(TelemetryField.KEY_CUSTOM_TAGS.value, {})
+    existing_custom_tags.update(custom_tags)
+    # NOTE: This can be done with | operator after upgrade from py3.8
+    return {
+        **statement_params,
+        TelemetryField.KEY_CUSTOM_TAGS.value: existing_custom_tags,
+    }
 # TODO: we can merge this with get_statement_params after code clean up
 def get_statement_params_full_func_name(frame: Optional[types.FrameType], class_name: Optional[str] = None) -> str:
     """

snowflake/ml/_internal/utils/identifier.py CHANGED Viewed

@@ -165,6 +165,20 @@ def parse_schema_level_object_identifier(
     )
+def is_fully_qualified_name(name: str) -> bool:
+    """
+    Checks if a given name is a fully qualified name, which is in the format '<db>.<schema>.<object_name>'.
+    Args:
+        name: The name to be checked.
+    Returns:
+        bool: True if the name is fully qualified, False otherwise.
+    """
+    res = parse_schema_level_object_identifier(name)
+    return res[0] is not None and res[1] is not None and res[2] is not None and not res[3]
 def get_schema_level_object_identifier(
     db: Optional[str],
     schema: Optional[str],

snowflake/ml/_internal/utils/snowpark_dataframe_utils.py CHANGED Viewed

@@ -1,22 +1,27 @@
 import logging
 import warnings
+from typing import List, Optional
 from snowflake import snowpark
+from snowflake.ml._internal.utils import sql_identifier
 from snowflake.snowpark import functions, types
-def cast_snowpark_dataframe(df: snowpark.DataFrame) -> snowpark.DataFrame:
+def cast_snowpark_dataframe(df: snowpark.DataFrame, ignore_columns: Optional[List[str]] = None) -> snowpark.DataFrame:
     """Cast columns in the dataframe to types that are compatible with tensor.
     It assists FileSet.make() in performing implicit data casting.
     Args:
         df: A snowpark dataframe.
+        ignore_columns: Columns to exclude from casting. These columns will be propagated unchanged.
     Returns:
         A snowpark dataframe whose data type has been casted.
     """
+    ignore_cols_set = {sql_identifier.SqlIdentifier(c).identifier() for c in ignore_columns} if ignore_columns else {}
     fields = df.schema.fields
     selected_cols = []
     for field in fields:
@@ -40,7 +45,9 @@ def cast_snowpark_dataframe(df: snowpark.DataFrame) -> snowpark.DataFrame:
             dest = field.datatype
             selected_cols.append(functions.cast(functions.col(src), dest).alias(src))
         else:
-            if field.datatype in (types.DateType(), types.TimestampType(), types.TimeType()):
+            if field.column_identifier.name in ignore_cols_set:
+                pass
+            elif field.datatype in (types.DateType(), types.TimestampType(), types.TimeType()):
                 logging.warning(
                     "A Column with DATE or TIMESTAMP data type detected. "
                     "It might not be able to get converted to tensors. "
@@ -90,7 +97,9 @@ def cast_snowpark_dataframe_column_types(df: snowpark.DataFrame) -> snowpark.Dat
                     " is being automatically converted to DoubleType in the Snowpark DataFrame. "
                     "This automatic conversion may lead to potential precision loss and rounding errors. "
                     "If you wish to prevent this conversion, you should manually perform "
-                    "the necessary data type conversion."
+                    "the necessary data type conversion.",
+                    UserWarning,
+                    stacklevel=2,
                 )
             else:
                 # IntegerType default as NUMBER(38, 0), but
@@ -102,7 +111,9 @@ def cast_snowpark_dataframe_column_types(df: snowpark.DataFrame) -> snowpark.Dat
                     " is being automatically converted to LongType in the Snowpark DataFrame. "
                     "This automatic conversion may lead to potential precision loss and rounding errors. "
                     "If you wish to prevent this conversion, you should manually perform "
-                    "the necessary data type conversion."
+                    "the necessary data type conversion.",
+                    UserWarning,
+                    stacklevel=2,
                 )
             selected_cols.append(functions.cast(functions.col(src), dest_dtype).alias(src))
         # TODO: add more type handling or error message

snowflake/ml/dataset/dataset.py CHANGED Viewed

@@ -19,6 +19,7 @@ from snowflake.ml._internal.utils import (
     snowpark_dataframe_utils,
 )
 from snowflake.ml.dataset import dataset_metadata, dataset_reader
+from snowflake.ml.lineage import lineage_node
 from snowflake.snowpark import exceptions as snowpark_exceptions, functions
 _PROJECT = "Dataset"
@@ -65,6 +66,20 @@ class DatasetVersion:
         comment: Optional[str] = self._get_property("comment")
         return comment
+    @property
+    def label_cols(self) -> List[str]:
+        metadata = self._get_metadata()
+        if metadata is None or metadata.label_cols is None:
+            return []
+        return metadata.label_cols
+    @property
+    def exclude_cols(self) -> List[str]:
+        metadata = self._get_metadata()
+        if metadata is None or metadata.exclude_cols is None:
+            return []
+        return metadata.exclude_cols
     def _get_property(self, property_name: str, default: Any = None) -> Any:
         if self._properties is None:
             sql_result = (
@@ -91,17 +106,6 @@ class DatasetVersion:
                 warnings.warn(f"Metadata parsing failed with error: {e}", UserWarning, stacklevel=2)
         return self._metadata
-    def _get_exclude_cols(self) -> List[str]:
-        metadata = self._get_metadata()
-        if metadata is None:
-            return []
-        cols = []
-        if metadata.exclude_cols:
-            cols.extend(metadata.exclude_cols)
-        if metadata.label_cols:
-            cols.extend(metadata.label_cols)
-        return cols
     def url(self) -> str:
         """Returns the URL of the DatasetVersion contents in Snowflake.
@@ -122,7 +126,7 @@ class DatasetVersion:
         return f"{self.__class__.__name__}(dataset='{self._parent.fully_qualified_name}', version='{self.name}')"
-class Dataset:
+class Dataset(lineage_node.LineageNode):
     """Represents a Snowflake Dataset which is organized into versions."""
     @telemetry.send_api_usage_telemetry(project=_PROJECT)
@@ -135,18 +139,31 @@ class Dataset:
         selected_version: Optional[str] = None,
     ) -> None:
         """Initialize a lazily evaluated Dataset object"""
-        self._session = session
         self._db = database
         self._schema = schema
         self._name = name
-        self._fully_qualified_name = identifier.get_schema_level_object_identifier(database, schema, name)
+        super().__init__(
+            session,
+            identifier.get_schema_level_object_identifier(database, schema, name),
+            domain="dataset",
+            version=selected_version,
+        )
         self._version = DatasetVersion(self, selected_version) if selected_version else None
         self._reader: Optional[dataset_reader.DatasetReader] = None
+    def __repr__(self) -> str:
+        return (
+            f"{self.__class__.__name__}(\n"
+            f"  name='{self._lineage_node_name}',\n"
+            f"  version='{self._version._version if self._version else None}',\n"
+            f")"
+        )
     @property
     def fully_qualified_name(self) -> str:
-        return self._fully_qualified_name
+        return self._lineage_node_name
     @property
     def selected_version(self) -> Optional[DatasetVersion]:
@@ -165,10 +182,10 @@ class Dataset:
                 self._session,
                 [
                     data_source.DataSource(
-                        fully_qualified_name=self._fully_qualified_name,
+                        fully_qualified_name=self._lineage_node_name,
                         version=v.name,
                         url=v.url(),
-                        exclude_cols=v._get_exclude_cols(),
+                        exclude_cols=(v.label_cols + v.exclude_cols),
                     )
                 ],
             )
@@ -227,9 +244,8 @@ class Dataset:
         try:
             session.sql(query).collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)
             return Dataset(session, db, schema, ds_name)
-        except snowpark_exceptions.SnowparkClientException as e:
-            # Snowpark wraps the Python Connector error code in the head of the error message.
-            if e.message.startswith(dataset_errors.ERRNO_OBJECT_ALREADY_EXISTS):
+        except snowpark_exceptions.SnowparkSQLException as e:
+            if e.sql_error_code == dataset_errors.ERRNO_OBJECT_ALREADY_EXISTS:
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.OBJECT_ALREADY_EXISTS,
                     original_exception=dataset_errors.DatasetExistError(
@@ -293,7 +309,7 @@ class Dataset:
         Raises:
             SnowflakeMLException: The Dataset no longer exists.
             SnowflakeMLException: The specified Dataset version already exists.
-            snowpark_exceptions.SnowparkClientException: An error occurred during Dataset creation.
+            snowpark_exceptions.SnowparkSQLException: An error occurred during Dataset creation.
         Note: During the generation of stage files, data casting will occur. The casting rules are as follows::
             - Data casting:
@@ -318,7 +334,8 @@ class Dataset:
                 - DateType(DATE): Not supported. A warning will be logged.
                 - VariantType(VARIANT): Not supported. A warning will be logged.
         """
-        casted_df = snowpark_dataframe_utils.cast_snowpark_dataframe(input_dataframe)
+        cast_ignore_cols = (exclude_cols or []) + (label_cols or [])
+        casted_df = snowpark_dataframe_utils.cast_snowpark_dataframe(input_dataframe, ignore_columns=cast_ignore_cols)
         if shuffle:
             casted_df = casted_df.order_by(functions.random())
@@ -364,19 +381,19 @@ class Dataset:
             return Dataset(self._session, self._db, self._schema, self._name, version)
-        except snowpark_exceptions.SnowparkClientException as e:
-            if e.message.startswith(dataset_errors.ERRNO_DATASET_NOT_EXIST):
+        except snowpark_exceptions.SnowparkSQLException as e:
+            if e.sql_error_code == dataset_errors.ERRNO_DATASET_NOT_EXIST:
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.NOT_FOUND,
                     original_exception=dataset_errors.DatasetNotExistError(
                         dataset_error_messages.DATASET_NOT_EXIST.format(self.fully_qualified_name)
                     ),
                 ) from e
-            elif (
-                e.message.startswith(dataset_errors.ERRNO_DATASET_VERSION_ALREADY_EXISTS)
-                or e.message.startswith(dataset_errors.ERRNO_VERSION_ALREADY_EXISTS)
-                or e.message.startswith(dataset_errors.ERRNO_FILES_ALREADY_EXISTING)
-            ):
+            elif e.sql_error_code in {
+                dataset_errors.ERRNO_DATASET_VERSION_ALREADY_EXISTS,
+                dataset_errors.ERRNO_VERSION_ALREADY_EXISTS,
+                dataset_errors.ERRNO_FILES_ALREADY_EXISTING,
+            }:
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.OBJECT_ALREADY_EXISTS,
                     original_exception=dataset_errors.DatasetExistError(
@@ -432,9 +449,8 @@ class Dataset:
                 .has_column(_DATASET_VERSION_NAME_COL, allow_empty=True)
                 .validate()
             )
-        except snowpark_exceptions.SnowparkClientException as e:
-            # Snowpark wraps the Python Connector error code in the head of the error message.
-            if e.message.startswith(dataset_errors.ERRNO_OBJECT_NOT_EXIST):
+        except snowpark_exceptions.SnowparkSQLException as e:
+            if e.sql_error_code == dataset_errors.ERRNO_OBJECT_NOT_EXIST:
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.NOT_FOUND,
                     original_exception=dataset_errors.DatasetNotExistError(
@@ -456,6 +472,12 @@ class Dataset:
                 ),
             )
+    @staticmethod
+    def _load_from_lineage_node(session: snowpark.Session, name: str, version: str) -> "Dataset":
+        return Dataset.load(session, name).select_version(version)
+lineage_node.DOMAIN_LINEAGE_REGISTRY["dataset"] = Dataset
 # Utility methods

snowflake/ml/dataset/dataset_factory.py CHANGED Viewed

@@ -16,8 +16,7 @@ def create_from_dataframe(
     **version_kwargs: Any,
 ) -> dataset.Dataset:
     """
-    Create a new versioned Dataset from a DataFrame and returns
-    a DatasetReader for the newly created Dataset version.
+    Create a new versioned Dataset from a DataFrame.
     Args:
         session: The Snowpark Session instance to use.
@@ -39,7 +38,7 @@ def create_from_dataframe(
 @telemetry.send_api_usage_telemetry(project=_PROJECT)
 def load_dataset(session: snowpark.Session, name: str, version: str) -> dataset.Dataset:
     """
-    Load a versioned Dataset into a DatasetReader.
+    Load a versioned Dataset.
     Args:
         session: The Snowpark Session instance to use.
@@ -47,7 +46,7 @@ def load_dataset(session: snowpark.Session, name: str, version: str) -> dataset.
         version: The dataset version name.
     Returns:
-        A DatasetReader object.
+        A Dataset object.
     """
     ds: dataset.Dataset = dataset.Dataset.load(session, name).select_version(version)
     return ds

snowflake-ml-python 1.5.2__py3-none-any.whl → 1.5.4__py3-none-any.whl

snowflake-ml-python 1.5.2__py3-none-any.whl → 1.5.4__py3-none-any.whl