PyPI - snowflake-ml-python - Versions diffs - 1.5.3__py3-none-any.whl → 1.5.4__py3-none-any.whl - Mend

snowflake-ml-python 1.5.3__py3-none-any.whl → 1.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131) hide show

snowflake/ml/_internal/telemetry.py CHANGED Viewed

@@ -10,6 +10,7 @@ from typing import (
     Dict,
     Iterable,
     List,
+    Mapping,
     Optional,
     Tuple,
     TypeVar,
@@ -92,6 +93,31 @@ def get_statement_params(
     )
+def add_statement_params_custom_tags(
+    statement_params: Optional[Dict[str, Any]], custom_tags: Mapping[str, Any]
+) -> Dict[str, Any]:
+    """
+    Return a copy of statement_params whose custom-tags entry is extended with custom_tags.
+    Keys in custom_tags overwrite keys of the same name already present in the existing custom tags.
+    Neither input is modified: new dictionaries are built for the result and for its custom-tags entry.
+    If statement_params is None or empty, an empty dictionary is returned and custom_tags is ignored,
+    as the information cannot be effectively tracked.
+    Args:
+        statement_params: Existing statement_params dictionary.
+        custom_tags: Mapping of k/v pairs to add as custom_tags.
+    Returns:
+        New statement_params dictionary with all keys and an updated custom_tags field, or an empty
+        dictionary when no statement_params were provided.
+    """
+    if not statement_params:
+        return {}
+    tags_key = TelemetryField.KEY_CUSTOM_TAGS.value
+    # Build fresh dictionaries instead of calling pop()/update() on the inputs, so the caller's
+    # statement_params and its nested custom-tags dictionary are left untouched.
+    merged_custom_tags: Dict[str, Any] = {**statement_params.get(tags_key, {}), **custom_tags}
+    # Drop the old custom-tags entry first so the merged entry is appended last, as before.
+    other_params = {key: value for key, value in statement_params.items() if key != tags_key}
+    # NOTE: This can be done with | operator after upgrade from py3.8
+    return {
+        **other_params,
+        tags_key: merged_custom_tags,
+    }
 # TODO: we can merge this with get_statement_params after code clean up
 def get_statement_params_full_func_name(frame: Optional[types.FrameType], class_name: Optional[str] = None) -> str:
     """

snowflake/ml/_internal/utils/identifier.py CHANGED Viewed

@@ -165,6 +165,20 @@ def parse_schema_level_object_identifier(
     )
+def is_fully_qualified_name(name: str) -> bool:
+    """
+    Tell whether a name has the fully qualified form '<db>.<schema>.<object_name>'.
+    The name is parsed with parse_schema_level_object_identifier; it counts as fully qualified when the
+    first three parsed components are all present and the fourth component is empty.
+    Args:
+        name: The name to be checked.
+    Returns:
+        bool: True if the name is fully qualified, False otherwise.
+    """
+    parsed = parse_schema_level_object_identifier(name)
+    # Components 0..2 must all be set (presumably db, schema and object name -- confirm against the parser).
+    has_all_parts = all(parsed[index] is not None for index in range(3))
+    # Component 3 must be falsy; anything left there means the name is not a plain three-part identifier.
+    return has_all_parts and not parsed[3]
 def get_schema_level_object_identifier(
     db: Optional[str],
     schema: Optional[str],

snowflake/ml/_internal/utils/snowpark_dataframe_utils.py CHANGED Viewed

@@ -1,22 +1,27 @@
 import logging
 import warnings
+from typing import List, Optional
 from snowflake import snowpark
+from snowflake.ml._internal.utils import sql_identifier
 from snowflake.snowpark import functions, types
-def cast_snowpark_dataframe(df: snowpark.DataFrame) -> snowpark.DataFrame:
+def cast_snowpark_dataframe(df: snowpark.DataFrame, ignore_columns: Optional[List[str]] = None) -> snowpark.DataFrame:
     """Cast columns in the dataframe to types that are compatible with tensor.
     It assists FileSet.make() in performing implicit data casting.
     Args:
         df: A snowpark dataframe.
+        ignore_columns: Columns to exclude from casting. These columns will be propagated unchanged.
     Returns:
         A snowpark dataframe whose data type has been casted.
     """
+    ignore_cols_set = {sql_identifier.SqlIdentifier(c).identifier() for c in ignore_columns} if ignore_columns else {}
     fields = df.schema.fields
     selected_cols = []
     for field in fields:
@@ -40,7 +45,9 @@ def cast_snowpark_dataframe(df: snowpark.DataFrame) -> snowpark.DataFrame:
             dest = field.datatype
             selected_cols.append(functions.cast(functions.col(src), dest).alias(src))
         else:
-            if field.datatype in (types.DateType(), types.TimestampType(), types.TimeType()):
+            if field.column_identifier.name in ignore_cols_set:
+                pass
+            elif field.datatype in (types.DateType(), types.TimestampType(), types.TimeType()):
                 logging.warning(
                     "A Column with DATE or TIMESTAMP data type detected. "
                     "It might not be able to get converted to tensors. "
@@ -90,7 +97,9 @@ def cast_snowpark_dataframe_column_types(df: snowpark.DataFrame) -> snowpark.Dat
                     " is being automatically converted to DoubleType in the Snowpark DataFrame. "
                     "This automatic conversion may lead to potential precision loss and rounding errors. "
                     "If you wish to prevent this conversion, you should manually perform "
-                    "the necessary data type conversion."
+                    "the necessary data type conversion.",
+                    UserWarning,
+                    stacklevel=2,
                 )
             else:
                 # IntegerType default as NUMBER(38, 0), but
@@ -102,7 +111,9 @@ def cast_snowpark_dataframe_column_types(df: snowpark.DataFrame) -> snowpark.Dat
                     " is being automatically converted to LongType in the Snowpark DataFrame. "
                     "This automatic conversion may lead to potential precision loss and rounding errors. "
                     "If you wish to prevent this conversion, you should manually perform "
-                    "the necessary data type conversion."
+                    "the necessary data type conversion.",
+                    UserWarning,
+                    stacklevel=2,
                 )
             selected_cols.append(functions.cast(functions.col(src), dest_dtype).alias(src))
         # TODO: add more type handling or error message

snowflake/ml/dataset/dataset.py CHANGED Viewed

@@ -19,6 +19,7 @@ from snowflake.ml._internal.utils import (
     snowpark_dataframe_utils,
 )
 from snowflake.ml.dataset import dataset_metadata, dataset_reader
+from snowflake.ml.lineage import lineage_node
 from snowflake.snowpark import exceptions as snowpark_exceptions, functions
 _PROJECT = "Dataset"
@@ -125,7 +126,7 @@ class DatasetVersion:
         return f"{self.__class__.__name__}(dataset='{self._parent.fully_qualified_name}', version='{self.name}')"
-class Dataset:
+class Dataset(lineage_node.LineageNode):
     """Represents a Snowflake Dataset which is organized into versions."""
     @telemetry.send_api_usage_telemetry(project=_PROJECT)
@@ -138,18 +139,31 @@ class Dataset:
         selected_version: Optional[str] = None,
     ) -> None:
         """Initialize a lazily evaluated Dataset object"""
-        self._session = session
         self._db = database
         self._schema = schema
         self._name = name
-        self._fully_qualified_name = identifier.get_schema_level_object_identifier(database, schema, name)
+        super().__init__(
+            session,
+            identifier.get_schema_level_object_identifier(database, schema, name),
+            domain="dataset",
+            version=selected_version,
+        )
         self._version = DatasetVersion(self, selected_version) if selected_version else None
         self._reader: Optional[dataset_reader.DatasetReader] = None
+    def __repr__(self) -> str:
+        """Return a multi-line representation showing the lineage node name and the selected version."""
+        # No selected DatasetVersion renders as the literal text 'None'.
+        version_text = self._version._version if self._version else None
+        repr_lines = [
+            f"{self.__class__.__name__}(",
+            f"  name='{self._lineage_node_name}',",
+            f"  version='{version_text}',",
+            ")",
+        ]
+        return "\n".join(repr_lines)
     @property
     def fully_qualified_name(self) -> str:
-        return self._fully_qualified_name
+        return self._lineage_node_name
     @property
     def selected_version(self) -> Optional[DatasetVersion]:
@@ -168,7 +182,7 @@ class Dataset:
                 self._session,
                 [
                     data_source.DataSource(
-                        fully_qualified_name=self._fully_qualified_name,
+                        fully_qualified_name=self._lineage_node_name,
                         version=v.name,
                         url=v.url(),
                         exclude_cols=(v.label_cols + v.exclude_cols),
@@ -230,9 +244,8 @@ class Dataset:
         try:
             session.sql(query).collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)
             return Dataset(session, db, schema, ds_name)
-        except snowpark_exceptions.SnowparkClientException as e:
-            # Snowpark wraps the Python Connector error code in the head of the error message.
-            if e.message.startswith(dataset_errors.ERRNO_OBJECT_ALREADY_EXISTS):
+        except snowpark_exceptions.SnowparkSQLException as e:
+            if e.sql_error_code == dataset_errors.ERRNO_OBJECT_ALREADY_EXISTS:
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.OBJECT_ALREADY_EXISTS,
                     original_exception=dataset_errors.DatasetExistError(
@@ -296,7 +309,7 @@ class Dataset:
         Raises:
             SnowflakeMLException: The Dataset no longer exists.
             SnowflakeMLException: The specified Dataset version already exists.
-            snowpark_exceptions.SnowparkClientException: An error occurred during Dataset creation.
+            snowpark_exceptions.SnowparkSQLException: An error occurred during Dataset creation.
         Note: During the generation of stage files, data casting will occur. The casting rules are as follows::
             - Data casting:
@@ -321,7 +334,8 @@ class Dataset:
                 - DateType(DATE): Not supported. A warning will be logged.
                 - VariantType(VARIANT): Not supported. A warning will be logged.
         """
-        casted_df = snowpark_dataframe_utils.cast_snowpark_dataframe(input_dataframe)
+        cast_ignore_cols = (exclude_cols or []) + (label_cols or [])
+        casted_df = snowpark_dataframe_utils.cast_snowpark_dataframe(input_dataframe, ignore_columns=cast_ignore_cols)
         if shuffle:
             casted_df = casted_df.order_by(functions.random())
@@ -367,19 +381,19 @@ class Dataset:
             return Dataset(self._session, self._db, self._schema, self._name, version)
-        except snowpark_exceptions.SnowparkClientException as e:
-            if e.message.startswith(dataset_errors.ERRNO_DATASET_NOT_EXIST):
+        except snowpark_exceptions.SnowparkSQLException as e:
+            if e.sql_error_code == dataset_errors.ERRNO_DATASET_NOT_EXIST:
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.NOT_FOUND,
                     original_exception=dataset_errors.DatasetNotExistError(
                         dataset_error_messages.DATASET_NOT_EXIST.format(self.fully_qualified_name)
                     ),
                 ) from e
-            elif (
-                e.message.startswith(dataset_errors.ERRNO_DATASET_VERSION_ALREADY_EXISTS)
-                or e.message.startswith(dataset_errors.ERRNO_VERSION_ALREADY_EXISTS)
-                or e.message.startswith(dataset_errors.ERRNO_FILES_ALREADY_EXISTING)
-            ):
+            elif e.sql_error_code in {
+                dataset_errors.ERRNO_DATASET_VERSION_ALREADY_EXISTS,
+                dataset_errors.ERRNO_VERSION_ALREADY_EXISTS,
+                dataset_errors.ERRNO_FILES_ALREADY_EXISTING,
+            }:
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.OBJECT_ALREADY_EXISTS,
                     original_exception=dataset_errors.DatasetExistError(
@@ -435,9 +449,8 @@ class Dataset:
                 .has_column(_DATASET_VERSION_NAME_COL, allow_empty=True)
                 .validate()
             )
-        except snowpark_exceptions.SnowparkClientException as e:
-            # Snowpark wraps the Python Connector error code in the head of the error message.
-            if e.message.startswith(dataset_errors.ERRNO_OBJECT_NOT_EXIST):
+        except snowpark_exceptions.SnowparkSQLException as e:
+            if e.sql_error_code == dataset_errors.ERRNO_OBJECT_NOT_EXIST:
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.NOT_FOUND,
                     original_exception=dataset_errors.DatasetNotExistError(
@@ -459,6 +472,12 @@ class Dataset:
                 ),
             )
+    @staticmethod
+    def _load_from_lineage_node(session: snowpark.Session, name: str, version: str) -> "Dataset":
+        """Load the Dataset called name through session and return it with version selected."""
+        loaded_dataset = Dataset.load(session, name)
+        return loaded_dataset.select_version(version)
+# Register Dataset in the lineage registry under the "dataset" key, the same domain string that
+# Dataset.__init__ passes to LineageNode.
+lineage_node.DOMAIN_LINEAGE_REGISTRY["dataset"] = Dataset
 # Utility methods

snowflake-ml-python 1.5.3__py3-none-any.whl → 1.5.4__py3-none-any.whl

snowflake-ml-python 1.5.3__py3-none-any.whl → 1.5.4__py3-none-any.whl