snowflake-ml-python 1.8.6__py3-none-any.whl → 1.9.1__py3-none-any.whl

This diff compares two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the changes between those package versions as they appear in the registry.
Files changed (65)
  1. snowflake/ml/_internal/env_utils.py +44 -3
  2. snowflake/ml/_internal/platform_capabilities.py +52 -2
  3. snowflake/ml/_internal/type_utils.py +1 -1
  4. snowflake/ml/_internal/utils/identifier.py +1 -1
  5. snowflake/ml/_internal/utils/mixins.py +71 -0
  6. snowflake/ml/_internal/utils/service_logger.py +4 -2
  7. snowflake/ml/data/_internal/arrow_ingestor.py +11 -1
  8. snowflake/ml/data/data_connector.py +43 -2
  9. snowflake/ml/data/data_ingestor.py +8 -0
  10. snowflake/ml/data/torch_utils.py +1 -1
  11. snowflake/ml/dataset/dataset.py +3 -2
  12. snowflake/ml/dataset/dataset_reader.py +22 -6
  13. snowflake/ml/experiment/_client/experiment_tracking_sql_client.py +98 -0
  14. snowflake/ml/experiment/_entities/__init__.py +4 -0
  15. snowflake/ml/experiment/_entities/experiment.py +10 -0
  16. snowflake/ml/experiment/_entities/run.py +62 -0
  17. snowflake/ml/experiment/_entities/run_metadata.py +68 -0
  18. snowflake/ml/experiment/_experiment_info.py +63 -0
  19. snowflake/ml/experiment/experiment_tracking.py +319 -0
  20. snowflake/ml/jobs/_utils/constants.py +1 -1
  21. snowflake/ml/jobs/_utils/interop_utils.py +63 -4
  22. snowflake/ml/jobs/_utils/payload_utils.py +5 -3
  23. snowflake/ml/jobs/_utils/query_helper.py +20 -0
  24. snowflake/ml/jobs/_utils/scripts/mljob_launcher.py +5 -1
  25. snowflake/ml/jobs/_utils/spec_utils.py +21 -4
  26. snowflake/ml/jobs/decorators.py +18 -25
  27. snowflake/ml/jobs/job.py +137 -37
  28. snowflake/ml/jobs/manager.py +228 -153
  29. snowflake/ml/lineage/lineage_node.py +2 -2
  30. snowflake/ml/model/_client/model/model_version_impl.py +16 -4
  31. snowflake/ml/model/_client/ops/model_ops.py +12 -3
  32. snowflake/ml/model/_client/ops/service_ops.py +324 -138
  33. snowflake/ml/model/_client/service/model_deployment_spec.py +1 -1
  34. snowflake/ml/model/_client/service/model_deployment_spec_schema.py +3 -1
  35. snowflake/ml/model/_model_composer/model_composer.py +6 -1
  36. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +55 -13
  37. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +1 -0
  38. snowflake/ml/model/_packager/model_env/model_env.py +35 -27
  39. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +41 -2
  40. snowflake/ml/model/_packager/model_handlers/pytorch.py +5 -1
  41. snowflake/ml/model/_packager/model_meta/model_meta.py +3 -1
  42. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -1
  43. snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -3
  44. snowflake/ml/model/_signatures/snowpark_handler.py +55 -3
  45. snowflake/ml/model/event_handler.py +117 -0
  46. snowflake/ml/model/model_signature.py +9 -9
  47. snowflake/ml/model/models/huggingface_pipeline.py +170 -1
  48. snowflake/ml/model/target_platform.py +11 -0
  49. snowflake/ml/model/task.py +9 -0
  50. snowflake/ml/model/type_hints.py +5 -13
  51. snowflake/ml/modeling/framework/base.py +1 -1
  52. snowflake/ml/modeling/metrics/classification.py +14 -14
  53. snowflake/ml/modeling/metrics/correlation.py +19 -8
  54. snowflake/ml/modeling/metrics/metrics_utils.py +2 -0
  55. snowflake/ml/modeling/metrics/ranking.py +6 -6
  56. snowflake/ml/modeling/metrics/regression.py +9 -9
  57. snowflake/ml/monitoring/explain_visualize.py +12 -5
  58. snowflake/ml/registry/_manager/model_manager.py +47 -15
  59. snowflake/ml/registry/registry.py +109 -64
  60. snowflake/ml/version.py +1 -1
  61. {snowflake_ml_python-1.8.6.dist-info → snowflake_ml_python-1.9.1.dist-info}/METADATA +118 -18
  62. {snowflake_ml_python-1.8.6.dist-info → snowflake_ml_python-1.9.1.dist-info}/RECORD +65 -53
  63. {snowflake_ml_python-1.8.6.dist-info → snowflake_ml_python-1.9.1.dist-info}/WHEEL +0 -0
  64. {snowflake_ml_python-1.8.6.dist-info → snowflake_ml_python-1.9.1.dist-info}/licenses/LICENSE.txt +0 -0
  65. {snowflake_ml_python-1.8.6.dist-info → snowflake_ml_python-1.9.1.dist-info}/top_level.txt +0 -0
snowflake/ml/model/event_handler.py
@@ -0,0 +1,117 @@
+ import os
+ import sys
+ from typing import Any, Optional
+
+
+ class _TqdmStatusContext:
+     """A tqdm-based context manager for status updates."""
+
+     def __init__(self, label: str, tqdm_module: Any, total: Optional[int] = None) -> None:
+         self._label = label
+         self._tqdm = tqdm_module
+         self._total = total or 1
+
+     def __enter__(self) -> "_TqdmStatusContext":
+         self._progress_bar = self._tqdm.tqdm(desc=self._label, file=sys.stdout, total=self._total, leave=True)
+         return self
+
+     def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+         self._progress_bar.close()
+
+     def update(self, label: str, *, state: str = "running", expanded: bool = True) -> None:
+         """Update the status by updating the tqdm description."""
+         if state == "complete":
+             self._progress_bar.update(self._progress_bar.total - self._progress_bar.n)
+             self._progress_bar.set_description(label)
+         else:
+             self._progress_bar.set_description(f"{self._label}: {label}")
+
+     def increment(self, n: int = 1) -> None:
+         """Increment the progress bar."""
+         self._progress_bar.update(n)
+
+
+ class _StreamlitStatusContext:
+     """A streamlit-based context manager for status updates with progress bar support."""
+
+     def __init__(self, label: str, streamlit_module: Any, total: Optional[int] = None) -> None:
+         self._label = label
+         self._streamlit = streamlit_module
+         self._total = total
+         self._current = 0
+         self._progress_bar = None
+
+     def __enter__(self) -> "_StreamlitStatusContext":
+         self._status_container = self._streamlit.status(self._label, state="running", expanded=True)
+         if self._total is not None:
+             with self._status_container:
+                 self._progress_bar = self._streamlit.progress(0, text=f"0/{self._total}")
+         return self
+
+     def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+         self._status_container.update(state="complete")
+
+     def update(self, label: str, *, state: str = "running", expanded: bool = True) -> None:
+         """Update the status label."""
+         if state != "complete":
+             label = f"{self._label}: {label}"
+         self._status_container.update(label=label, state=state, expanded=expanded)
+         if self._progress_bar is not None:
+             self._progress_bar.progress(
+                 self._current / self._total if self._total > 0 else 0,
+                 text=f"{label} - {self._current}/{self._total}",
+             )
+
+     def increment(self, n: int = 1) -> None:
+         """Increment the progress."""
+         if self._total is not None:
+             self._current = min(self._current + n, self._total)
+             if self._progress_bar is not None:
+                 progress_value = self._current / self._total if self._total > 0 else 0
+                 self._progress_bar.progress(progress_value, text=f"{self._current}/{self._total}")
+
+
+ class ModelEventHandler:
+     """Event handler for model operations with streamlit-aware status updates."""
+
+     def __init__(self) -> None:
+         self._streamlit = None
+
+         # Try streamlit first
+         try:
+             import streamlit as st
+
+             if st.runtime.exists():
+                 USE_STREAMLIT_WIDGETS = os.getenv("USE_STREAMLIT_WIDGETS", "1") == "1"
+                 if USE_STREAMLIT_WIDGETS:
+                     self._streamlit = st
+         except ImportError:
+             pass
+
+         import tqdm
+
+         self._tqdm = tqdm
+
+     def update(self, message: str) -> None:
+         """Write a message using streamlit if available, otherwise use tqdm."""
+         if self._streamlit is not None:
+             self._streamlit.write(message)
+         else:
+             self._tqdm.tqdm.write(message)
+
+     def status(self, label: str, *, state: str = "running", expanded: bool = True, total: Optional[int] = None) -> Any:
+         """Context manager that provides status updates with optional enhanced display capabilities.
+
+         Args:
+             label: The status label
+             state: The initial state ("running", "complete", "error")
+             expanded: Whether to show expanded view (streamlit only)
+             total: Total number of steps for progress tracking (optional)
+
+         Returns:
+             Status context (Streamlit or Tqdm)
+         """
+         if self._streamlit is not None:
+             return _StreamlitStatusContext(label, self._streamlit, total)
+         else:
+             return _TqdmStatusContext(label, self._tqdm, total)
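For orientation, a minimal usage sketch of the new event handler (the handler and context classes are the ones added above; the deployment steps are invented for illustration and are not taken from the package's own callers):

    # Hypothetical caller of ModelEventHandler as defined in the hunk above.
    handler = ModelEventHandler()
    handler.update("Starting model deployment")

    # status() returns a _StreamlitStatusContext inside a Streamlit app,
    # otherwise a _TqdmStatusContext; both expose update() and increment().
    with handler.status("Deploying model", total=3) as status:
        status.update("uploading artifacts")
        status.increment()
        status.update("building image")
        status.increment()
        status.update("creating service")
        status.increment()
        status.update("done", state="complete")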
snowflake/ml/model/model_signature.py
@@ -16,7 +16,7 @@ from snowflake.ml._internal.exceptions import (
      exceptions as snowml_exceptions,
  )
  from snowflake.ml._internal.utils import formatting, identifier, sql_identifier
- from snowflake.ml.model import type_hints as model_types
+ from snowflake.ml.model import type_hints
  from snowflake.ml.model._signatures import (
      base_handler,
      builtins_handler,
@@ -55,9 +55,9 @@ _MODEL_TELEMETRY_SUBPROJECT = "ModelSignature"


  def _truncate_data(
-     data: model_types.SupportedDataType,
+     data: type_hints.SupportedDataType,
      length: Optional[int] = 100,
- ) -> model_types.SupportedDataType:
+ ) -> type_hints.SupportedDataType:
      for handler in _ALL_DATA_HANDLERS:
          if handler.can_handle(data):
              # If length is None, return the original data
@@ -89,7 +89,7 @@ def _truncate_data(


  def _infer_signature(
-     data: model_types.SupportedLocalDataType, role: Literal["input", "output"], use_snowflake_identifiers: bool = False
+     data: type_hints.SupportedLocalDataType, role: Literal["input", "output"], use_snowflake_identifiers: bool = False
  ) -> Sequence[core.BaseFeatureSpec]:
      """Infer the inputs/outputs signature given a data that could be dataframe, numpy array or list.
      Dispatching is used to separate logic for different types.
@@ -142,7 +142,7 @@ def _rename_signature_with_snowflake_identifiers(


  def _validate_array_or_series_type(
-     arr: Union[model_types._SupportedNumpyArray, pd.Series], feature_type: core.DataType, strict: bool = False
+     arr: Union[type_hints._SupportedNumpyArray, pd.Series], feature_type: core.DataType, strict: bool = False
  ) -> bool:
      original_dtype = arr.dtype
      dtype = arr.dtype
@@ -649,7 +649,7 @@ def _validate_snowpark_type_feature(


  def _convert_local_data_to_df(
-     data: model_types.SupportedLocalDataType, ensure_serializable: bool = False
+     data: type_hints.SupportedLocalDataType, ensure_serializable: bool = False
  ) -> pd.DataFrame:
      """Convert local data to pandas DataFrame or Snowpark DataFrame

@@ -679,7 +679,7 @@


  def _convert_and_validate_local_data(
-     data: model_types.SupportedLocalDataType, features: Sequence[core.BaseFeatureSpec], strict: bool = False
+     data: type_hints.SupportedLocalDataType, features: Sequence[core.BaseFeatureSpec], strict: bool = False
  ) -> pd.DataFrame:
      """Validate the data with features in model signature and convert to DataFrame

@@ -703,8 +703,8 @@
      subproject=_MODEL_TELEMETRY_SUBPROJECT,
  )
  def infer_signature(
-     input_data: model_types.SupportedLocalDataType,
-     output_data: model_types.SupportedLocalDataType,
+     input_data: type_hints.SupportedLocalDataType,
+     output_data: type_hints.SupportedLocalDataType,
      input_feature_names: Optional[list[str]] = None,
      output_feature_names: Optional[list[str]] = None,
      input_data_limit: Optional[int] = 100,
snowflake/ml/model/models/huggingface_pipeline.py
@@ -1,8 +1,22 @@
+ import logging
  import warnings
- from typing import Any, Optional
+ from typing import Any, Optional, Union

  from packaging import version

+ from snowflake import snowpark
+ from snowflake.ml._internal import telemetry
+ from snowflake.ml._internal.human_readable_id import hrid_generator
+ from snowflake.ml._internal.utils import sql_identifier
+ from snowflake.ml.model._client.ops import service_ops
+ from snowflake.snowpark import async_job, session
+
+ logger = logging.getLogger(__name__)
+
+
+ _TELEMETRY_PROJECT = "MLOps"
+ _TELEMETRY_SUBPROJECT = "ModelManagement"
+

  class HuggingFacePipelineModel:
      def __init__(
@@ -214,4 +228,159 @@ class HuggingFacePipelineModel:
          self.token = token
          self.trust_remote_code = trust_remote_code
          self.model_kwargs = model_kwargs
+         self.tokenizer = tokenizer
          self.__dict__.update(kwargs)
+
+     @telemetry.send_api_usage_telemetry(
+         project=_TELEMETRY_PROJECT,
+         subproject=_TELEMETRY_SUBPROJECT,
+         func_params_to_log=[
+             "service_name",
+             "image_build_compute_pool",
+             "service_compute_pool",
+             "image_repo",
+             "gpu_requests",
+             "num_workers",
+             "max_batch_rows",
+         ],
+     )
+     @snowpark._internal.utils.private_preview(version="1.9.1")
+     def create_service(
+         self,
+         *,
+         session: session.Session,
+         # registry.log_model parameters
+         model_name: str,
+         version_name: Optional[str] = None,
+         pip_requirements: Optional[list[str]] = None,
+         conda_dependencies: Optional[list[str]] = None,
+         comment: Optional[str] = None,
+         # model_version_impl.create_service parameters
+         service_name: str,
+         service_compute_pool: str,
+         image_repo: str,
+         image_build_compute_pool: Optional[str] = None,
+         ingress_enabled: bool = False,
+         max_instances: int = 1,
+         cpu_requests: Optional[str] = None,
+         memory_requests: Optional[str] = None,
+         gpu_requests: Optional[Union[str, int]] = None,
+         num_workers: Optional[int] = None,
+         max_batch_rows: Optional[int] = None,
+         force_rebuild: bool = False,
+         build_external_access_integrations: Optional[list[str]] = None,
+         block: bool = True,
+     ) -> Union[str, async_job.AsyncJob]:
+         """Logs a Hugging Face model and creates a service in Snowflake.
+
+         Args:
+             session: The Snowflake session object.
+             model_name: The name of the model in Snowflake.
+             version_name: The version name of the model. Defaults to None.
+             pip_requirements: Pip requirements for the model. Defaults to None.
+             conda_dependencies: Conda dependencies for the model. Defaults to None.
+             comment: Comment for the model. Defaults to None.
+             service_name: The name of the service to create.
+             service_compute_pool: The compute pool for the service.
+             image_repo: The name of the image repository.
+             image_build_compute_pool: The name of the compute pool used to build the model inference image. It uses
+                 the service compute pool if None.
+             ingress_enabled: Whether ingress is enabled. Defaults to False.
+             max_instances: Maximum number of instances. Defaults to 1.
+             cpu_requests: CPU requests configuration. Defaults to None.
+             memory_requests: Memory requests configuration. Defaults to None.
+             gpu_requests: GPU requests configuration. Defaults to None.
+             num_workers: Number of workers. Defaults to None.
+             max_batch_rows: Maximum batch rows. Defaults to None.
+             force_rebuild: Whether to force rebuild the image. Defaults to False.
+             build_external_access_integrations: External access integrations for building the image. Defaults to None.
+             block: Whether to block the operation. Defaults to True.
+
+         Raises:
+             ValueError: if database and schema name is not provided and session doesn't have a
+                 database and schema name.
+
+         Returns:
+             The service ID or an async job object.
+
+         .. # noqa: DAR003
+         """
+         statement_params = telemetry.get_statement_params(
+             project=_TELEMETRY_PROJECT,
+             subproject=_TELEMETRY_SUBPROJECT,
+         )
+
+         database_name_id, schema_name_id, model_name_id = sql_identifier.parse_fully_qualified_name(model_name)
+         session_database_name = session.get_current_database()
+         session_schema_name = session.get_current_schema()
+         if database_name_id is None:
+             if session_database_name is None:
+                 raise ValueError("Either database needs to be provided or needs to be available in session.")
+             database_name_id = sql_identifier.SqlIdentifier(session_database_name)
+         if schema_name_id is None:
+             if session_schema_name is None:
+                 raise ValueError("Either schema needs to be provided or needs to be available in session.")
+             schema_name_id = sql_identifier.SqlIdentifier(session_schema_name)
+
+         if version_name is None:
+             name_generator = hrid_generator.HRID16()
+             version_name = name_generator.generate()[1]
+
+         service_db_id, service_schema_id, service_id = sql_identifier.parse_fully_qualified_name(service_name)
+         image_repo_db_id, image_repo_schema_id, image_repo_id = sql_identifier.parse_fully_qualified_name(image_repo)
+
+         service_operator = service_ops.ServiceOperator(
+             session=session,
+             database_name=database_name_id,
+             schema_name=schema_name_id,
+         )
+         logger.info(f"A service job is going to register the hf model as: {model_name}.{version_name}")
+
+         return service_operator.create_service(
+             database_name=database_name_id,
+             schema_name=schema_name_id,
+             model_name=model_name_id,
+             version_name=sql_identifier.SqlIdentifier(version_name),
+             service_database_name=service_db_id,
+             service_schema_name=service_schema_id,
+             service_name=service_id,
+             image_build_compute_pool_name=(
+                 sql_identifier.SqlIdentifier(image_build_compute_pool)
+                 if image_build_compute_pool
+                 else sql_identifier.SqlIdentifier(service_compute_pool)
+             ),
+             service_compute_pool_name=sql_identifier.SqlIdentifier(service_compute_pool),
+             image_repo_database_name=image_repo_db_id,
+             image_repo_schema_name=image_repo_schema_id,
+             image_repo_name=image_repo_id,
+             ingress_enabled=ingress_enabled,
+             max_instances=max_instances,
+             cpu_requests=cpu_requests,
+             memory_requests=memory_requests,
+             gpu_requests=gpu_requests,
+             num_workers=num_workers,
+             max_batch_rows=max_batch_rows,
+             force_rebuild=force_rebuild,
+             build_external_access_integrations=(
+                 None
+                 if build_external_access_integrations is None
+                 else [sql_identifier.SqlIdentifier(eai) for eai in build_external_access_integrations]
+             ),
+             block=block,
+             statement_params=statement_params,
+             # hf model
+             hf_model_args=service_ops.HFModelArgs(
+                 hf_model_name=self.model,
+                 hf_task=self.task,
+                 hf_tokenizer=self.tokenizer,
+                 hf_revision=self.revision,
+                 hf_token=self.token,
+                 hf_trust_remote_code=bool(self.trust_remote_code),
+                 hf_model_kwargs=self.model_kwargs,
+                 pip_requirements=pip_requirements,
+                 conda_dependencies=conda_dependencies,
+                 comment=comment,
+                 # TODO: remove warehouse in the next release
+                 warehouse=session.get_current_warehouse(),
+             ),
+         )
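The docstring above describes the new one-shot "log and deploy" path for Hugging Face models. A hedged sketch of a call site follows; the constructor arguments are assumed to mirror the existing HuggingFacePipelineModel signature, and every identifier (connection parameters, database, schema, compute pool, image repo) is a placeholder rather than something taken from this diff:

    # Hypothetical invocation sketched from the create_service() signature above.
    from snowflake.ml.model.models.huggingface_pipeline import HuggingFacePipelineModel
    from snowflake.snowpark import Session

    session = Session.builder.configs(connection_parameters).create()  # placeholder config dict

    # Constructor arguments assumed from the attributes referenced in the diff (task, model, ...).
    pipe = HuggingFacePipelineModel(task="text-generation", model="gpt2")
    result = pipe.create_service(
        session=session,
        model_name="MY_DB.MY_SCHEMA.GPT2",
        service_name="GPT2_SERVICE",
        service_compute_pool="GPU_POOL",
        image_repo="MY_DB.MY_SCHEMA.IMAGE_REPO",
        gpu_requests="1",
        ingress_enabled=True,
        block=True,  # with block=False an AsyncJob is returned instead
    )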
snowflake/ml/model/target_platform.py
@@ -0,0 +1,11 @@
+ from enum import Enum
+
+
+ class TargetPlatform(Enum):
+     WAREHOUSE = "WAREHOUSE"
+     SNOWPARK_CONTAINER_SERVICES = "SNOWPARK_CONTAINER_SERVICES"
+
+
+ WAREHOUSE_ONLY = [TargetPlatform.WAREHOUSE]
+ SNOWPARK_CONTAINER_SERVICES_ONLY = [TargetPlatform.SNOWPARK_CONTAINER_SERVICES]
+ BOTH_WAREHOUSE_AND_SNOWPARK_CONTAINER_SERVICES = [TargetPlatform.WAREHOUSE, TargetPlatform.SNOWPARK_CONTAINER_SERVICES]
snowflake/ml/model/task.py
@@ -0,0 +1,9 @@
+ from enum import Enum
+
+
+ class Task(Enum):
+     UNKNOWN = "UNKNOWN"
+     TABULAR_BINARY_CLASSIFICATION = "TABULAR_BINARY_CLASSIFICATION"
+     TABULAR_MULTI_CLASSIFICATION = "TABULAR_MULTI_CLASSIFICATION"
+     TABULAR_REGRESSION = "TABULAR_REGRESSION"
+     TABULAR_RANKING = "TABULAR_RANKING"
snowflake/ml/model/type_hints.py
@@ -1,10 +1,12 @@
  # mypy: disable-error-code="import"
- from enum import Enum
  from typing import TYPE_CHECKING, Literal, Sequence, TypedDict, TypeVar, Union

  import numpy.typing as npt
  from typing_extensions import NotRequired

+ from snowflake.ml.model.target_platform import TargetPlatform
+ from snowflake.ml.model.task import Task
+
  if TYPE_CHECKING:
      import catboost
      import keras
@@ -321,17 +323,7 @@ ModelLoadOption = Union[
  ]


- class Task(Enum):
-     UNKNOWN = "UNKNOWN"
-     TABULAR_BINARY_CLASSIFICATION = "TABULAR_BINARY_CLASSIFICATION"
-     TABULAR_MULTI_CLASSIFICATION = "TABULAR_MULTI_CLASSIFICATION"
-     TABULAR_REGRESSION = "TABULAR_REGRESSION"
-     TABULAR_RANKING = "TABULAR_RANKING"
-
-
- class TargetPlatform(Enum):
-     WAREHOUSE = "WAREHOUSE"
-     SNOWPARK_CONTAINER_SERVICES = "SNOWPARK_CONTAINER_SERVICES"
+ SupportedTargetPlatformType = Union[TargetPlatform, str]


- SupportedTargetPlatformType = Union[TargetPlatform, str]
+ __all__ = ["TargetPlatform", "Task"]
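Read together with the new task.py and target_platform.py files above, these hunks move the Task and TargetPlatform enums out of type_hints.py while re-importing and re-exporting them, so the old access path should keep resolving. A small check of that reading (an inference from the diff, not package documentation):

    # The enums now live in snowflake/ml/model/task.py and target_platform.py,
    # but remain reachable through the old module.
    from snowflake.ml.model import type_hints

    assert type_hints.Task.TABULAR_REGRESSION.value == "TABULAR_REGRESSION"
    assert type_hints.TargetPlatform.WAREHOUSE.value == "WAREHOUSE"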
snowflake/ml/modeling/framework/base.py
@@ -698,7 +698,7 @@ class BaseTransformer(BaseEstimator):
          self,
          attribute: Optional[Mapping[str, Union[int, float, str, Iterable[Union[int, float, str]]]]],
          dtype: Optional[type] = None,
-     ) -> Optional[npt.NDArray[Union[np.int_, np.float_, np.str_]]]:
+     ) -> Optional[npt.NDArray[Union[np.int_, np.float64, np.str_]]]:
          """
          Convert the attribute from dict to ndarray based on the order of `self.input_cols`.

snowflake/ml/modeling/metrics/classification.py
@@ -96,7 +96,7 @@ def confusion_matrix(
      labels: Optional[npt.ArrayLike] = None,
      sample_weight_col_name: Optional[str] = None,
      normalize: Optional[str] = None,
- ) -> Union[npt.NDArray[np.int_], npt.NDArray[np.float_]]:
+ ) -> Union[npt.NDArray[np.int_], npt.NDArray[np.float64]]:
      """
      Compute confusion matrix to evaluate the accuracy of a classification.

@@ -320,7 +320,7 @@ def f1_score(
      average: Optional[str] = "binary",
      sample_weight_col_name: Optional[str] = None,
      zero_division: Union[str, int] = "warn",
- ) -> Union[float, npt.NDArray[np.float_]]:
+ ) -> Union[float, npt.NDArray[np.float64]]:
      """
      Compute the F1 score, also known as balanced F-score or F-measure.

@@ -414,7 +414,7 @@ def fbeta_score(
      average: Optional[str] = "binary",
      sample_weight_col_name: Optional[str] = None,
      zero_division: Union[str, int] = "warn",
- ) -> Union[float, npt.NDArray[np.float_]]:
+ ) -> Union[float, npt.NDArray[np.float64]]:
      """
      Compute the F-beta score.

@@ -696,7 +696,7 @@ def precision_recall_fscore_support(
      zero_division: Union[str, int] = "warn",
  ) -> Union[
      tuple[float, float, float, None],
-     tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]],
+     tuple[npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64]],
  ]:
      """
      Compute precision, recall, F-measure and support for each class.
@@ -855,7 +855,7 @@ def precision_recall_fscore_support(

      res: Union[
          tuple[float, float, float, None],
-         tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]],
+         tuple[npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64]],
      ] = result_object[:4]
      warning = result_object[-1]
      if warning:
@@ -1050,7 +1050,7 @@ def _register_multilabel_confusion_matrix_computer(

          def end_partition(
              self,
-         ) -> Iterable[tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]]]:
+         ) -> Iterable[tuple[npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64]]]:
              MCM = metrics.multilabel_confusion_matrix(
                  self._y_true,
                  self._y_pred,
@@ -1098,7 +1098,7 @@ def _binary_precision_score(
      pos_label: Union[str, int] = 1,
      sample_weight_col_name: Optional[str] = None,
      zero_division: Union[str, int] = "warn",
- ) -> Union[float, npt.NDArray[np.float_]]:
+ ) -> Union[float, npt.NDArray[np.float64]]:

      statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT)

@@ -1173,7 +1173,7 @@ def precision_score(
      average: Optional[str] = "binary",
      sample_weight_col_name: Optional[str] = None,
      zero_division: Union[str, int] = "warn",
- ) -> Union[float, npt.NDArray[np.float_]]:
+ ) -> Union[float, npt.NDArray[np.float64]]:
      """
      Compute the precision.

@@ -1271,7 +1271,7 @@ def recall_score(
      average: Optional[str] = "binary",
      sample_weight_col_name: Optional[str] = None,
      zero_division: Union[str, int] = "warn",
- ) -> Union[float, npt.NDArray[np.float_]]:
+ ) -> Union[float, npt.NDArray[np.float64]]:
      """
      Compute the recall.

@@ -1406,14 +1406,14 @@ def _check_binary_labels(


  def _prf_divide(
-     numerator: npt.NDArray[np.float_],
-     denominator: npt.NDArray[np.float_],
+     numerator: npt.NDArray[np.float64],
+     denominator: npt.NDArray[np.float64],
      metric: str,
      modifier: str,
      average: Optional[str] = None,
      warn_for: Union[tuple[str, ...], set[str]] = ("precision", "recall", "f-score"),
      zero_division: Union[str, int] = "warn",
- ) -> npt.NDArray[np.float_]:
+ ) -> npt.NDArray[np.float64]:
      """Performs division and handles divide-by-zero.

      On zero-division, sets the corresponding result elements equal to
@@ -1436,7 +1436,7 @@ def _prf_divide(
              "warn", this acts as 0, but warnings are also raised.

      Returns:
-         npt.NDArray[np.float_]: Result of the division, an array of floats.
+         npt.NDArray[np.float64]: Result of the division, an array of floats.
      """
      mask = denominator == 0.0
      denominator = denominator.copy()
@@ -1522,7 +1522,7 @@ def _check_zero_division(zero_division: Union[int, float, str]) -> float:
          return np.nan


- def _nanaverage(a: npt.NDArray[np.float_], weights: Optional[npt.ArrayLike] = None) -> Any:
+ def _nanaverage(a: npt.NDArray[np.float64], weights: Optional[npt.ArrayLike] = None) -> Any:
      """Compute the weighted average, ignoring NaNs.

      Args:
snowflake/ml/modeling/metrics/correlation.py
@@ -26,7 +26,7 @@ def correlation(*, df: snowpark.DataFrame, columns: Optional[Collection[str]] =
      The below steps explain how correlation matrix is computed in a distributed way:
      Let n = # of rows in the dataframe; sqrt_n = sqrt(n); X, Y are 2 columns in the dataframe
      Correlation(X, Y) = numerator/denominator where
-     numerator = dot(X/sqrt_n, Y/sqrt_n) - sum(X/n)*sum(X/n)
+     numerator = dot(X/sqrt_n, Y/sqrt_n) - sum(X/n)*sum(Y/n)
      denominator = std_dev(X)*std_dev(Y)
      std_dev(X) = sqrt(dot(X/sqrt_n, X/sqrt_n) - sum(X/n)*sum(X/n))

@@ -74,27 +74,38 @@ def correlation(*, df: snowpark.DataFrame, columns: Optional[Collection[str]] =
      # Pushing this to a udtf requires creating a temp udtf which takes about 20 secs, so it doesn't make sense
      # to have this in a udtf.
      n_cols = len(columns)
-     sum_arr = np.zeros(n_cols)
-     squared_sum_arr = np.zeros(n_cols)
+     column_means = np.zeros(n_cols)
+     mean_of_squares = np.zeros(n_cols)
      dot_prod = np.zeros((n_cols, n_cols))
      # Get sum, dot_prod and squared sum array from the results.
      for i in range(len(results)):
          x = results[i]
          if x[1] == "sum_by_count":
-             sum_arr = cloudpickle.loads(x[0])
+             column_means = cloudpickle.loads(x[0])
          else:
              row = int(x[1].strip("row_"))
              dot_prod[row, :] = cloudpickle.loads(x[0])
-             squared_sum_arr[row] = dot_prod[row, row]
+             mean_of_squares[row] = dot_prod[row, row]

      # sum(X/n)*sum(Y/n) is computed for all combinations of X,Y (columns in the dataframe)
-     exey_arr = np.einsum("t,m->tm", sum_arr, sum_arr, optimize="optimal")
+     exey_arr = np.einsum("t,m->tm", column_means, column_means, optimize="optimal")
      numerator_matrix = dot_prod - exey_arr

      # standard deviation for all columns in the dataframe
-     stddev_arr = np.sqrt(squared_sum_arr - np.einsum("i, i -> i", sum_arr, sum_arr, optimize="optimal"))
+     variance_arr = mean_of_squares - np.einsum("i, i -> i", column_means, column_means, optimize="optimal")
+     # ensure non-negative values from potential precision issues where variance might be slightly negative
+     variance_arr = np.maximum(variance_arr, 0)
+     stddev_arr = np.sqrt(variance_arr)
      # std_dev(X)*std_dev(Y) is computed for all combinations of X,Y (columns in the dataframe)
      denominator_matrix = np.einsum("t,m->tm", stddev_arr, stddev_arr, optimize="optimal")
-     corr_res = numerator_matrix / denominator_matrix
+
+     # Use np.divide to handle NaN cases
+     corr_res = np.divide(
+         numerator_matrix,
+         denominator_matrix,
+         out=np.full_like(numerator_matrix, np.nan),
+         where=(denominator_matrix != 0),
+     )
+
      correlation_matrix = pd.DataFrame(corr_res, columns=columns, index=columns)
      return correlation_matrix
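The docstring fix above corrects the numerator to sum(X/n)*sum(Y/n), and the new code clamps tiny negative variances and guards zero denominators. A small self-contained numpy check of the corrected formula against numpy's own corrcoef (illustrative only, not code from the package):

    import numpy as np

    rng = np.random.default_rng(0)
    data = rng.normal(size=(1000, 3))
    n = data.shape[0]
    sqrt_n = np.sqrt(n)

    scaled = data / sqrt_n                      # X/sqrt_n for every column
    dot_prod = scaled.T @ scaled                # dot(X/sqrt_n, Y/sqrt_n) = E[XY]
    col_means = data.sum(axis=0) / n            # sum(X/n) = E[X]

    numerator = dot_prod - np.outer(col_means, col_means)   # E[XY] - E[X]E[Y]
    variance = np.diag(dot_prod) - col_means**2
    stddev = np.sqrt(np.maximum(variance, 0))                # clamp tiny negative values, as in the new code
    denominator = np.outer(stddev, stddev)

    corr = np.divide(numerator, denominator, out=np.full_like(numerator, np.nan), where=denominator != 0)
    assert np.allclose(corr, np.corrcoef(data, rowvar=False))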
snowflake/ml/modeling/metrics/metrics_utils.py
@@ -60,6 +60,7 @@ def register_accumulator_udtf(*, session: Session, statement_params: dict[str, A
          ),
          input_types=[T.BinaryType()],
          packages=[f"numpy=={np.__version__}", f"cloudpickle=={cloudpickle.__version__}"],
+         imports=[], # Prevents unnecessary import resolution.
          name=accumulator,
          is_permanent=False,
          replace=True,
@@ -175,6 +176,7 @@ def register_sharded_dot_sum_computer(*, session: Session, statement_params: dic
          ),
          input_types=[T.ArrayType(), T.IntegerType(), T.IntegerType()],
          packages=[f"numpy=={np.__version__}", f"cloudpickle=={cloudpickle.__version__}"],
+         imports=[], # Prevents unnecessary import resolution.
          name=sharded_dot_and_sum_computer,
          is_permanent=False,
          replace=True,
snowflake/ml/modeling/metrics/ranking.py
@@ -26,7 +26,7 @@ def precision_recall_curve(
      probas_pred_col_name: str,
      pos_label: Optional[Union[str, int]] = None,
      sample_weight_col_name: Optional[str] = None,
- ) -> tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]]:
+ ) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64]]:
      """
      Compute precision-recall pairs for different probability thresholds.

@@ -125,7 +125,7 @@ def precision_recall_curve(

      kwargs = telemetry.get_sproc_statement_params_kwargs(precision_recall_curve_anon_sproc, statement_params)
      result_object = result.deserialize(session, precision_recall_curve_anon_sproc(session, **kwargs))
-     res: tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = result_object
+     res: tuple[npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64]] = result_object
      return res


@@ -140,7 +140,7 @@ def roc_auc_score(
      max_fpr: Optional[float] = None,
      multi_class: str = "raise",
      labels: Optional[npt.ArrayLike] = None,
- ) -> Union[float, npt.NDArray[np.float_]]:
+ ) -> Union[float, npt.NDArray[np.float64]]:
      """
      Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
      from prediction scores.
@@ -276,7 +276,7 @@ def roc_auc_score(

      kwargs = telemetry.get_sproc_statement_params_kwargs(roc_auc_score_anon_sproc, statement_params)
      result_object = result.deserialize(session, roc_auc_score_anon_sproc(session, **kwargs))
-     auc: Union[float, npt.NDArray[np.float_]] = result_object
+     auc: Union[float, npt.NDArray[np.float64]] = result_object
      return auc


@@ -289,7 +289,7 @@ def roc_curve(
      pos_label: Optional[Union[str, int]] = None,
      sample_weight_col_name: Optional[str] = None,
      drop_intermediate: bool = True,
- ) -> tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]]:
+ ) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64]]:
      """
      Compute Receiver operating characteristic (ROC).

@@ -380,6 +380,6 @@ def roc_curve(
      kwargs = telemetry.get_sproc_statement_params_kwargs(roc_curve_anon_sproc, statement_params)
      result_object = result.deserialize(session, roc_curve_anon_sproc(session, **kwargs))

-     res: tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = result_object
+     res: tuple[npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64]] = result_object

      return res