PyPI - snowflake-ml-python - Versions diffs - 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl - Mend

snowflake-ml-python 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (176) hide show

snowflake/ml/_internal/telemetry.py CHANGED Viewed

@@ -584,3 +584,22 @@ class _SourceTelemetryClient:
         """Send the telemetry data batch immediately."""
         if self._telemetry:
             self._telemetry.send_batch()
+def get_sproc_statement_params_kwargs(sproc: Callable[..., Any], statement_params: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Get statement_params keyword argument for sproc call.
+    Args:
+        sproc: sproc function
+        statement_params: dictionary to be passed as statement params, if possible
+    Returns:
+        Keyword arguments dict
+    """
+    sproc_argspec = inspect.getfullargspec(sproc)
+    kwargs = {}
+    if "statement_params" in sproc_argspec.args:
+        kwargs["statement_params"] = statement_params
+    return kwargs

snowflake/ml/model/_client/ops/model_ops.py CHANGED Viewed

@@ -4,9 +4,8 @@ import tempfile
 from typing import Any, Dict, List, Optional, Union, cast
 import yaml
-from packaging import version
-from snowflake.ml._internal.utils import identifier, snowflake_env, sql_identifier
+from snowflake.ml._internal.utils import identifier, sql_identifier
 from snowflake.ml.model import model_signature, type_hints
 from snowflake.ml.model._client.ops import metadata_ops
 from snowflake.ml.model._client.sql import (
@@ -25,8 +24,6 @@ from snowflake.ml.model._signatures import snowpark_handler
 from snowflake.snowpark import dataframe, row, session
 from snowflake.snowpark._internal import utils as snowpark_utils
-_TAG_ON_MODEL_AVAILABLE_VERSION = version.parse("8.2.0")
 class ModelOperator:
     def __init__(
@@ -296,21 +293,14 @@ class ModelOperator:
         tag_value: str,
         statement_params: Optional[Dict[str, Any]] = None,
     ) -> None:
-        sf_version = snowflake_env.get_current_snowflake_version(self._session, statement_params=statement_params)
-        if sf_version >= _TAG_ON_MODEL_AVAILABLE_VERSION:
-            self._tag_client.set_tag_on_model(
-                model_name=model_name,
-                tag_database_name=tag_database_name,
-                tag_schema_name=tag_schema_name,
-                tag_name=tag_name,
-                tag_value=tag_value,
-                statement_params=statement_params,
-            )
-        else:
-            raise NotImplementedError(
-                f"`set_tag` won't work before Snowflake version {_TAG_ON_MODEL_AVAILABLE_VERSION},"
-                f" currently is {sf_version}"
-            )
+        self._tag_client.set_tag_on_model(
+            model_name=model_name,
+            tag_database_name=tag_database_name,
+            tag_schema_name=tag_schema_name,
+            tag_name=tag_name,
+            tag_value=tag_value,
+            statement_params=statement_params,
+        )
     def unset_tag(
         self,
@@ -321,20 +311,13 @@ class ModelOperator:
         tag_name: sql_identifier.SqlIdentifier,
         statement_params: Optional[Dict[str, Any]] = None,
     ) -> None:
-        sf_version = snowflake_env.get_current_snowflake_version(self._session, statement_params=statement_params)
-        if sf_version >= _TAG_ON_MODEL_AVAILABLE_VERSION:
-            self._tag_client.unset_tag_on_model(
-                model_name=model_name,
-                tag_database_name=tag_database_name,
-                tag_schema_name=tag_schema_name,
-                tag_name=tag_name,
-                statement_params=statement_params,
-            )
-        else:
-            raise NotImplementedError(
-                f"`unset_tag` won't work before Snowflake version {_TAG_ON_MODEL_AVAILABLE_VERSION},"
-                f" currently is {sf_version}"
-            )
+        self._tag_client.unset_tag_on_model(
+            model_name=model_name,
+            tag_database_name=tag_database_name,
+            tag_schema_name=tag_schema_name,
+            tag_name=tag_name,
+            statement_params=statement_params,
+        )
     def get_model_version_manifest(
         self,
@@ -382,11 +365,6 @@ class ModelOperator:
         version_name: sql_identifier.SqlIdentifier,
         statement_params: Optional[Dict[str, Any]] = None,
     ) -> model_manifest_schema.SnowparkMLDataDict:
-        if (
-            snowflake_env.get_current_snowflake_version(self._session)
-            < model_manifest_schema.MANIFEST_USER_DATA_ENABLE_VERSION
-        ):
-            raise NotImplementedError("User_data has not been supported yet.")
         raw_user_data_json_string = self._model_client.show_versions(
             model_name=model_name,
             version_name=version_name,

snowflake/ml/model/_client/sql/model.py CHANGED Viewed

@@ -3,10 +3,8 @@ from typing import Any, Dict, List, Optional
 from snowflake.ml._internal.utils import (
     identifier,
     query_result_checker,
-    snowflake_env,
     sql_identifier,
 )
-from snowflake.ml.model._model_composer.model_manifest import model_manifest_schema
 from snowflake.snowpark import row, session
@@ -89,12 +87,8 @@ class ModelSQLClient:
             .has_column(ModelSQLClient.MODEL_VERSION_NAME_COL_NAME, allow_empty=True)
             .has_column(ModelSQLClient.MODEL_VERSION_COMMENT_COL_NAME, allow_empty=True)
             .has_column(ModelSQLClient.MODEL_VERSION_METADATA_COL_NAME, allow_empty=True)
+            .has_column(ModelSQLClient.MODEL_VERSION_USER_DATA_COL_NAME, allow_empty=True)
         )
-        if (
-            snowflake_env.get_current_snowflake_version(self._session)
-            >= model_manifest_schema.MANIFEST_USER_DATA_ENABLE_VERSION
-        ):
-            res = res.has_column(ModelSQLClient.MODEL_VERSION_USER_DATA_COL_NAME, allow_empty=True)
         if validate_result and version_name:
             res = res.has_dimensions(expected_rows=1)

snowflake/ml/model/_client/sql/model_version.py CHANGED Viewed

@@ -146,24 +146,29 @@ class ModelVersionSQLClient:
         returns: List[Tuple[str, spt.DataType, sql_identifier.SqlIdentifier]],
         statement_params: Optional[Dict[str, Any]] = None,
     ) -> dataframe.DataFrame:
-        tmp_table_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.TABLE)
-        INTERMEDIATE_TABLE_NAME = identifier.get_schema_level_object_identifier(
-            self._database_name.identifier(),
-            self._schema_name.identifier(),
-            tmp_table_name,
-        )
-        input_df.write.save_as_table(  # type: ignore[call-overload]
-            table_name=INTERMEDIATE_TABLE_NAME,
-            mode="errorifexists",
-            table_type="temporary",
-            statement_params=statement_params,
-        )
+        with_statements = []
+        if len(input_df.queries["queries"]) == 1 and len(input_df.queries["post_actions"]) == 0:
+            INTERMEDIATE_TABLE_NAME = "SNOWPARK_ML_MODEL_INFERENCE_INPUT"
+            with_statements.append(f"{INTERMEDIATE_TABLE_NAME} AS ({input_df.queries['queries'][0]})")
+        else:
+            tmp_table_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.TABLE)
+            INTERMEDIATE_TABLE_NAME = identifier.get_schema_level_object_identifier(
+                self._database_name.identifier(),
+                self._schema_name.identifier(),
+                tmp_table_name,
+            )
+            input_df.write.save_as_table(  # type: ignore[call-overload]
+                table_name=INTERMEDIATE_TABLE_NAME,
+                mode="errorifexists",
+                table_type="temporary",
+                statement_params=statement_params,
+            )
         INTERMEDIATE_OBJ_NAME = "TMP_RESULT"
         module_version_alias = "MODEL_VERSION_ALIAS"
-        model_version_alias_sql = (
-            f"WITH {module_version_alias} AS "
+        with_statements.append(
+            f"{module_version_alias} AS "
             f"MODEL {self.fully_qualified_model_name(model_name)} VERSION {version_name.identifier()}"
         )
@@ -174,7 +179,7 @@ class ModelVersionSQLClient:
         args_sql = ", ".join(args_sql_list)
         sql = textwrap.dedent(
-            f"""{model_version_alias_sql}
+            f"""WITH {','.join(with_statements)}
                 SELECT *,
                     {module_version_alias}!{method_name.identifier()}({args_sql}) AS {INTERMEDIATE_OBJ_NAME}
                 FROM {INTERMEDIATE_TABLE_NAME}"""

snowflake/ml/model/_model_composer/model_manifest/model_manifest.py CHANGED Viewed

@@ -4,7 +4,6 @@ from typing import Any, Dict, List, Optional, cast
 import yaml
-from snowflake.ml._internal.utils import snowflake_env
 from snowflake.ml.model import type_hints
 from snowflake.ml.model._model_composer.model_manifest import model_manifest_schema
 from snowflake.ml.model._model_composer.model_method import (
@@ -84,11 +83,7 @@ class ModelManifest:
             ],
         )
-        if (
-            snowflake_env.get_current_snowflake_version(session)
-            >= model_manifest_schema.MANIFEST_USER_DATA_ENABLE_VERSION
-        ):
-            manifest_dict["user_data"] = self.generate_user_data_with_client_data(model_meta)
+        manifest_dict["user_data"] = self.generate_user_data_with_client_data(model_meta)
         with (self.workspace_path / ModelManifest.MANIFEST_FILE_REL_PATH).open("w", encoding="utf-8") as f:
             # Anchors are not supported in the server, avoid that.

snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py CHANGED Viewed

@@ -2,14 +2,12 @@
 from typing import Any, Dict, List, Literal, TypedDict
-from packaging import version
 from typing_extensions import NotRequired, Required
 from snowflake.ml.model import model_signature
 MODEL_MANIFEST_VERSION = "1.0"
-MANIFEST_USER_DATA_ENABLE_VERSION = version.parse("8.2.0")
 MANIFEST_CLIENT_DATA_KEY_NAME = "snowpark_ml_data"
 MANIFEST_CLIENT_DATA_SCHEMA_VERSION = "2024-02-01"

snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py CHANGED Viewed

@@ -1 +1,10 @@
-REQUIREMENTS = ['absl-py>=0.15,<2', 'anyio>=3.5.0,<4', 'numpy>=1.23,<2', 'packaging>=20.9,<24', 'pandas>=1.0.0,<2', 'pyyaml>=6.0,<7', 'snowflake-snowpark-python>=1.8.0,<2', 'typing-extensions>=4.1.0,<5']
+REQUIREMENTS = [
+    "absl-py>=0.15,<2",
+    "anyio>=3.5.0,<4",
+    "numpy>=1.23,<2",
+    "packaging>=20.9,<24",
+    "pandas>=1.0.0,<2",
+    "pyyaml>=6.0,<7",
+    "snowflake-snowpark-python>=1.8.0,<2",
+    "typing-extensions>=4.1.0,<5"
+]

snowflake/ml/model/_model_composer/model_runtime/model_runtime.py CHANGED Viewed

@@ -62,7 +62,6 @@ class ModelRuntime:
                     model_env.ModelDependency(requirement=dep, pip_name=requirements.Requirement(dep).name)
                     for dep in _UDF_INFERENCE_DEPENDENCIES
                 ],
-                check_local_version=True,
             )
         else:
             self.runtime_env.include_if_absent(
@@ -70,7 +69,6 @@ class ModelRuntime:
                     model_env.ModelDependency(requirement=dep, pip_name=requirements.Requirement(dep).name)
                     for dep in _UDF_INFERENCE_DEPENDENCIES + [snowml_pkg_spec]
                 ],
-                check_local_version=True,
             )
     def save(self, workspace_path: pathlib.Path) -> model_manifest_schema.ModelRuntimeDict:

snowflake/ml/model/_packager/model_meta/_core_requirements.py CHANGED Viewed

@@ -1 +1,11 @@
-REQUIREMENTS = ['absl-py>=0.15,<2', 'anyio>=3.5.0,<4', 'cloudpickle>=2.0.0', 'numpy>=1.23,<2', 'packaging>=20.9,<24', 'pandas>=1.0.0,<2', 'pyyaml>=6.0,<7', 'snowflake-snowpark-python>=1.8.0,<2', 'typing-extensions>=4.1.0,<5']
+REQUIREMENTS = [
+    "absl-py>=0.15,<2",
+    "anyio>=3.5.0,<4",
+    "cloudpickle>=2.0.0",
+    "numpy>=1.23,<2",
+    "packaging>=20.9,<24",
+    "pandas>=1.0.0,<2",
+    "pyyaml>=6.0,<7",
+    "snowflake-snowpark-python>=1.8.0,<2",
+    "typing-extensions>=4.1.0,<5"
+]

snowflake/ml/model/_packager/model_meta/_packaging_requirements.py ADDED Viewed

@@ -0,0 +1,3 @@
+REQUIREMENTS = [
+    "cloudpickle>=2.0.0"
+]

snowflake/ml/model/_packager/model_meta/model_meta.py CHANGED Viewed

@@ -18,6 +18,7 @@ from snowflake.ml.model import model_signature, type_hints as model_types
 from snowflake.ml.model._packager.model_env import model_env
 from snowflake.ml.model._packager.model_meta import (
     _core_requirements,
+    _packaging_requirements,
     model_blob_meta,
     model_meta_schema,
 )
@@ -26,7 +27,8 @@ from snowflake.ml.model._packager.model_meta_migrator import migrator_plans
 MODEL_METADATA_FILE = "model.yaml"
 MODEL_CODE_DIR = "code"
-_PACKAGING_CORE_DEPENDENCIES = _core_requirements.REQUIREMENTS
+_PACKAGING_CORE_DEPENDENCIES = _core_requirements.REQUIREMENTS  # Legacy Model only
+_PACKAGING_REQUIREMENTS = _packaging_requirements.REQUIREMENTS  # New Model only
 _SNOWFLAKE_PKG_NAME = "snowflake"
 _SNOWFLAKE_ML_PKG_NAME = f"{_SNOWFLAKE_PKG_NAME}.ml"
@@ -73,6 +75,8 @@ def create_model_metadata(
     model_dir_path = os.path.normpath(model_dir_path)
     embed_local_ml_library = kwargs.pop("embed_local_ml_library", False)
     legacy_save = kwargs.pop("_legacy_save", False)
+    relax_version = kwargs.pop("relax_version", False)
     if embed_local_ml_library:
         # Use the last one which is loaded first, that is mean, it is loaded from site-packages.
         # We could make sure that user does not overwrite our library with their code follow the same naming.
@@ -94,6 +98,8 @@ def create_model_metadata(
         pip_requirements=pip_requirements,
         python_version=python_version,
         embed_local_ml_library=embed_local_ml_library,
+        legacy_save=legacy_save,
+        relax_version=relax_version,
     )
     if embed_local_ml_library:
@@ -146,6 +152,8 @@ def _create_env_for_model_metadata(
     pip_requirements: Optional[List[str]] = None,
     python_version: Optional[str] = None,
     embed_local_ml_library: bool = False,
+    legacy_save: bool = False,
+    relax_version: bool = False,
 ) -> model_env.ModelEnv:
     env = model_env.ModelEnv()
@@ -154,11 +162,14 @@ def _create_env_for_model_metadata(
     env.pip_requirements = pip_requirements  # type: ignore[assignment]
     env.python_version = python_version  # type: ignore[assignment]
     env.snowpark_ml_version = snowml_env.VERSION
+    requirements_to_add = _PACKAGING_CORE_DEPENDENCIES if legacy_save else _PACKAGING_REQUIREMENTS
     if embed_local_ml_library:
         env.include_if_absent(
             [
                 model_env.ModelDependency(requirement=dep, pip_name=requirements.Requirement(dep).name)
-                for dep in _PACKAGING_CORE_DEPENDENCIES
+                for dep in requirements_to_add
             ],
             check_local_version=True,
         )
@@ -166,11 +177,14 @@ def _create_env_for_model_metadata(
         env.include_if_absent(
             [
                 model_env.ModelDependency(requirement=dep, pip_name=requirements.Requirement(dep).name)
-                for dep in _PACKAGING_CORE_DEPENDENCIES + [env_utils.SNOWPARK_ML_PKG_NAME]
+                for dep in requirements_to_add + [env_utils.SNOWPARK_ML_PKG_NAME]
             ],
             check_local_version=True,
         )
+    if relax_version:
+        env.relax_version()
     return env

snowflake/ml/model/type_hints.py CHANGED Viewed

@@ -198,9 +198,12 @@ class BaseModelSaveOption(TypedDict):
     """Options for saving the model.
     embed_local_ml_library: Embedding local SnowML into the code directory of the folder.
+    relax_version: Whether or not relax the version constraints of the dependencies if unresolvable. It detects any
+        ==x.y.z in specifiers and replaced with >=x.y, <(x+1). Defaults to False.
     """
     embed_local_ml_library: NotRequired[bool]
+    relax_version: NotRequired[bool]
     _legacy_save: NotRequired[bool]
     method_options: NotRequired[Dict[str, ModelMethodSaveOptions]]

snowflake/ml/modeling/_internal/distributed_hpo_trainer.py CHANGED Viewed

@@ -4,11 +4,12 @@ import io
 import os
 import posixpath
 import sys
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 import cloudpickle as cp
 import numpy as np
 from sklearn import model_selection
+from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.utils import (
@@ -41,23 +42,28 @@ DEFAULT_UDTF_NJOBS = 3
 def construct_cv_results(
+    estimator: Union[GridSearchCV, RandomizedSearchCV],
+    n_split: int,
+    param_grid: List[Dict[str, Any]],
     cv_results_raw_hex: List[Row],
     cross_validator_indices_length: int,
     parameter_grid_length: int,
-    search_cv_kwargs: Dict[str, Any],
-) -> Tuple[bool, Dict[str, Any], int, Set[str]]:
+) -> Tuple[bool, Dict[str, Any]]:
     """Construct the cross validation result from the UDF. Because we accelerate the process
     by the number of cross validation number, and the combination of parameter grids.
     Therefore, we need to stick them back together instead of returning the raw result
     to align with original sklearn result.
     Args:
+        estimator (Union[GridSearchCV, RandomizedSearchCV]): The sklearn object of estimator
+            GridSearchCV or RandomizedSearchCV
+        n_split (int): The number of split, which is determined by build_cross_validator.get_n_splits(X, y, groups)
+        param_grid (List[Dict[str, Any]]): the list of parameter grid or parameter sampler
         cv_results_raw_hex (List[Row]): the list of cv_results from each cv and parameter grid combination.
             Because UDxF can only return string, and numpy array/masked arrays cannot be encoded in a
             json format. Each cv_result is encoded into hex string.
         cross_validator_indices_length (int): the length of cross validator indices
         parameter_grid_length (int): the length of parameter grid combination
-        search_cv_kwargs (Dict[str, Any]): the kwargs for GridSearchCV/RandomSearchCV.
     Raises:
         ValueError: Retrieved empty cross validation results
@@ -67,7 +73,7 @@ def construct_cv_results(
         RuntimeError: Cross validation results are unexpectedly empty for one fold.
     Returns:
-        Tuple[bool, Dict[str, Any], int, Set[str]]: returns multimetric, cv_results_, best_param_index, scorers
+        Tuple[bool, Dict[str, Any]]: returns multimetric, cv_results_
     """
     # Filter corner cases: either the snowpark dataframe result is empty; or index length is empty
     if len(cv_results_raw_hex) == 0:
@@ -79,12 +85,8 @@ def construct_cv_results(
     if parameter_grid_length == 0:
         raise ValueError("Parameter index length is 0. Were there no candidates?")
-    from scipy.stats import rankdata
     # cv_result maintains the original order
     multimetric = False
-    cv_results_ = dict()
-    scorers = set()
     # retrieve the cv_results from udtf table; results are encoded by hex and cloudpickle;
     # We are constructing the raw information back to original form
     if len(cv_results_raw_hex) != cross_validator_indices_length * parameter_grid_length:
@@ -94,7 +96,9 @@ def construct_cv_results(
             "Please retry or contact snowflake support."
         )
-    for param_cv_indices, each_cv_result_hex in enumerate(cv_results_raw_hex):
+    out = []
+    for each_cv_result_hex in cv_results_raw_hex:
         # convert the hex string back to cv_results_
         hex_str = bytes.fromhex(each_cv_result_hex[0])
         with io.BytesIO(hex_str) as f_reload:
@@ -103,85 +107,46 @@ def construct_cv_results(
                 raise RuntimeError(
                     "Cross validation response is empty. This issue may be temporary - please try again."
                 )
-            for k, v in each_cv_result.items():
-                cur_cv_idx = param_cv_indices % cross_validator_indices_length
-                key = k
-                if "split0_test_" in k:
+            temp_dict = dict()
+            """
+            This dictionary has the following keys
+            train_scores : dict of scorer name -> float
+                Score on training set (for all the scorers),
+                returned only if `return_train_score` is `True`.
+            test_scores : dict of scorer name -> float
+                Score on testing set (for all the scorers).
+            fit_time : float
+                Time spent for fitting in seconds.
+            score_time : float
+                Time spent for scoring in seconds.
+            """
+            if estimator.return_train_score:
+                if each_cv_result.get("split0_train_score", None):
+                    # for single scorer, the split0_train_score only contains an array with one value
+                    temp_dict["train_scores"] = each_cv_result["split0_train_score"][0]
+                else:
+                    # if multimetric situation, the format would be
+                    # {metric_name1: value, metric_name2: value, ...}
+                    temp_dict["train_scores"] = {}
                     # For multi-metric evaluation, the scores for all the scorers are available in the
                     # cv_results_ dict at the keys ending with that scorer’s name ('_<scorer_name>')
                     # instead of '_score'.
-                    scorers.add(k[len("split0_test_") :])
-                    key = k.replace("split0_test", f"split{cur_cv_idx}_test")
-                if search_cv_kwargs.get("return_train_score", None) and "split0_train_" in k:
-                    key = k.replace("split0_train", f"split{cur_cv_idx}_train")
-                elif k.startswith("param"):
-                    if cur_cv_idx != 0:
-                        continue
-                if key:
-                    if key not in cv_results_:
-                        cv_results_[key] = v
-                    else:
-                        cv_results_[key] = np.concatenate([cv_results_[key], v])
-    multimetric = len(scorers) > 1
-    # Use numpy to re-calculate all the information in cv_results_ again
-    # Generally speaking, reshape all the results into the (scorers+2, idx_length, params_length) shape,
-    # and average them by the idx_length;
-    # idx_length is the number of cv folds; params_length is the number of parameter combinations
-    scores_test = [
-        np.reshape(
-            np.concatenate(
-                [cv_results_[f"split{cur_cv}_test_{score}"] for cur_cv in range(cross_validator_indices_length)]
-            ),
-            (cross_validator_indices_length, -1),
-        )
-        for score in scorers
-    ]
-    fit_score_test_matrix = np.stack(
-        [
-            np.reshape(cv_results_["mean_fit_time"], (cross_validator_indices_length, -1)),
-            np.reshape(cv_results_["mean_score_time"], (cross_validator_indices_length, -1)),
-        ]
-        + scores_test
-    )
-    mean_fit_score_test_matrix = np.mean(fit_score_test_matrix, axis=1)
-    std_fit_score_test_matrix = np.std(fit_score_test_matrix, axis=1)
-    if search_cv_kwargs.get("return_train_score", None):
-        scores_train = [
-            np.reshape(
-                np.concatenate(
-                    [cv_results_[f"split{cur_cv}_train_{score}"] for cur_cv in range(cross_validator_indices_length)]
-                ),
-                (cross_validator_indices_length, -1),
-            )
-            for score in scorers
-        ]
-        mean_fit_score_train_matrix = np.mean(scores_train, axis=1)
-        std_fit_score_train_matrix = np.std(scores_train, axis=1)
-    cv_results_["std_fit_time"] = std_fit_score_test_matrix[0]
-    cv_results_["mean_fit_time"] = mean_fit_score_test_matrix[0]
-    cv_results_["std_score_time"] = std_fit_score_test_matrix[1]
-    cv_results_["mean_score_time"] = mean_fit_score_test_matrix[1]
-    for idx, score in enumerate(scorers):
-        cv_results_[f"std_test_{score}"] = std_fit_score_test_matrix[idx + 2]
-        cv_results_[f"mean_test_{score}"] = mean_fit_score_test_matrix[idx + 2]
-        if search_cv_kwargs.get("return_train_score", None):
-            cv_results_[f"std_train_{score}"] = std_fit_score_train_matrix[idx]
-            cv_results_[f"mean_train_{score}"] = mean_fit_score_train_matrix[idx]
-        # re-compute the ranking again with mean_test_<score>.
-        cv_results_[f"rank_test_{score}"] = rankdata(-cv_results_[f"mean_test_{score}"], method="min")
-        # The best param is the highest ranking (which is 1) and we choose the first time ranking 1 appeared.
-        # If all scores are `nan`, `rankdata` will also produce an array of `nan` values.
-        # In that case, default to first index.
-        best_param_index = (
-            np.where(cv_results_[f"rank_test_{score}"] == 1)[0][0]
-            if not np.isnan(cv_results_[f"rank_test_{score}"]).all()
-            else 0
-        )
-    return multimetric, cv_results_, best_param_index, scorers
+                    for k, v in each_cv_result.items():
+                        if "split0_train_" in k:
+                            temp_dict["train_scores"][k[len("split0_train_") :]] = v
+            if isinstance(each_cv_result.get("split0_test_score"), np.ndarray):
+                temp_dict["test_scores"] = each_cv_result["split0_test_score"][0]
+            else:
+                temp_dict["test_scores"] = {}
+                for k, v in each_cv_result.items():
+                    if "split0_test_" in k:
+                        temp_dict["test_scores"][k[len("split0_test_") :]] = v
+            temp_dict["fit_time"] = each_cv_result["mean_fit_time"][0]
+            temp_dict["score_time"] = each_cv_result["mean_score_time"][0]
+            out.append(temp_dict)
+    first_test_score = out[0]["test_scores"]
+    multimetric = isinstance(first_test_score, dict)
+    return multimetric, estimator._format_results(param_grid, n_split, out)
 cp.register_pickle_by_value(inspect.getmodule(construct_cv_results))
@@ -288,7 +253,6 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 inspect.currentframe(), self.__class__.__name__
             ),
             api_calls=[sproc],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
         udtf_statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
@@ -297,7 +261,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 inspect.currentframe(), self.__class__.__name__
             ),
             api_calls=[udtf],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags=dict([("hpo_udtf", True)]),
         )
         # Put locally serialized estimator on stage.
@@ -375,8 +339,12 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 estimator = cp.load(local_estimator_file_obj)["estimator"]
             build_cross_validator = check_cv(estimator.cv, y, classifier=is_classifier(estimator.estimator))
+            from sklearn.utils.validation import indexable
+            X, y, _ = indexable(X, y, None)
+            n_splits = build_cross_validator.get_n_splits(X, y, None)
             # store the cross_validator's test indices only to save space
-            cross_validator_indices = [test for _, test in build_cross_validator.split(X, y)]
+            cross_validator_indices = [test for _, test in build_cross_validator.split(X, y, None)]
             local_indices_file_name = get_temp_file_path()
             with open(local_indices_file_name, mode="w+b") as local_indices_file_obj:
                 cp.dump(cross_validator_indices, local_indices_file_obj)
@@ -529,14 +497,14 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                     )
                 ),
             )
-            multimetric, cv_results_, best_param_index, scorers = construct_cv_results(
+            # multimetric, cv_results_, best_param_index, scorers
+            multimetric, cv_results_ = construct_cv_results(
+                estimator,
+                n_splits,
+                list(param_grid),
                 HP_raw_results.select("CV_RESULTS").sort(F.col("PARAM_CV_IND")).collect(),
                 cross_validator_indices_length,
                 parameter_grid_length,
-                {
-                    "return_train_score": estimator.return_train_score,
-                },  # TODO(xjiang): support more kwargs in here
             )
             estimator.cv_results_ = cv_results_
@@ -568,7 +536,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                     # With a non-custom callable, we can select the best score
                     # based on the best index
                     estimator.best_score_ = cv_results_[f"mean_test_{refit_metric}"][estimator.best_index_]
-                estimator.best_params_ = cv_results_["params"][best_param_index]
+                estimator.best_params_ = cv_results_["params"][estimator.best_index_]
             if original_refit:
                 estimator.best_estimator_ = clone(estimator.estimator).set_params(

snowflake-ml-python 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

snowflake-ml-python 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl