snowflake-ml-python 1.5.1__py3-none-any.whl → 1.5.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- snowflake/cortex/_complete.py +26 -5
- snowflake/cortex/_sentiment.py +7 -4
- snowflake/cortex/_sse_client.py +81 -0
- snowflake/cortex/_util.py +105 -8
- snowflake/ml/_internal/lineage/lineage_utils.py +34 -25
- snowflake/ml/_internal/utils/temp_file_utils.py +5 -2
- snowflake/ml/dataset/dataset.py +15 -12
- snowflake/ml/dataset/dataset_factory.py +3 -4
- snowflake/ml/feature_store/access_manager.py +34 -30
- snowflake/ml/feature_store/feature_store.py +3 -3
- snowflake/ml/feature_store/feature_view.py +12 -11
- snowflake/ml/fileset/snowfs.py +2 -31
- snowflake/ml/model/_client/ops/model_ops.py +43 -0
- snowflake/ml/model/_client/sql/model_version.py +55 -3
- snowflake/ml/model/_model_composer/model_composer.py +7 -3
- snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +3 -1
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +1 -3
- snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -27
- snowflake/ml/model/_signatures/builtins_handler.py +2 -1
- snowflake/ml/model/_signatures/core.py +13 -1
- snowflake/ml/model/_signatures/pandas_handler.py +2 -0
- snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
- snowflake/ml/model/model_signature.py +2 -0
- snowflake/ml/model/type_hints.py +1 -0
- snowflake/ml/modeling/_internal/estimator_utils.py +58 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +196 -242
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +161 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +38 -18
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +82 -134
- snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +21 -17
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +9 -2
- snowflake/ml/modeling/cluster/affinity_propagation.py +9 -2
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +9 -2
- snowflake/ml/modeling/cluster/birch.py +9 -2
- snowflake/ml/modeling/cluster/bisecting_k_means.py +9 -2
- snowflake/ml/modeling/cluster/dbscan.py +9 -2
- snowflake/ml/modeling/cluster/feature_agglomeration.py +9 -2
- snowflake/ml/modeling/cluster/k_means.py +9 -2
- snowflake/ml/modeling/cluster/mean_shift.py +9 -2
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +9 -2
- snowflake/ml/modeling/cluster/optics.py +9 -2
- snowflake/ml/modeling/cluster/spectral_biclustering.py +9 -2
- snowflake/ml/modeling/cluster/spectral_clustering.py +9 -2
- snowflake/ml/modeling/cluster/spectral_coclustering.py +9 -2
- snowflake/ml/modeling/compose/column_transformer.py +9 -2
- snowflake/ml/modeling/compose/transformed_target_regressor.py +9 -2
- snowflake/ml/modeling/covariance/elliptic_envelope.py +9 -2
- snowflake/ml/modeling/covariance/empirical_covariance.py +9 -2
- snowflake/ml/modeling/covariance/graphical_lasso.py +9 -2
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +9 -2
- snowflake/ml/modeling/covariance/ledoit_wolf.py +9 -2
- snowflake/ml/modeling/covariance/min_cov_det.py +9 -2
- snowflake/ml/modeling/covariance/oas.py +9 -2
- snowflake/ml/modeling/covariance/shrunk_covariance.py +9 -2
- snowflake/ml/modeling/decomposition/dictionary_learning.py +9 -2
- snowflake/ml/modeling/decomposition/factor_analysis.py +9 -2
- snowflake/ml/modeling/decomposition/fast_ica.py +9 -2
- snowflake/ml/modeling/decomposition/incremental_pca.py +9 -2
- snowflake/ml/modeling/decomposition/kernel_pca.py +9 -2
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +9 -2
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +9 -2
- snowflake/ml/modeling/decomposition/pca.py +9 -2
- snowflake/ml/modeling/decomposition/sparse_pca.py +9 -2
- snowflake/ml/modeling/decomposition/truncated_svd.py +9 -2
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +9 -2
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +9 -2
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +9 -2
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +9 -2
- snowflake/ml/modeling/ensemble/bagging_classifier.py +9 -2
- snowflake/ml/modeling/ensemble/bagging_regressor.py +9 -2
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +9 -2
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +9 -2
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +9 -2
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +9 -2
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +9 -2
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +9 -2
- snowflake/ml/modeling/ensemble/isolation_forest.py +9 -2
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +9 -2
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +9 -2
- snowflake/ml/modeling/ensemble/stacking_regressor.py +9 -2
- snowflake/ml/modeling/ensemble/voting_classifier.py +9 -2
- snowflake/ml/modeling/ensemble/voting_regressor.py +9 -2
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +9 -2
- snowflake/ml/modeling/feature_selection/select_fdr.py +9 -2
- snowflake/ml/modeling/feature_selection/select_fpr.py +9 -2
- snowflake/ml/modeling/feature_selection/select_fwe.py +9 -2
- snowflake/ml/modeling/feature_selection/select_k_best.py +9 -2
- snowflake/ml/modeling/feature_selection/select_percentile.py +9 -2
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +9 -2
- snowflake/ml/modeling/feature_selection/variance_threshold.py +9 -2
- snowflake/ml/modeling/framework/base.py +3 -8
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +9 -2
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +9 -2
- snowflake/ml/modeling/impute/iterative_imputer.py +9 -2
- snowflake/ml/modeling/impute/knn_imputer.py +9 -2
- snowflake/ml/modeling/impute/missing_indicator.py +9 -2
- snowflake/ml/modeling/impute/simple_imputer.py +28 -5
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +9 -2
- snowflake/ml/modeling/kernel_approximation/nystroem.py +9 -2
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +9 -2
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +9 -2
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +9 -2
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +9 -2
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +9 -2
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/ard_regression.py +9 -2
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +9 -2
- snowflake/ml/modeling/linear_model/elastic_net.py +9 -2
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +9 -2
- snowflake/ml/modeling/linear_model/gamma_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/huber_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/lars.py +9 -2
- snowflake/ml/modeling/linear_model/lars_cv.py +9 -2
- snowflake/ml/modeling/linear_model/lasso.py +9 -2
- snowflake/ml/modeling/linear_model/lasso_cv.py +9 -2
- snowflake/ml/modeling/linear_model/lasso_lars.py +9 -2
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +9 -2
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +9 -2
- snowflake/ml/modeling/linear_model/linear_regression.py +9 -2
- snowflake/ml/modeling/linear_model/logistic_regression.py +9 -2
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +9 -2
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +9 -2
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +9 -2
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +9 -2
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +9 -2
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +9 -2
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +9 -2
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/perceptron.py +9 -2
- snowflake/ml/modeling/linear_model/poisson_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/ransac_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/ridge.py +9 -2
- snowflake/ml/modeling/linear_model/ridge_classifier.py +9 -2
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +9 -2
- snowflake/ml/modeling/linear_model/ridge_cv.py +9 -2
- snowflake/ml/modeling/linear_model/sgd_classifier.py +9 -2
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +9 -2
- snowflake/ml/modeling/linear_model/sgd_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +9 -2
- snowflake/ml/modeling/manifold/isomap.py +9 -2
- snowflake/ml/modeling/manifold/mds.py +9 -2
- snowflake/ml/modeling/manifold/spectral_embedding.py +9 -2
- snowflake/ml/modeling/manifold/tsne.py +9 -2
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +9 -2
- snowflake/ml/modeling/mixture/gaussian_mixture.py +9 -2
- snowflake/ml/modeling/model_selection/grid_search_cv.py +1 -5
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +1 -5
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +9 -2
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +9 -2
- snowflake/ml/modeling/multiclass/output_code_classifier.py +9 -2
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +9 -2
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +9 -2
- snowflake/ml/modeling/naive_bayes/complement_nb.py +9 -2
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +9 -2
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +9 -2
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +9 -2
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +9 -2
- snowflake/ml/modeling/neighbors/kernel_density.py +9 -2
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +9 -2
- snowflake/ml/modeling/neighbors/nearest_centroid.py +9 -2
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +9 -2
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +9 -2
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +9 -2
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +9 -2
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +9 -2
- snowflake/ml/modeling/neural_network/mlp_classifier.py +9 -2
- snowflake/ml/modeling/neural_network/mlp_regressor.py +9 -2
- snowflake/ml/modeling/parameters/enable_anonymous_sproc.py +5 -0
- snowflake/ml/modeling/pipeline/pipeline.py +5 -0
- snowflake/ml/modeling/preprocessing/binarizer.py +7 -3
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +7 -2
- snowflake/ml/modeling/preprocessing/label_encoder.py +8 -7
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +7 -3
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +7 -4
- snowflake/ml/modeling/preprocessing/normalizer.py +7 -3
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +10 -2
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +8 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +9 -2
- snowflake/ml/modeling/preprocessing/robust_scaler.py +7 -4
- snowflake/ml/modeling/preprocessing/standard_scaler.py +7 -3
- snowflake/ml/modeling/semi_supervised/label_propagation.py +9 -2
- snowflake/ml/modeling/semi_supervised/label_spreading.py +9 -2
- snowflake/ml/modeling/svm/linear_svc.py +9 -2
- snowflake/ml/modeling/svm/linear_svr.py +9 -2
- snowflake/ml/modeling/svm/nu_svc.py +9 -2
- snowflake/ml/modeling/svm/nu_svr.py +9 -2
- snowflake/ml/modeling/svm/svc.py +9 -2
- snowflake/ml/modeling/svm/svr.py +9 -2
- snowflake/ml/modeling/tree/decision_tree_classifier.py +9 -2
- snowflake/ml/modeling/tree/decision_tree_regressor.py +9 -2
- snowflake/ml/modeling/tree/extra_tree_classifier.py +9 -2
- snowflake/ml/modeling/tree/extra_tree_regressor.py +9 -2
- snowflake/ml/modeling/xgboost/xgb_classifier.py +9 -2
- snowflake/ml/modeling/xgboost/xgb_regressor.py +9 -2
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +9 -2
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +9 -2
- snowflake/ml/registry/_manager/model_manager.py +59 -1
- snowflake/ml/registry/registry.py +10 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/METADATA +32 -4
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/RECORD +207 -204
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/top_level.txt +0 -0
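A recurring mechanical change visible in the file-level diff below is that the trainer code now imports the `temp_file_utils` module instead of its individual functions, and registers that module with cloudpickle so the helpers travel with code unpickled inside stored procedures and UDTFs. A minimal sketch of the pattern, assuming snowflake-ml-python is installed (the staging workflow around the two calls is illustrative, not the package's exact code):

```python
import inspect

import cloudpickle as cp

from snowflake.ml._internal.utils import temp_file_utils

# Registering the module (rather than a bare function) by value makes
# cloudpickle embed temp_file_utils itself in anything pickled here, so
# server-side sprocs/UDTFs do not need a matching client copy of the helper.
cp.register_pickle_by_value(inspect.getmodule(temp_file_utils.get_temp_file_path))

# Call sites then resolve the helpers through the module attribute:
estimator_file_name = temp_file_utils.get_temp_file_path()
# ... dump the estimator to estimator_file_name and upload it to a stage ...
temp_file_utils.cleanup_temp_files([estimator_file_name])
```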
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py:

@@ -4,11 +4,11 @@ import io
 import os
 import posixpath
 import sys
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+import uuid
+from typing import Any, Dict, List, Optional, Tuple, Union

 import cloudpickle as cp
 import numpy as np
-import numpy.typing as npt
 from sklearn import model_selection
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

@@ -17,10 +17,7 @@ from snowflake.ml._internal.utils import (
     identifier,
     pkg_version_utils,
     snowpark_dataframe_utils,
-)
-from snowflake.ml._internal.utils.temp_file_utils import (
-    cleanup_temp_files,
-    get_temp_file_path,
+    temp_file_utils,
 )
 from snowflake.ml.modeling._internal.model_specifications import (
     ModelSpecificationsBuilder,
@@ -36,14 +33,16 @@ from snowflake.snowpark._internal.utils import (
 from snowflake.snowpark.functions import sproc, udtf
 from snowflake.snowpark.row import Row
 from snowflake.snowpark.types import IntegerType, StringType, StructField, StructType
+from snowflake.snowpark.udtf import UDTFRegistration

-cp.register_pickle_by_value(inspect.getmodule(get_temp_file_path))
+cp.register_pickle_by_value(inspect.getmodule(temp_file_utils.get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
 cp.register_pickle_by_value(inspect.getmodule(snowpark_dataframe_utils.cast_snowpark_dataframe))

 _PROJECT = "ModelDevelopment"
 DEFAULT_UDTF_NJOBS = 3
 ENABLE_EFFICIENT_MEMORY_USAGE = False
+_UDTF_STAGE_NAME = f"MEMORY_EFFICIENT_UDTF_{str(uuid.uuid4()).replace('-', '_')}"


 def construct_cv_results(
@@ -318,7 +317,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         original_refit = estimator.refit

         # Create a temp file and dump the estimator to that file.
-        estimator_file_name = get_temp_file_path()
+        estimator_file_name = temp_file_utils.get_temp_file_path()
         params_to_evaluate = []
         for param_to_eval in list(param_grid):
             for k, v in param_to_eval.items():
@@ -357,6 +356,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         )
         estimator_location = put_result[0].target
         imports.append(f"@{temp_stage_name}/{estimator_location}")
+        temp_file_utils.cleanup_temp_files([estimator_file_name])

         search_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
         random_udtf_name = random_name_for_temp_object(TempObjectType.TABLE_FUNCTION)
@@ -413,7 +413,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             X = df[input_cols]
             y = df[label_cols].squeeze() if label_cols else None

-            local_estimator_file_name = get_temp_file_path()
+            local_estimator_file_name = temp_file_utils.get_temp_file_path()
             session.file.get(stage_estimator_file_name, local_estimator_file_name)

             local_estimator_file_path = os.path.join(
@@ -429,7 +429,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             n_splits = build_cross_validator.get_n_splits(X, y, None)
             # store the cross_validator's test indices only to save space
             cross_validator_indices = [test for _, test in build_cross_validator.split(X, y, None)]
-            local_indices_file_name = get_temp_file_path()
+            local_indices_file_name = temp_file_utils.get_temp_file_path()
             with open(local_indices_file_name, mode="w+b") as local_indices_file_obj:
                 cp.dump(cross_validator_indices, local_indices_file_obj)

@@ -445,6 +445,8 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         cross_validator_indices_length = int(len(cross_validator_indices))
         parameter_grid_length = len(param_grid)

+        temp_file_utils.cleanup_temp_files([local_estimator_file_name, local_indices_file_name])
+
         assert estimator is not None

         @cachetools.cached(cache={})
@@ -647,7 +649,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             if hasattr(estimator.best_estimator_, "feature_names_in_"):
                 estimator.feature_names_in_ = estimator.best_estimator_.feature_names_in_

-            local_result_file_name = get_temp_file_path()
+            local_result_file_name = temp_file_utils.get_temp_file_path()

             with open(local_result_file_name, mode="w+b") as local_result_file_obj:
                 cp.dump(estimator, local_result_file_obj)
@@ -658,6 +660,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 auto_compress=False,
                 overwrite=True,
             )
+            temp_file_utils.cleanup_temp_files([local_result_file_name])

             # Note: you can add something like + "|" + str(df) to the return string
             # to pass debug information to the caller.
@@ -671,7 +674,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             label_cols,
         )

-        local_estimator_path = get_temp_file_path()
+        local_estimator_path = temp_file_utils.get_temp_file_path()
         session.file.get(
             posixpath.join(temp_stage_name, sproc_export_file_name),
             local_estimator_path,
@@ -680,7 +683,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         with open(os.path.join(local_estimator_path, sproc_export_file_name), mode="r+b") as result_file_obj:
             fit_estimator = cp.load(result_file_obj)

-        cleanup_temp_files([local_estimator_path])
+        temp_file_utils.cleanup_temp_files([local_estimator_path])

         return fit_estimator

@@ -698,7 +701,6 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
     ) -> Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV]:
         from itertools import product

-        import cachetools
         from sklearn.base import clone, is_classifier
         from sklearn.calibration import check_cv

@@ -717,11 +719,13 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         imports = [f"@{row.name}" for row in session.sql(f"LIST @{temp_stage_name}/{dataset_file_name}").collect()]

         # Create a temp file and dump the estimator to that file.
-        estimator_file_name = get_temp_file_path()
+        estimator_file_name = temp_file_utils.get_temp_file_path()
         params_to_evaluate = list(param_grid)
-… (3 removed lines not captured in this view)
+        CONSTANTS: Dict[str, Any] = dict()
+        CONSTANTS["dataset_snowpark_cols"] = dataset.columns
+        CONSTANTS["n_candidates"] = len(params_to_evaluate)
+        CONSTANTS["_N_JOBS"] = estimator.n_jobs
+        CONSTANTS["_PRE_DISPATCH"] = estimator.pre_dispatch

         with open(estimator_file_name, mode="w+b") as local_estimator_file_obj:
             cp.dump(dict(estimator=estimator, param_grid=params_to_evaluate), local_estimator_file_obj)
@@ -743,6 +747,9 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             api_calls=[udtf],
             custom_tags=dict([("hpo_memory_efficient", True)]),
         )
+        from snowflake.ml.modeling._internal.snowpark_implementations.distributed_search_udf_file import (
+            execute_template,
+        )

         # Put locally serialized estimator on stage.
         session.file.put(
@@ -753,6 +760,8 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         )
         estimator_location = os.path.basename(estimator_file_name)
         imports.append(f"@{temp_stage_name}/{estimator_location}")
+        temp_file_utils.cleanup_temp_files([estimator_file_name])
+        CONSTANTS["estimator_location"] = estimator_location

         search_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
         random_udtf_name = random_name_for_temp_object(TempObjectType.FUNCTION)
@@ -783,7 +792,6 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         ) -> str:
             import os
             import time
-            from typing import Iterator

             import cloudpickle as cp
             import pandas as pd
@@ -819,7 +827,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             if sample_weight_col:
                 fit_params["sample_weight"] = df[sample_weight_col].squeeze()

-            local_estimator_file_folder_name = get_temp_file_path()
+            local_estimator_file_folder_name = temp_file_utils.get_temp_file_path()
             session.file.get(stage_estimator_file_name, local_estimator_file_folder_name)

             local_estimator_file_path = os.path.join(
@@ -865,7 +873,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):

             # (1) store the cross_validator's test indices only to save space
             cross_validator_indices = [test for _, test in build_cross_validator.split(X, y, None)]
-            local_indices_file_name = get_temp_file_path()
+            local_indices_file_name = temp_file_utils.get_temp_file_path()
             with open(local_indices_file_name, mode="w+b") as local_indices_file_obj:
                 cp.dump(cross_validator_indices, local_indices_file_obj)

@@ -880,7 +888,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             imports.append(f"@{temp_stage_name}/{indices_location}")

             # (2) store the base estimator
-            local_base_estimator_file_name = get_temp_file_path()
+            local_base_estimator_file_name = temp_file_utils.get_temp_file_path()
             with open(local_base_estimator_file_name, mode="w+b") as local_base_estimator_file_obj:
                 cp.dump(base_estimator, local_base_estimator_file_obj)
             session.file.put(
@@ -893,7 +901,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             imports.append(f"@{temp_stage_name}/{base_estimator_location}")

             # (3) store the fit_and_score_kwargs
-            local_fit_and_score_kwargs_file_name = get_temp_file_path()
+            local_fit_and_score_kwargs_file_name = temp_file_utils.get_temp_file_path()
             with open(local_fit_and_score_kwargs_file_name, mode="w+b") as local_fit_and_score_kwargs_file_obj:
                 cp.dump(fit_and_score_kwargs, local_fit_and_score_kwargs_file_obj)
             session.file.put(
@@ -905,242 +913,188 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             fit_and_score_kwargs_location = os.path.basename(local_fit_and_score_kwargs_file_name)
             imports.append(f"@{temp_stage_name}/{fit_and_score_kwargs_location}")

-… (4 removed lines not captured in this view)
+            CONSTANTS["input_cols"] = input_cols
+            CONSTANTS["label_cols"] = label_cols
+            CONSTANTS["DATA_LENGTH"] = DATA_LENGTH
+            CONSTANTS["n_splits"] = n_splits
+            CONSTANTS["indices_location"] = indices_location
+            CONSTANTS["base_estimator_location"] = base_estimator_location
+            CONSTANTS["fit_and_score_kwargs_location"] = fit_and_score_kwargs_location

-… (4 removed lines not captured in this view)
-                List[List[int]],
-                List[Dict[str, Any]],
-                object,
-                Dict[str, Any],
-            ]:
-                import pyarrow.parquet as pq
+            # (6) store the constants
+            local_constant_file_name = temp_file_utils.get_temp_file_path(prefix="constant")
+            with open(local_constant_file_name, mode="w+b") as local_indices_file_obj:
+                cp.dump(CONSTANTS, local_indices_file_obj)

-… (8 removed lines not captured in this view)
+            # Put locally serialized indices on stage.
+            session.file.put(
+                local_constant_file_name,
+                temp_stage_name,
+                auto_compress=False,
+                overwrite=True,
+            )
+            constant_location = os.path.basename(local_constant_file_name)
+            imports.append(f"@{temp_stage_name}/{constant_location}")
+
+            temp_file_utils.cleanup_temp_files(
+                [
+                    local_estimator_file_folder_name,
+                    local_indices_file_name,
+                    local_base_estimator_file_name,
+                    local_base_estimator_file_name,
+                    local_fit_and_score_kwargs_file_name,
+                    local_constant_file_name,
                 ]
-… (1 removed line not captured in this view)
-                df.columns = [identifier.get_inferred_name(col_) for col_ in df.columns]
+            )

-
-                local_estimator_file_path = os.path.join(
-                    sys._xoptions["snowflake_import_directory"], f"{estimator_location}"
-                )
-                with open(local_estimator_file_path, mode="rb") as local_estimator_file_obj:
-                    estimator_objects = cp.load(local_estimator_file_obj)
-                params_to_evaluate = estimator_objects["param_grid"]
+            cross_validator_indices_length = int(len(cross_validator_indices))
+            parameter_grid_length = len(param_grid)

-
-                local_indices_file_path = os.path.join(
-                    sys._xoptions["snowflake_import_directory"], f"{indices_location}"
-                )
-                with open(local_indices_file_path, mode="rb") as local_indices_file_obj:
-                    indices = cp.load(local_indices_file_obj)
+            assert estimator is not None

-… (6 removed lines not captured in this view)
+            # Instantiate UDTFRegistration with the session object
+            udtf_registration = UDTFRegistration(session)
+
+            import tempfile
+
+            # delete is set to False to support Windows environment
+            with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as f:
+                udf_code = execute_template
+                f.file.write(udf_code)
+                f.file.flush()
+
+            # Use catchall exception handling and a finally block to clean up the _UDTF_STAGE_NAME
+            try:
+                # Create one stage for data and for estimators.
+                # Because only permanent functions support _sf_node_singleton for now, therefore,
+                # UDTF creation would change to is_permanent=True, and manually drop the stage after UDTF is done
+                _stage_creation_query_udtf = f"CREATE OR REPLACE STAGE {_UDTF_STAGE_NAME};"
+                session.sql(_stage_creation_query_udtf).collect()
+
+                # Register the UDTF function from the file
+                udtf_registration.register_from_file(
+                    file_path=f.name,
+                    handler_name="SearchCV",
+                    name=random_udtf_name,
+                    output_schema=StructType(
+                        [StructField("FIRST_IDX", IntegerType()), StructField("EACH_CV_RESULTS", StringType())]
+                    ),
+                    input_types=[IntegerType(), IntegerType(), IntegerType()],
+                    replace=True,
+                    imports=imports,  # type: ignore[arg-type]
+                    stage_location=_UDTF_STAGE_NAME,
+                    is_permanent=True,
+                    packages=required_deps,  # type: ignore[arg-type]
+                    statement_params=udtf_statement_params,
+                )

-
-                local_fit_and_score_kwargs_file_path = os.path.join(
-                    sys._xoptions["snowflake_import_directory"], f"{fit_and_score_kwargs_location}"
-                )
-                with open(local_fit_and_score_kwargs_file_path, mode="rb") as local_fit_and_score_kwargs_file_obj:
-                    fit_and_score_kwargs = cp.load(local_fit_and_score_kwargs_file_obj)
-
-                # convert dataframe to numpy would save memory consumption
-                return (
-                    df[input_cols].to_numpy(),
-                    df[label_cols].squeeze().to_numpy(),
-                    indices,
-                    params_to_evaluate,
-                    base_estimator,
-                    fit_and_score_kwargs,
-                )
+                HP_TUNING = F.table_function(random_udtf_name)

-… (5 removed lines not captured in this view)
-                    self.y = y
-                    self.test_indices = indices
-                    self.params_to_evaluate = params_to_evaluate
-                    self.base_estimator = base_estimator
-                    self.fit_and_score_kwargs = fit_and_score_kwargs
-                    self.fit_score_params: List[Any] = []
-                    self.cv_indices_set: Set[int] = set()
-
-                def process(self, idx: int, params_idx: int, cv_idx: int) -> None:
-                    self.fit_score_params.extend([[idx, params_idx, cv_idx]])
-                    self.cv_indices_set.add(cv_idx)
-
-                def end_partition(self) -> Iterator[Tuple[int, str]]:
-                    from sklearn.base import clone
-                    from sklearn.model_selection._validation import _fit_and_score
-                    from sklearn.utils.parallel import Parallel, delayed
-
-                    cached_train_test_indices = {}
-                    # Calculate the full index here to avoid duplicate calculation (which consumes a lot of memory)
-                    full_index = np.arange(DATA_LENGTH)
-                    for i in self.cv_indices_set:
-                        cached_train_test_indices[i] = [
-                            np.setdiff1d(full_index, self.test_indices[i]),
-                            self.test_indices[i],
-                        ]
-
-                    parallel = Parallel(n_jobs=_N_JOBS, pre_dispatch=_PRE_DISPATCH)
-
-                    out = parallel(
-                        delayed(_fit_and_score)(
-                            clone(self.base_estimator),
-                            self.X,
-                            self.y,
-                            train=cached_train_test_indices[split_idx][0],
-                            test=cached_train_test_indices[split_idx][1],
-                            parameters=self.params_to_evaluate[cand_idx],
-                            split_progress=(split_idx, n_splits),
-                            candidate_progress=(cand_idx, n_candidates),
-                            **self.fit_and_score_kwargs,  # load sample weight here
-                        )
-                        for _, cand_idx, split_idx in self.fit_score_params
+                # param_indices is for the index for each parameter grid;
+                # cv_indices is for the index for each cross_validator's fold;
+                # param_cv_indices is for the index for the product of (len(param_indices) * len(cv_indices))
+                cv_indices, param_indices = zip(
+                    *product(range(cross_validator_indices_length), range(parameter_grid_length))
                )

-… (8 removed lines not captured in this view)
+                indices_info_pandas = pd.DataFrame(
+                    {
+                        "IDX": [
+                            i // _NUM_CPUs for i in range(parameter_grid_length * cross_validator_indices_length)
+                        ],
+                        "PARAM_IND": param_indices,
+                        "CV_IND": cv_indices,
+                    }
                )

-… (8 removed lines not captured in this view)
-                    replace=True,
-                    is_permanent=False,
-                    imports=imports,  # type: ignore[arg-type]
-                    statement_params=udtf_statement_params,
-                )
-
-                HP_TUNING = F.table_function(random_udtf_name)
-
-                # param_indices is for the index for each parameter grid;
-                # cv_indices is for the index for each cross_validator's fold;
-                # param_cv_indices is for the index for the product of (len(param_indices) * len(cv_indices))
-                cv_indices, param_indices = zip(
-                    *product(range(cross_validator_indices_length), range(parameter_grid_length))
-                )
-
-                indices_info_pandas = pd.DataFrame(
-                    {
-                        "IDX": [i // _NUM_CPUs for i in range(parameter_grid_length * cross_validator_indices_length)],
-                        "PARAM_IND": param_indices,
-                        "CV_IND": cv_indices,
-                    }
-                )
-
-                indices_info_sp = session.create_dataframe(indices_info_pandas)
-                # execute udtf by querying HP_TUNING table
-                HP_raw_results = indices_info_sp.select(
-                    (
-                        HP_TUNING(indices_info_sp["IDX"], indices_info_sp["PARAM_IND"], indices_info_sp["CV_IND"]).over(
-                            partition_by="IDX"
+                indices_info_sp = session.create_dataframe(indices_info_pandas)
+                # execute udtf by querying HP_TUNING table
+                HP_raw_results = indices_info_sp.select(
+                    (
+                        HP_TUNING(
+                            indices_info_sp["IDX"], indices_info_sp["PARAM_IND"], indices_info_sp["CV_IND"]
+                        ).over(partition_by="IDX")
+                    ),
                )
-                    ),
-                )
-
-                first_test_score, cv_results_ = construct_cv_results_memory_efficient_version(
-                    estimator,
-                    n_splits,
-                    list(param_grid),
-                    HP_raw_results.select("EACH_CV_RESULTS").sort(F.col("FIRST_IDX")).collect(),
-                    cross_validator_indices_length,
-                    parameter_grid_length,
-                )
-
-                estimator.cv_results_ = cv_results_
-                estimator.multimetric_ = isinstance(first_test_score, dict)
-
-                # check refit_metric now for a callable scorer that is multimetric
-                if callable(estimator.scoring) and estimator.multimetric_:
-                    estimator._check_refit_for_multimetric(first_test_score)
-                    refit_metric = estimator.refit
-
-                # For multi-metric evaluation, store the best_index_, best_params_ and
-                # best_score_ iff refit is one of the scorer names
-                # In single metric evaluation, refit_metric is "score"
-                if estimator.refit or not estimator.multimetric_:
-                    estimator.best_index_ = estimator._select_best_index(estimator.refit, refit_metric, cv_results_)
-                    if not callable(estimator.refit):
-                        # With a non-custom callable, we can select the best score
-                        # based on the best index
-                        estimator.best_score_ = cv_results_[f"mean_test_{refit_metric}"][estimator.best_index_]
-                    estimator.best_params_ = cv_results_["params"][estimator.best_index_]
-
-                if estimator.refit:
-                    estimator.best_estimator_ = clone(base_estimator).set_params(
-                        **clone(estimator.best_params_, safe=False)
-                    )

-                # Let the sproc use all cores to refit.
-                estimator.n_jobs = estimator.n_jobs or -1
-
-                # process the input as args
-                argspec = inspect.getfullargspec(estimator.fit)
-                args = {"X": X}
-                if label_cols:
-                    label_arg_name = "Y" if "Y" in argspec.args else "y"
-                    args[label_arg_name] = y
-                if sample_weight_col is not None and "sample_weight" in argspec.args:
-                    args["sample_weight"] = df[sample_weight_col].squeeze()
-                # estimator.refit = original_refit
-                refit_start_time = time.time()
-                estimator.best_estimator_.fit(**args)
-                refit_end_time = time.time()
-                estimator.refit_time_ = refit_end_time - refit_start_time
+                first_test_score, cv_results_ = construct_cv_results_memory_efficient_version(
+                    estimator,
+                    n_splits,
+                    list(param_grid),
+                    HP_raw_results.select("EACH_CV_RESULTS").sort(F.col("FIRST_IDX")).collect(),
+                    cross_validator_indices_length,
+                    parameter_grid_length,
+                )

-                if hasattr(estimator.best_estimator_, "feature_names_in_"):
-                    estimator.feature_names_in_ = estimator.best_estimator_.feature_names_in_
+                estimator.cv_results_ = cv_results_
+                estimator.multimetric_ = isinstance(first_test_score, dict)
+
+                # check refit_metric now for a callable scorer that is multimetric
+                if callable(estimator.scoring) and estimator.multimetric_:
+                    estimator._check_refit_for_multimetric(first_test_score)
+                    refit_metric = estimator.refit
+
+                # For multi-metric evaluation, store the best_index_, best_params_ and
+                # best_score_ iff refit is one of the scorer names
+                # In single metric evaluation, refit_metric is "score"
+                if estimator.refit or not estimator.multimetric_:
+                    estimator.best_index_ = estimator._select_best_index(estimator.refit, refit_metric, cv_results_)
+                    if not callable(estimator.refit):
+                        # With a non-custom callable, we can select the best score
+                        # based on the best index
+                        estimator.best_score_ = cv_results_[f"mean_test_{refit_metric}"][estimator.best_index_]
+                    estimator.best_params_ = cv_results_["params"][estimator.best_index_]
+
+                if estimator.refit:
+                    estimator.best_estimator_ = clone(base_estimator).set_params(
+                        **clone(estimator.best_params_, safe=False)
+                    )

-                # Store the only scorer not as a dict for single metric evaluation
-                estimator.scorer_ = scorers
-                estimator.n_splits_ = n_splits
+                # Let the sproc use all cores to refit.
+                estimator.n_jobs = estimator.n_jobs or -1
+
+                # process the input as args
+                argspec = inspect.getfullargspec(estimator.fit)
+                args = {"X": X}
+                if label_cols:
+                    label_arg_name = "Y" if "Y" in argspec.args else "y"
+                    args[label_arg_name] = y
+                if sample_weight_col is not None and "sample_weight" in argspec.args:
+                    args["sample_weight"] = df[sample_weight_col].squeeze()
+                # estimator.refit = original_refit
+                refit_start_time = time.time()
+                estimator.best_estimator_.fit(**args)
+                refit_end_time = time.time()
+                estimator.refit_time_ = refit_end_time - refit_start_time
+
+                if hasattr(estimator.best_estimator_, "feature_names_in_"):
+                    estimator.feature_names_in_ = estimator.best_estimator_.feature_names_in_
+
+                # Store the only scorer not as a dict for single metric evaluation
+                estimator.scorer_ = scorers
+                estimator.n_splits_ = n_splits
+
+                local_result_file_name = temp_file_utils.get_temp_file_path()
+
+                with open(local_result_file_name, mode="w+b") as local_result_file_obj:
+                    cp.dump(estimator, local_result_file_obj)
+
+                session.file.put(
+                    local_result_file_name,
+                    temp_stage_name,
+                    auto_compress=False,
+                    overwrite=True,
+                )

-            local_result_file_name = get_temp_file_path()
+                # Clean up the stages and files
+                session.sql(f"DROP STAGE IF EXISTS {_UDTF_STAGE_NAME}")

-            with open(local_result_file_name, mode="w+b") as local_result_file_obj:
-                cp.dump(estimator, local_result_file_obj)
+                temp_file_utils.cleanup_temp_files([local_result_file_name])

-            session.file.put(
-                local_result_file_name,
-                temp_stage_name,
-                auto_compress=False,
-                overwrite=True,
-            )
-
-            return str(os.path.basename(local_result_file_name))
+                return str(os.path.basename(local_result_file_name))
+            finally:
+                # Clean up the stages
+                session.sql(f"DROP STAGE IF EXISTS {_UDTF_STAGE_NAME}")

         sproc_export_file_name = _distributed_search(
             session,
@@ -1150,7 +1104,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             label_cols,
         )

-        local_estimator_path = get_temp_file_path()
+        local_estimator_path = temp_file_utils.get_temp_file_path()
         session.file.get(
             posixpath.join(temp_stage_name, sproc_export_file_name),
             local_estimator_path,
@@ -1159,7 +1113,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         with open(os.path.join(local_estimator_path, sproc_export_file_name), mode="r+b") as result_file_obj:
             fit_estimator = cp.load(result_file_obj)

-        cleanup_temp_files(local_estimator_path)
+        temp_file_utils.cleanup_temp_files(local_estimator_path)

         return fit_estimator

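The substantive change in the memory-efficient HPO path above is how the search UDTF is created: instead of registering an anonymous UDTF inline, 1.5.3 writes a handler template (`execute_template` from the new distributed_search_udf_file.py) to a temporary `.py` file, registers it as a permanent UDTF backed by a one-off stage (`_UDTF_STAGE_NAME`), and drops that stage in a `finally` block. A stripped-down sketch of the same pattern, assuming an open Snowpark `session`; `HANDLER_SOURCE`, `run_distributed_search`, and the demo input rows are illustrative stand-ins, not the package's actual template:

```python
import tempfile
import uuid

from snowflake.snowpark import Session, functions as F
from snowflake.snowpark.types import IntegerType, StringType, StructField, StructType
from snowflake.snowpark.udtf import UDTFRegistration

# Illustrative stand-in for the handler source shipped as execute_template.
HANDLER_SOURCE = '''
class SearchCV:
    def process(self, idx: int, params_idx: int, cv_idx: int):
        # A real handler would run _fit_and_score here; this one echoes its inputs.
        yield (idx, f"params_idx={params_idx}, cv_idx={cv_idx}")
'''


def run_distributed_search(session: Session, udtf_name: str) -> list:
    stage_name = f"MEMORY_EFFICIENT_UDTF_{str(uuid.uuid4()).replace('-', '_')}"

    # delete=False so the file can be re-opened by name on Windows.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(HANDLER_SOURCE)

    try:
        # Permanent UDTFs need a stage for their code, so create a throwaway one.
        session.sql(f"CREATE OR REPLACE STAGE {stage_name}").collect()
        UDTFRegistration(session).register_from_file(
            file_path=f.name,
            handler_name="SearchCV",
            name=udtf_name,
            output_schema=StructType(
                [StructField("FIRST_IDX", IntegerType()), StructField("EACH_CV_RESULTS", StringType())]
            ),
            input_types=[IntegerType(), IntegerType(), IntegerType()],
            replace=True,
            stage_location=stage_name,
            is_permanent=True,
        )

        # Query the UDTF while its stage still exists, partitioning work by IDX
        # the same way the real code fans out (param_idx, cv_idx) pairs.
        hp_tuning = F.table_function(udtf_name)
        work = session.create_dataframe(
            [(0, 0, 0), (0, 1, 0), (1, 0, 1), (1, 1, 1)], schema=["IDX", "PARAM_IND", "CV_IND"]
        )
        return work.select(
            hp_tuning(work["IDX"], work["PARAM_IND"], work["CV_IND"]).over(partition_by="IDX")
        ).collect()
    finally:
        # Dropping the stage also removes the registered UDTF's code, mirroring
        # the cleanup the diff performs on both the success and failure paths.
        session.sql(f"DROP STAGE IF EXISTS {stage_name}").collect()
```

Collecting the results inside the `try` block matters: Snowpark dataframes are lazy, so the query must run before the stage holding the UDTF's code is dropped.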