snowflake-ml-python 1.5.0__py3-none-any.whl → 1.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. snowflake/cortex/_sentiment.py +7 -4
  2. snowflake/ml/_internal/env_utils.py +6 -0
  3. snowflake/ml/_internal/lineage/lineage_utils.py +95 -0
  4. snowflake/ml/_internal/telemetry.py +1 -0
  5. snowflake/ml/_internal/utils/identifier.py +1 -1
  6. snowflake/ml/_internal/utils/sql_identifier.py +14 -1
  7. snowflake/ml/_internal/utils/temp_file_utils.py +5 -2
  8. snowflake/ml/dataset/__init__.py +2 -1
  9. snowflake/ml/dataset/dataset.py +4 -3
  10. snowflake/ml/dataset/dataset_reader.py +5 -8
  11. snowflake/ml/feature_store/__init__.py +6 -0
  12. snowflake/ml/feature_store/access_manager.py +283 -0
  13. snowflake/ml/feature_store/feature_store.py +160 -100
  14. snowflake/ml/feature_store/feature_view.py +30 -19
  15. snowflake/ml/fileset/embedded_stage_fs.py +15 -12
  16. snowflake/ml/fileset/snowfs.py +2 -30
  17. snowflake/ml/fileset/stage_fs.py +25 -7
  18. snowflake/ml/model/_client/model/model_impl.py +46 -39
  19. snowflake/ml/model/_client/model/model_version_impl.py +24 -2
  20. snowflake/ml/model/_client/ops/metadata_ops.py +27 -4
  21. snowflake/ml/model/_client/ops/model_ops.py +174 -16
  22. snowflake/ml/model/_client/sql/_base.py +34 -0
  23. snowflake/ml/model/_client/sql/model.py +32 -39
  24. snowflake/ml/model/_client/sql/model_version.py +111 -42
  25. snowflake/ml/model/_client/sql/stage.py +6 -32
  26. snowflake/ml/model/_client/sql/tag.py +32 -56
  27. snowflake/ml/model/_model_composer/model_composer.py +8 -4
  28. snowflake/ml/model/_packager/model_handlers/mlflow.py +2 -1
  29. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -3
  30. snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -27
  31. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +90 -142
  32. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +159 -0
  33. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +81 -3
  34. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +8 -1
  35. snowflake/ml/modeling/cluster/affinity_propagation.py +8 -1
  36. snowflake/ml/modeling/cluster/agglomerative_clustering.py +8 -1
  37. snowflake/ml/modeling/cluster/birch.py +8 -1
  38. snowflake/ml/modeling/cluster/bisecting_k_means.py +8 -1
  39. snowflake/ml/modeling/cluster/dbscan.py +8 -1
  40. snowflake/ml/modeling/cluster/feature_agglomeration.py +8 -1
  41. snowflake/ml/modeling/cluster/k_means.py +8 -1
  42. snowflake/ml/modeling/cluster/mean_shift.py +8 -1
  43. snowflake/ml/modeling/cluster/mini_batch_k_means.py +8 -1
  44. snowflake/ml/modeling/cluster/optics.py +8 -1
  45. snowflake/ml/modeling/cluster/spectral_biclustering.py +8 -1
  46. snowflake/ml/modeling/cluster/spectral_clustering.py +8 -1
  47. snowflake/ml/modeling/cluster/spectral_coclustering.py +8 -1
  48. snowflake/ml/modeling/compose/column_transformer.py +8 -1
  49. snowflake/ml/modeling/compose/transformed_target_regressor.py +8 -1
  50. snowflake/ml/modeling/covariance/elliptic_envelope.py +8 -1
  51. snowflake/ml/modeling/covariance/empirical_covariance.py +8 -1
  52. snowflake/ml/modeling/covariance/graphical_lasso.py +8 -1
  53. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +8 -1
  54. snowflake/ml/modeling/covariance/ledoit_wolf.py +8 -1
  55. snowflake/ml/modeling/covariance/min_cov_det.py +8 -1
  56. snowflake/ml/modeling/covariance/oas.py +8 -1
  57. snowflake/ml/modeling/covariance/shrunk_covariance.py +8 -1
  58. snowflake/ml/modeling/decomposition/dictionary_learning.py +8 -1
  59. snowflake/ml/modeling/decomposition/factor_analysis.py +8 -1
  60. snowflake/ml/modeling/decomposition/fast_ica.py +8 -1
  61. snowflake/ml/modeling/decomposition/incremental_pca.py +8 -1
  62. snowflake/ml/modeling/decomposition/kernel_pca.py +8 -1
  63. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +8 -1
  64. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +8 -1
  65. snowflake/ml/modeling/decomposition/pca.py +8 -1
  66. snowflake/ml/modeling/decomposition/sparse_pca.py +8 -1
  67. snowflake/ml/modeling/decomposition/truncated_svd.py +8 -1
  68. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +8 -1
  69. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +8 -1
  70. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +8 -1
  71. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +8 -1
  72. snowflake/ml/modeling/ensemble/bagging_classifier.py +8 -1
  73. snowflake/ml/modeling/ensemble/bagging_regressor.py +8 -1
  74. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +8 -1
  75. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +8 -1
  76. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +8 -1
  77. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +8 -1
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +8 -1
  79. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +8 -1
  80. snowflake/ml/modeling/ensemble/isolation_forest.py +8 -1
  81. snowflake/ml/modeling/ensemble/random_forest_classifier.py +8 -1
  82. snowflake/ml/modeling/ensemble/random_forest_regressor.py +8 -1
  83. snowflake/ml/modeling/ensemble/stacking_regressor.py +8 -1
  84. snowflake/ml/modeling/ensemble/voting_classifier.py +8 -1
  85. snowflake/ml/modeling/ensemble/voting_regressor.py +8 -1
  86. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +8 -1
  87. snowflake/ml/modeling/feature_selection/select_fdr.py +8 -1
  88. snowflake/ml/modeling/feature_selection/select_fpr.py +8 -1
  89. snowflake/ml/modeling/feature_selection/select_fwe.py +8 -1
  90. snowflake/ml/modeling/feature_selection/select_k_best.py +8 -1
  91. snowflake/ml/modeling/feature_selection/select_percentile.py +8 -1
  92. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +8 -1
  93. snowflake/ml/modeling/feature_selection/variance_threshold.py +8 -1
  94. snowflake/ml/modeling/framework/base.py +4 -3
  95. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +8 -1
  96. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +8 -1
  97. snowflake/ml/modeling/impute/iterative_imputer.py +8 -1
  98. snowflake/ml/modeling/impute/knn_imputer.py +8 -1
  99. snowflake/ml/modeling/impute/missing_indicator.py +8 -1
  100. snowflake/ml/modeling/impute/simple_imputer.py +21 -2
  101. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +8 -1
  102. snowflake/ml/modeling/kernel_approximation/nystroem.py +8 -1
  103. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +8 -1
  104. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +8 -1
  105. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +8 -1
  106. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +8 -1
  107. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +8 -1
  108. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +8 -1
  109. snowflake/ml/modeling/linear_model/ard_regression.py +8 -1
  110. snowflake/ml/modeling/linear_model/bayesian_ridge.py +8 -1
  111. snowflake/ml/modeling/linear_model/elastic_net.py +8 -1
  112. snowflake/ml/modeling/linear_model/elastic_net_cv.py +8 -1
  113. snowflake/ml/modeling/linear_model/gamma_regressor.py +8 -1
  114. snowflake/ml/modeling/linear_model/huber_regressor.py +8 -1
  115. snowflake/ml/modeling/linear_model/lars.py +8 -1
  116. snowflake/ml/modeling/linear_model/lars_cv.py +8 -1
  117. snowflake/ml/modeling/linear_model/lasso.py +8 -1
  118. snowflake/ml/modeling/linear_model/lasso_cv.py +8 -1
  119. snowflake/ml/modeling/linear_model/lasso_lars.py +8 -1
  120. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +8 -1
  121. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +8 -1
  122. snowflake/ml/modeling/linear_model/linear_regression.py +8 -1
  123. snowflake/ml/modeling/linear_model/logistic_regression.py +8 -1
  124. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +8 -1
  125. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +8 -1
  126. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +8 -1
  127. snowflake/ml/modeling/linear_model/multi_task_lasso.py +8 -1
  128. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +8 -1
  129. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +8 -1
  130. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +8 -1
  131. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +8 -1
  132. snowflake/ml/modeling/linear_model/perceptron.py +8 -1
  133. snowflake/ml/modeling/linear_model/poisson_regressor.py +8 -1
  134. snowflake/ml/modeling/linear_model/ransac_regressor.py +8 -1
  135. snowflake/ml/modeling/linear_model/ridge.py +8 -1
  136. snowflake/ml/modeling/linear_model/ridge_classifier.py +8 -1
  137. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +8 -1
  138. snowflake/ml/modeling/linear_model/ridge_cv.py +8 -1
  139. snowflake/ml/modeling/linear_model/sgd_classifier.py +8 -1
  140. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +8 -1
  141. snowflake/ml/modeling/linear_model/sgd_regressor.py +8 -1
  142. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +8 -1
  143. snowflake/ml/modeling/linear_model/tweedie_regressor.py +8 -1
  144. snowflake/ml/modeling/manifold/isomap.py +8 -1
  145. snowflake/ml/modeling/manifold/mds.py +8 -1
  146. snowflake/ml/modeling/manifold/spectral_embedding.py +8 -1
  147. snowflake/ml/modeling/manifold/tsne.py +8 -1
  148. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +8 -1
  149. snowflake/ml/modeling/mixture/gaussian_mixture.py +8 -1
  150. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +8 -1
  151. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +8 -1
  152. snowflake/ml/modeling/multiclass/output_code_classifier.py +8 -1
  153. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +8 -1
  154. snowflake/ml/modeling/naive_bayes/categorical_nb.py +8 -1
  155. snowflake/ml/modeling/naive_bayes/complement_nb.py +8 -1
  156. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +8 -1
  157. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +8 -1
  158. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +8 -1
  159. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +8 -1
  160. snowflake/ml/modeling/neighbors/kernel_density.py +8 -1
  161. snowflake/ml/modeling/neighbors/local_outlier_factor.py +8 -1
  162. snowflake/ml/modeling/neighbors/nearest_centroid.py +8 -1
  163. snowflake/ml/modeling/neighbors/nearest_neighbors.py +8 -1
  164. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +8 -1
  165. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +8 -1
  166. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +8 -1
  167. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +8 -1
  168. snowflake/ml/modeling/neural_network/mlp_classifier.py +8 -1
  169. snowflake/ml/modeling/neural_network/mlp_regressor.py +8 -1
  170. snowflake/ml/modeling/parameters/enable_anonymous_sproc.py +5 -0
  171. snowflake/ml/modeling/pipeline/pipeline.py +27 -7
  172. snowflake/ml/modeling/preprocessing/polynomial_features.py +8 -1
  173. snowflake/ml/modeling/semi_supervised/label_propagation.py +8 -1
  174. snowflake/ml/modeling/semi_supervised/label_spreading.py +8 -1
  175. snowflake/ml/modeling/svm/linear_svc.py +8 -1
  176. snowflake/ml/modeling/svm/linear_svr.py +8 -1
  177. snowflake/ml/modeling/svm/nu_svc.py +8 -1
  178. snowflake/ml/modeling/svm/nu_svr.py +8 -1
  179. snowflake/ml/modeling/svm/svc.py +8 -1
  180. snowflake/ml/modeling/svm/svr.py +8 -1
  181. snowflake/ml/modeling/tree/decision_tree_classifier.py +8 -1
  182. snowflake/ml/modeling/tree/decision_tree_regressor.py +8 -1
  183. snowflake/ml/modeling/tree/extra_tree_classifier.py +8 -1
  184. snowflake/ml/modeling/tree/extra_tree_regressor.py +8 -1
  185. snowflake/ml/modeling/xgboost/xgb_classifier.py +8 -1
  186. snowflake/ml/modeling/xgboost/xgb_regressor.py +8 -1
  187. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +8 -1
  188. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +8 -1
  189. snowflake/ml/registry/_manager/model_manager.py +95 -8
  190. snowflake/ml/registry/registry.py +10 -1
  191. snowflake/ml/version.py +1 -1
  192. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/METADATA +66 -10
  193. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/RECORD +196 -192
  194. snowflake/ml/_internal/lineage/dataset_dataframe.py +0 -44
  195. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/LICENSE.txt +0 -0
  196. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/WHEEL +0 -0
  197. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 
 import cloudpickle as cp
 import numpy as np
-import numpy.typing as npt
 from sklearn import model_selection
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 
@@ -36,6 +35,7 @@ from snowflake.snowpark._internal.utils import (
 from snowflake.snowpark.functions import sproc, udtf
 from snowflake.snowpark.row import Row
 from snowflake.snowpark.types import IntegerType, StringType, StructField, StructType
+from snowflake.snowpark.udtf import UDTFRegistration
 
 cp.register_pickle_by_value(inspect.getmodule(get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
@@ -154,7 +154,7 @@ def construct_cv_results(
     return multimetric, estimator._format_results(param_grid, n_split, out)
 
 
-def construct_cv_results_new_implementation(
+def construct_cv_results_memory_efficient_version(
    estimator: Union[GridSearchCV, RandomizedSearchCV],
    n_split: int,
    param_grid: List[Dict[str, Any]],
@@ -205,12 +205,35 @@ def construct_cv_results_new_implementation(
         with io.BytesIO(hex_str) as f_reload:
             out = cp.load(f_reload)
             all_out.extend(out)
+
+    # The original SearchCV ranks results by parameter first and cv fold second. To be memory
+    # efficient, this implementation fits on cv folds first and parameters second, so when the
+    # results are retrieved the ordering must be reverted to match the original SearchCV output.
+    def generate_the_order_by_parameter_index(all_combination_length: int) -> List[int]:
+        pattern = []
+        for i in range(all_combination_length):
+            if i % parameter_grid_length == 0:
+                pattern.append(i)
+        for i in range(1, parameter_grid_length):
+            for j in range(all_combination_length):
+                if j % parameter_grid_length == i:
+                    pattern.append(j)
+        return pattern
+
+    def rerank_array(original_array: List[Any], pattern: List[int]) -> List[Any]:
+        reranked_array = []
+        for index in pattern:
+            reranked_array.append(original_array[index])
+        return reranked_array
+
+    pattern = generate_the_order_by_parameter_index(len(all_out))
+    reranked_all_out = rerank_array(all_out, pattern)
     first_test_score = all_out[0]["test_scores"]
-    return first_test_score, estimator._format_results(param_grid, n_split, all_out)
+    return first_test_score, estimator._format_results(param_grid, n_split, reranked_all_out)
 
 
 cp.register_pickle_by_value(inspect.getmodule(construct_cv_results))
-cp.register_pickle_by_value(inspect.getmodule(construct_cv_results_new_implementation))
+cp.register_pickle_by_value(inspect.getmodule(construct_cv_results_memory_efficient_version))
 
 
 class DistributedHPOTrainer(SnowparkModelTrainer):
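
To make the reordering concrete, here is a small standalone sketch (an editorial illustration, not part of the package) of the two helpers above with parameter_grid_length = 3 and two cv folds. The UDTF output arrives fold-major, and the generated pattern restores the parameter-major ordering that SearchCV reports:

    # Standalone sketch of the reranking above; names mirror the diff, the data is made up.
    from typing import Any, List

    parameter_grid_length = 3  # three hyperparameter candidates

    def generate_the_order_by_parameter_index(all_combination_length: int) -> List[int]:
        pattern = []
        for i in range(all_combination_length):
            if i % parameter_grid_length == 0:
                pattern.append(i)
        for i in range(1, parameter_grid_length):
            for j in range(all_combination_length):
                if j % parameter_grid_length == i:
                    pattern.append(j)
        return pattern

    def rerank_array(original_array: List[Any], pattern: List[int]) -> List[Any]:
        return [original_array[index] for index in pattern]

    # Fold-major input: (fold0, cand0), (fold0, cand1), ... as produced by the cv-first fit.
    all_out = ["f0c0", "f0c1", "f0c2", "f1c0", "f1c1", "f1c2"]
    pattern = generate_the_order_by_parameter_index(len(all_out))  # [0, 3, 1, 4, 2, 5]
    print(rerank_array(all_out, pattern))  # ['f0c0', 'f1c0', 'f0c1', 'f1c1', 'f0c2', 'f1c2']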
@@ -661,7 +684,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
 
         return fit_estimator
 
-    def fit_search_snowpark_new_implementation(
+    def fit_search_snowpark_enable_efficient_memory_usage(
         self,
         param_grid: Union[model_selection.ParameterGrid, model_selection.ParameterSampler],
         dataset: DataFrame,
@@ -675,7 +698,6 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
     ) -> Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV]:
         from itertools import product
 
-        import cachetools
         from sklearn.base import clone, is_classifier
         from sklearn.calibration import check_cv
 
@@ -696,9 +718,11 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         # Create a temp file and dump the estimator to that file.
         estimator_file_name = get_temp_file_path()
         params_to_evaluate = list(param_grid)
-        n_candidates = len(params_to_evaluate)
-        _N_JOBS = estimator.n_jobs
-        _PRE_DISPATCH = estimator.pre_dispatch
+        CONSTANTS: Dict[str, Any] = dict()
+        CONSTANTS["dataset_snowpark_cols"] = dataset.columns
+        CONSTANTS["n_candidates"] = len(params_to_evaluate)
+        CONSTANTS["_N_JOBS"] = estimator.n_jobs
+        CONSTANTS["_PRE_DISPATCH"] = estimator.pre_dispatch
 
         with open(estimator_file_name, mode="w+b") as local_estimator_file_obj:
             cp.dump(dict(estimator=estimator, param_grid=params_to_evaluate), local_estimator_file_obj)
@@ -718,7 +742,10 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 inspect.currentframe(), self.__class__.__name__
             ),
             api_calls=[udtf],
-            custom_tags=dict([("hpo_udtf", True)]),
+            custom_tags=dict([("hpo_memory_efficient", True)]),
+        )
+        from snowflake.ml.modeling._internal.snowpark_implementations.distributed_search_udf_file import (
+            execute_template,
         )
 
         # Put locally serialized estimator on stage.
@@ -730,6 +757,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         )
         estimator_location = os.path.basename(estimator_file_name)
         imports.append(f"@{temp_stage_name}/{estimator_location}")
+        CONSTANTS["estimator_location"] = estimator_location
 
         search_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
         random_udtf_name = random_name_for_temp_object(TempObjectType.FUNCTION)
@@ -760,7 +788,6 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         ) -> str:
             import os
             import time
-            from typing import Iterator
 
             import cloudpickle as cp
             import pandas as pd
@@ -882,146 +909,67 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         fit_and_score_kwargs_location = os.path.basename(local_fit_and_score_kwargs_file_name)
         imports.append(f"@{temp_stage_name}/{fit_and_score_kwargs_location}")
 
-        cross_validator_indices_length = int(len(cross_validator_indices))
-        parameter_grid_length = len(param_grid)
-
-        assert estimator is not None
-
-        @cachetools.cached(cache={})
-        def _load_data_into_udf() -> Tuple[
-            npt.NDArray[Any],
-            npt.NDArray[Any],
-            List[List[int]],
-            List[Dict[str, Any]],
-            object,
-            Dict[str, Any],
-        ]:
-            import pyarrow.parquet as pq
+        CONSTANTS["input_cols"] = input_cols
+        CONSTANTS["label_cols"] = label_cols
+        CONSTANTS["DATA_LENGTH"] = DATA_LENGTH
+        CONSTANTS["n_splits"] = n_splits
+        CONSTANTS["indices_location"] = indices_location
+        CONSTANTS["base_estimator_location"] = base_estimator_location
+        CONSTANTS["fit_and_score_kwargs_location"] = fit_and_score_kwargs_location
 
-            data_files = [
-                filename
-                for filename in os.listdir(sys._xoptions["snowflake_import_directory"])
-                if filename.startswith(dataset_file_name)
-            ]
-            partial_df = [
-                pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas()
-                for file_name in data_files
-            ]
-            df = pd.concat(partial_df, ignore_index=True)
-            df.columns = [identifier.get_inferred_name(col_) for col_ in df.columns]
+        # (6) store the constants
+        local_constant_file_name = get_temp_file_path(prefix="constant")
+        with open(local_constant_file_name, mode="w+b") as local_indices_file_obj:
+            cp.dump(CONSTANTS, local_indices_file_obj)
 
-            # load parameter grid
-            local_estimator_file_path = os.path.join(
-                sys._xoptions["snowflake_import_directory"], f"{estimator_location}"
-            )
-            with open(local_estimator_file_path, mode="rb") as local_estimator_file_obj:
-                estimator_objects = cp.load(local_estimator_file_obj)
-                params_to_evaluate = estimator_objects["param_grid"]
+        # Put locally serialized constants on stage.
+        session.file.put(
+            local_constant_file_name,
+            temp_stage_name,
+            auto_compress=False,
+            overwrite=True,
+        )
+        constant_location = os.path.basename(local_constant_file_name)
+        imports.append(f"@{temp_stage_name}/{constant_location}")
 
-            # load indices
-            local_indices_file_path = os.path.join(
-                sys._xoptions["snowflake_import_directory"], f"{indices_location}"
-            )
-            with open(local_indices_file_path, mode="rb") as local_indices_file_obj:
-                indices = cp.load(local_indices_file_obj)
+        cross_validator_indices_length = int(len(cross_validator_indices))
+        parameter_grid_length = len(param_grid)
 
-            # load base estimator
-            local_base_estimator_file_path = os.path.join(
-                sys._xoptions["snowflake_import_directory"], f"{base_estimator_location}"
-            )
-            with open(local_base_estimator_file_path, mode="rb") as local_base_estimator_file_obj:
-                base_estimator = cp.load(local_base_estimator_file_obj)
+        assert estimator is not None
 
-            # load fit_and_score_kwargs
-            local_fit_and_score_kwargs_file_path = os.path.join(
-                sys._xoptions["snowflake_import_directory"], f"{fit_and_score_kwargs_location}"
-            )
-            with open(local_fit_and_score_kwargs_file_path, mode="rb") as local_fit_and_score_kwargs_file_obj:
-                fit_and_score_kwargs = cp.load(local_fit_and_score_kwargs_file_obj)
-
-            # convert dataframe to numpy would save memory consumption
-            return (
-                df[input_cols].to_numpy(),
-                df[label_cols].squeeze().to_numpy(),
-                indices,
-                params_to_evaluate,
-                base_estimator,
-                fit_and_score_kwargs,
+        # Instantiate UDTFRegistration with the session object
+        udtf_registration = UDTFRegistration(session)
+
+        import tempfile
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as f:
+            udf_code = execute_template
+            f.file.write(udf_code)
+            f.file.flush()
+
+            # Register the UDTF function from the file
+            udtf_registration.register_from_file(
+                file_path=f.name,
+                handler_name="SearchCV",
+                name=random_udtf_name,
+                output_schema=StructType(
+                    [StructField("FIRST_IDX", IntegerType()), StructField("EACH_CV_RESULTS", StringType())]
+                ),
+                input_types=[IntegerType(), IntegerType(), IntegerType()],
+                replace=True,
+                imports=imports,  # type: ignore[arg-type]
+                is_permanent=False,
+                packages=required_deps,  # type: ignore[arg-type]
+                statement_params=udtf_statement_params,
             )
 
-        # Note Table functions (UDTFs) have a limit of 500 input arguments and 500 output columns.
-        class SearchCV:
-            def __init__(self) -> None:
-                X, y, indices, params_to_evaluate, base_estimator, fit_and_score_kwargs = _load_data_into_udf()
-                self.X = X
-                self.y = y
-                self.test_indices = indices
-                self.params_to_evaluate = params_to_evaluate
-                self.base_estimator = base_estimator
-                self.fit_and_score_kwargs = fit_and_score_kwargs
-                self.fit_score_params: List[Any] = []
-                self.cached_train_test_indices = []
-                # Calculate the full index here to avoid duplicate calculation (which consumes a lot of memory)
-                full_index = np.arange(DATA_LENGTH)
-                for i in range(n_splits):
-                    self.cached_train_test_indices.extend(
-                        [[np.setdiff1d(full_index, self.test_indices[i]), self.test_indices[i]]]
-                    )
-
-            def process(self, idx: int, params_idx: int, cv_idx: int) -> None:
-                self.fit_score_params.extend([[idx, params_idx, cv_idx]])
-
-            def end_partition(self) -> Iterator[Tuple[int, str]]:
-                from sklearn.base import clone
-                from sklearn.model_selection._validation import _fit_and_score
-                from sklearn.utils.parallel import Parallel, delayed
-
-                parallel = Parallel(n_jobs=_N_JOBS, pre_dispatch=_PRE_DISPATCH)
-
-                out = parallel(
-                    delayed(_fit_and_score)(
-                        clone(self.base_estimator),
-                        self.X,
-                        self.y,
-                        train=self.cached_train_test_indices[split_idx][0],
-                        test=self.cached_train_test_indices[split_idx][1],
-                        parameters=self.params_to_evaluate[cand_idx],
-                        split_progress=(split_idx, n_splits),
-                        candidate_progress=(cand_idx, n_candidates),
-                        **self.fit_and_score_kwargs,  # load sample weight here
-                    )
-                    for _, cand_idx, split_idx in self.fit_score_params
-                )
-
-                binary_cv_results = None
-                with io.BytesIO() as f:
-                    cp.dump(out, f)
-                    f.seek(0)
-                    binary_cv_results = f.getvalue().hex()
-                yield (
-                    self.fit_score_params[0][0],
-                    binary_cv_results,
-                )
-
-        session.udtf.register(
-            SearchCV,
-            output_schema=StructType([StructField("IDX", IntegerType()), StructField("CV_RESULTS", StringType())]),
-            input_types=[IntegerType(), IntegerType(), IntegerType()],
-            name=random_udtf_name,
-            packages=required_deps,  # type: ignore[arg-type]
-            replace=True,
-            is_permanent=False,
-            imports=imports,  # type: ignore[arg-type]
-            statement_params=udtf_statement_params,
-        )
-
         HP_TUNING = F.table_function(random_udtf_name)
 
         # param_indices is for the index for each parameter grid;
         # cv_indices is for the index for each cross_validator's fold;
         # param_cv_indices is for the index for the product of (len(param_indices) * len(cv_indices))
-        param_indices, cv_indices = zip(
-            *product(range(parameter_grid_length), range(cross_validator_indices_length))
+        cv_indices, param_indices = zip(
+            *product(range(cross_validator_indices_length), range(parameter_grid_length))
         )
 
         indices_info_pandas = pd.DataFrame(
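
For orientation, the shape of the `register_from_file` flow used above, reduced to a minimal hedged sketch. It assumes an existing Snowpark `session`; the handler class and names here are illustrative, not the package's:

    # Minimal UDTF registration from a source file, mirroring the diff's flow.
    import tempfile

    from snowflake.snowpark.types import IntegerType, StringType, StructField, StructType
    from snowflake.snowpark.udtf import UDTFRegistration

    handler_source = '''
    class Echo:
        def process(self, x: int):
            yield (x, str(x))
    '''

    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(handler_source)
        f.flush()
        # Register the handler class defined in the temp file as a temporary UDTF.
        UDTFRegistration(session).register_from_file(
            file_path=f.name,
            handler_name="Echo",
            name="ECHO_UDTF",
            output_schema=StructType([StructField("IDX", IntegerType()), StructField("VAL", StringType())]),
            input_types=[IntegerType()],
            replace=True,
            is_permanent=False,
        )

Shipping the handler as a real file is what lets its module-level code run once per node, which is the point of the new distributed_search_udf_file.py helper below.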
@@ -1042,11 +990,11 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             ),
         )
 
-        first_test_score, cv_results_ = construct_cv_results_new_implementation(
+        first_test_score, cv_results_ = construct_cv_results_memory_efficient_version(
             estimator,
             n_splits,
             list(param_grid),
-            HP_raw_results.select("CV_RESULTS").sort(F.col("IDX")).collect(),
+            HP_raw_results.select("EACH_CV_RESULTS").sort(F.col("FIRST_IDX")).collect(),
             cross_validator_indices_length,
             parameter_grid_length,
         )
@@ -1163,7 +1111,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             pkg_versions=model_spec.pkgDependencies, session=self.session
         )
         if ENABLE_EFFICIENT_MEMORY_USAGE:
-            return self.fit_search_snowpark_new_implementation(
+            return self.fit_search_snowpark_enable_efficient_memory_usage(
                 param_grid=param_grid,
                 dataset=self.dataset,
                 session=self.session,
@@ -0,0 +1,159 @@
+"""
+Description:
+    This is the helper file for distributed_hpo_trainer.py to create the UDTF via `register_from_file`.
+Performance Benefits:
+    The performance benefits come from two aspects:
+    1. register_from_file avoids loading the data repeatedly, by loading it only once on each node.
+    2. register_from_file allows data to be loaded into a global variable, which an inline Python UDTF cannot do.
+Developer Tips:
+    Because this script is shipped as a string, there is no type hinting, linting, etc. It is highly recommended
+    to develop it as a regular Python script, verify the type hints, and then convert it into a string.
+"""
+
+execute_template = """
+from typing import Tuple, Any, List, Dict, Set, Iterator
+import os
+import sys
+import pandas as pd
+import numpy as np
+import numpy.typing as npt
+import cloudpickle as cp
+import io
+
+
+def _load_data_into_udf() -> Tuple[
+    npt.NDArray[Any],
+    npt.NDArray[Any],
+    List[List[int]],
+    List[Dict[str, Any]],
+    object,
+    Dict[str, Any],
+    Dict[str, Any],
+]:
+    import pyarrow.parquet as pq
+
+    data_files = [
+        filename
+        for filename in os.listdir(sys._xoptions["snowflake_import_directory"])
+        if filename.startswith("dataset")
+    ]
+    partial_df = [
+        pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas()
+        for file_name in data_files
+    ]
+    df = pd.concat(partial_df, ignore_index=True)
+    constant_file_path = None
+    for filename in os.listdir(sys._xoptions["snowflake_import_directory"]):
+        if filename.startswith("constant"):
+            constant_file_path = os.path.join(sys._xoptions["snowflake_import_directory"], f"{filename}")
+    if constant_file_path is None:
+        raise ValueError("UDTF cannot find the constant location, abort!")
+    with open(constant_file_path, mode="rb") as constant_file_obj:
+        CONSTANTS = cp.load(constant_file_obj)
+    df.columns = CONSTANTS['dataset_snowpark_cols']
+
+    # load parameter grid
+    local_estimator_file_path = os.path.join(
+        sys._xoptions["snowflake_import_directory"],
+        f"{CONSTANTS['estimator_location']}"
+    )
+    with open(local_estimator_file_path, mode="rb") as local_estimator_file_obj:
+        estimator_objects = cp.load(local_estimator_file_obj)
+        params_to_evaluate = estimator_objects["param_grid"]
+
+    # load indices
+    local_indices_file_path = os.path.join(
+        sys._xoptions["snowflake_import_directory"],
+        f"{CONSTANTS['indices_location']}"
+    )
+    with open(local_indices_file_path, mode="rb") as local_indices_file_obj:
+        indices = cp.load(local_indices_file_obj)
+
+    # load base estimator
+    local_base_estimator_file_path = os.path.join(
+        sys._xoptions["snowflake_import_directory"], f"{CONSTANTS['base_estimator_location']}"
+    )
+    with open(local_base_estimator_file_path, mode="rb") as local_base_estimator_file_obj:
+        base_estimator = cp.load(local_base_estimator_file_obj)
+
+    # load fit_and_score_kwargs
+    local_fit_and_score_kwargs_file_path = os.path.join(
+        sys._xoptions["snowflake_import_directory"], f"{CONSTANTS['fit_and_score_kwargs_location']}"
+    )
+    with open(local_fit_and_score_kwargs_file_path, mode="rb") as local_fit_and_score_kwargs_file_obj:
+        fit_and_score_kwargs = cp.load(local_fit_and_score_kwargs_file_obj)
+
+    # converting the dataframe to numpy reduces memory consumption
+    return (
+        df[CONSTANTS['input_cols']].to_numpy(),
+        df[CONSTANTS['label_cols']].squeeze().to_numpy(),
+        indices,
+        params_to_evaluate,
+        base_estimator,
+        fit_and_score_kwargs,
+        CONSTANTS
+    )
+
+
+global_load_data = _load_data_into_udf()
+
+
+# Note: table functions (UDTFs) have a limit of 500 input arguments and 500 output columns.
+class SearchCV:
+    def __init__(self) -> None:
+        X, y, indices, params_to_evaluate, base_estimator, fit_and_score_kwargs, CONSTANTS = global_load_data
+        self.X = X
+        self.y = y
+        self.test_indices = indices
+        self.params_to_evaluate = params_to_evaluate
+        self.base_estimator = base_estimator
+        self.fit_and_score_kwargs = fit_and_score_kwargs
+        self.fit_score_params: List[Any] = []
+        self.CONSTANTS = CONSTANTS
+        self.cv_indices_set: Set[int] = set()
+
+    def process(self, idx: int, params_idx: int, cv_idx: int) -> None:
+        self.fit_score_params.extend([[idx, params_idx, cv_idx]])
+        self.cv_indices_set.add(cv_idx)
+
+    def end_partition(self) -> Iterator[Tuple[int, str]]:
+        from sklearn.base import clone
+        from sklearn.model_selection._validation import _fit_and_score
+        from sklearn.utils.parallel import Parallel, delayed
+
+        cached_train_test_indices = {}
+        # Calculate the full index here to avoid duplicate calculation (which consumes a lot of memory).
+        full_index = np.arange(self.CONSTANTS['DATA_LENGTH'])
+        for i in self.cv_indices_set:
+            cached_train_test_indices[i] = [
+                np.setdiff1d(full_index, self.test_indices[i]),
+                self.test_indices[i],
+            ]
+
+        parallel = Parallel(n_jobs=self.CONSTANTS['_N_JOBS'], pre_dispatch=self.CONSTANTS['_PRE_DISPATCH'])
+
+        out = parallel(
+            delayed(_fit_and_score)(
+                clone(self.base_estimator),
+                self.X,
+                self.y,
+                train=cached_train_test_indices[split_idx][0],
+                test=cached_train_test_indices[split_idx][1],
+                parameters=self.params_to_evaluate[cand_idx],
+                split_progress=(split_idx, self.CONSTANTS['n_splits']),
+                candidate_progress=(cand_idx, self.CONSTANTS['n_candidates']),
+                **self.fit_and_score_kwargs,  # load sample weight here
+            )
+            for _, cand_idx, split_idx in self.fit_score_params
+        )
+
+        binary_cv_results = None
+        with io.BytesIO() as f:
+            cp.dump(out, f)
+            f.seek(0)
+            binary_cv_results = f.getvalue().hex()
+        yield (
+            self.fit_score_params[0][0],
+            binary_cv_results,
+        )
+"""
@@ -45,6 +45,7 @@ cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
 cp.register_pickle_by_value(inspect.getmodule(handle_inference_result))
 
 _PROJECT = "ModelDevelopment"
+_ENABLE_ANONYMOUS_SPROC = False
 
 
 class SnowparkModelTrainer:
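
_ENABLE_ANONYMOUS_SPROC defaults to False. Judging from the new snowflake/ml/modeling/parameters/enable_anonymous_sproc.py module (+5 lines in the file list), the opt-in is presumably a side-effect import that flips this flag; a hypothetical sketch:

    # Hypothetical reconstruction of enable_anonymous_sproc.py; the released file may differ.
    from snowflake.ml.modeling._internal.snowpark_implementations import snowpark_trainer

    snowpark_trainer._ENABLE_ANONYMOUS_SPROC = True

so that user code would opt in with a bare `import snowflake.ml.modeling.parameters.enable_anonymous_sproc` before calling fit.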
@@ -251,6 +252,27 @@ class SnowparkModelTrainer:
 
         return fit_wrapper_function
 
+    def _get_fit_wrapper_sproc_anonymous(self, statement_params: Dict[str, str]) -> StoredProcedure:
+        model_spec = ModelSpecificationsBuilder.build(model=self.estimator)
+        fit_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
+
+        relaxed_dependencies = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
+            pkg_versions=model_spec.pkgDependencies, session=self.session
+        )
+
+        fit_wrapper_sproc = self.session.sproc.register(
+            func=self._build_fit_wrapper_sproc(model_spec=model_spec),
+            is_permanent=False,
+            name=fit_sproc_name,
+            packages=["snowflake-snowpark-python"] + relaxed_dependencies,  # type: ignore[arg-type]
+            replace=True,
+            session=self.session,
+            statement_params=statement_params,
+            anonymous=True,
+        )
+
+        return fit_wrapper_sproc
+
     def _get_fit_wrapper_sproc(self, statement_params: Dict[str, str]) -> StoredProcedure:
         # If the sproc already exists, don't register.
         if not hasattr(self.session, "_FIT_WRAPPER_SPROCS"):
@@ -510,6 +532,28 @@ class SnowparkModelTrainer:
 
         return fit_transform_wrapper_function
 
+    def _get_fit_predict_wrapper_sproc_anonymous(self, statement_params: Dict[str, str]) -> StoredProcedure:
+        model_spec = ModelSpecificationsBuilder.build(model=self.estimator)
+
+        fit_predict_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
+
+        relaxed_dependencies = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
+            pkg_versions=model_spec.pkgDependencies, session=self.session
+        )
+
+        fit_predict_wrapper_sproc = self.session.sproc.register(
+            func=self._build_fit_predict_wrapper_sproc(model_spec=model_spec),
+            is_permanent=False,
+            name=fit_predict_sproc_name,
+            packages=["snowflake-snowpark-python"] + relaxed_dependencies,  # type: ignore[arg-type]
+            replace=True,
+            session=self.session,
+            statement_params=statement_params,
+            anonymous=True,
+        )
+
+        return fit_predict_wrapper_sproc
+
     def _get_fit_predict_wrapper_sproc(self, statement_params: Dict[str, str]) -> StoredProcedure:
         # If the sproc already exists, don't register.
         if not hasattr(self.session, "_FIT_WRAPPER_SPROCS"):
@@ -545,6 +589,27 @@ class SnowparkModelTrainer:
 
         return fit_predict_wrapper_sproc
 
+    def _get_fit_transform_wrapper_sproc_anonymous(self, statement_params: Dict[str, str]) -> StoredProcedure:
+        model_spec = ModelSpecificationsBuilder.build(model=self.estimator)
+
+        fit_transform_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
+
+        relaxed_dependencies = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
+            pkg_versions=model_spec.pkgDependencies, session=self.session
+        )
+
+        fit_transform_wrapper_sproc = self.session.sproc.register(
+            func=self._build_fit_transform_wrapper_sproc(model_spec=model_spec),
+            is_permanent=False,
+            name=fit_transform_sproc_name,
+            packages=["snowflake-snowpark-python"] + relaxed_dependencies,  # type: ignore[arg-type]
+            replace=True,
+            session=self.session,
+            statement_params=statement_params,
+            anonymous=True,
+        )
+        return fit_transform_wrapper_sproc
+
     def _get_fit_transform_wrapper_sproc(self, statement_params: Dict[str, str]) -> StoredProcedure:
         # If the sproc already exists, don't register.
         if not hasattr(self.session, "_FIT_WRAPPER_SPROCS"):
@@ -612,7 +677,10 @@ class SnowparkModelTrainer:
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
 
-        fit_wrapper_sproc = self._get_fit_wrapper_sproc(statement_params=statement_params)
+        if _ENABLE_ANONYMOUS_SPROC:
+            fit_wrapper_sproc = self._get_fit_wrapper_sproc_anonymous(statement_params=statement_params)
+        else:
+            fit_wrapper_sproc = self._get_fit_wrapper_sproc(statement_params=statement_params)
 
         try:
             sproc_export_file_name: str = fit_wrapper_sproc(
@@ -680,7 +748,11 @@ class SnowparkModelTrainer:
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
 
-        fit_predict_wrapper_sproc = self._get_fit_predict_wrapper_sproc(statement_params=statement_params)
+        if _ENABLE_ANONYMOUS_SPROC:
+            fit_predict_wrapper_sproc = self._get_fit_predict_wrapper_sproc_anonymous(statement_params=statement_params)
+        else:
+            fit_predict_wrapper_sproc = self._get_fit_predict_wrapper_sproc(statement_params=statement_params)
+
         fit_predict_result_name = random_name_for_temp_object(TempObjectType.TABLE)
 
         sproc_export_file_name: str = fit_predict_wrapper_sproc(
@@ -741,7 +813,13 @@ class SnowparkModelTrainer:
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
 
-        fit_transform_wrapper_sproc = self._get_fit_transform_wrapper_sproc(statement_params=statement_params)
+        if _ENABLE_ANONYMOUS_SPROC:
+            fit_transform_wrapper_sproc = self._get_fit_transform_wrapper_sproc_anonymous(
+                statement_params=statement_params
+            )
+        else:
+            fit_transform_wrapper_sproc = self._get_fit_transform_wrapper_sproc(statement_params=statement_params)
+
         fit_transform_result_name = random_name_for_temp_object(TempObjectType.TABLE)
 
         sproc_export_file_name: str = fit_transform_wrapper_sproc(
@@ -629,7 +629,14 @@ class CalibratedClassifierCV(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -606,7 +606,14 @@ class AffinityPropagation(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
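
The same sampling fix is stamped across the ~188 autogenerated estimator files in the list above. A hedged toy illustration of the mismatch it guards against (hypothetical `session`): `DataFrame.to_pandas()` can yield display names that differ from the Snowpark identifiers the estimator saw during fit, so the sample frame's columns are rewritten before inference:

    # Toy illustration; a quoted identifier round-trips differently than its display name.
    df = session.create_dataframe([[1, 2]], schema=['"a b"', "C"])
    pdf = df.limit(1).to_pandas()
    print(list(pdf.columns))  # e.g. ['a b', 'C'] -- display names from the result set
    pdf.columns = df.columns  # restore Snowpark identifiers: ['"a b"', 'C']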