PyPI - snowflake-ml-python - Versions diffs - 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl - Mend

snowflake-ml-python 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (218) hide show

snowflake/ml/_internal/env_utils.py +72 -31
snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
snowflake/ml/_internal/exceptions/error_codes.py +3 -0
snowflake/ml/_internal/lineage/data_source.py +10 -0
snowflake/ml/_internal/lineage/lineage_utils.py +95 -0
snowflake/ml/_internal/telemetry.py +1 -0
snowflake/ml/_internal/utils/identifier.py +1 -1
snowflake/ml/_internal/utils/sql_identifier.py +14 -1
snowflake/ml/dataset/__init__.py +11 -0
snowflake/ml/dataset/dataset.py +455 -129
snowflake/ml/dataset/dataset_factory.py +53 -0
snowflake/ml/dataset/dataset_metadata.py +103 -0
snowflake/ml/dataset/dataset_reader.py +199 -0
snowflake/ml/feature_store/__init__.py +6 -0
snowflake/ml/feature_store/access_manager.py +279 -0
snowflake/ml/feature_store/feature_store.py +544 -358
snowflake/ml/feature_store/feature_view.py +55 -16
snowflake/ml/fileset/embedded_stage_fs.py +149 -0
snowflake/ml/fileset/sfcfs.py +0 -4
snowflake/ml/fileset/snowfs.py +160 -0
snowflake/ml/fileset/stage_fs.py +25 -10
snowflake/ml/model/__init__.py +2 -2
snowflake/ml/model/_api.py +16 -1
snowflake/ml/model/_client/model/model_impl.py +65 -31
snowflake/ml/model/_client/model/model_version_impl.py +159 -2
snowflake/ml/model/_client/ops/metadata_ops.py +27 -4
snowflake/ml/model/_client/ops/model_ops.py +268 -83
snowflake/ml/model/_client/sql/_base.py +34 -0
snowflake/ml/model/_client/sql/model.py +42 -47
snowflake/ml/model/_client/sql/model_version.py +164 -39
snowflake/ml/model/_client/sql/stage.py +6 -32
snowflake/ml/model/_client/sql/tag.py +32 -56
snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
snowflake/ml/model/_model_composer/model_composer.py +22 -1
snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +22 -0
snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +11 -0
snowflake/ml/model/_packager/model_env/model_env.py +41 -0
snowflake/ml/model/_packager/model_handlers/mlflow.py +2 -1
snowflake/ml/model/_packager/model_meta/model_meta.py +1 -5
snowflake/ml/model/_packager/model_packager.py +0 -3
snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
snowflake/ml/modeling/_internal/model_trainer.py +7 -0
snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +50 -21
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +24 -2
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +340 -17
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -52
snowflake/ml/modeling/cluster/affinity_propagation.py +51 -52
snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -52
snowflake/ml/modeling/cluster/birch.py +53 -52
snowflake/ml/modeling/cluster/bisecting_k_means.py +53 -52
snowflake/ml/modeling/cluster/dbscan.py +51 -52
snowflake/ml/modeling/cluster/feature_agglomeration.py +53 -52
snowflake/ml/modeling/cluster/k_means.py +53 -52
snowflake/ml/modeling/cluster/mean_shift.py +51 -52
snowflake/ml/modeling/cluster/mini_batch_k_means.py +53 -52
snowflake/ml/modeling/cluster/optics.py +51 -52
snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -52
snowflake/ml/modeling/cluster/spectral_clustering.py +51 -52
snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -52
snowflake/ml/modeling/compose/column_transformer.py +53 -52
snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -52
snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -52
snowflake/ml/modeling/covariance/empirical_covariance.py +51 -52
snowflake/ml/modeling/covariance/graphical_lasso.py +51 -52
snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -52
snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -52
snowflake/ml/modeling/covariance/min_cov_det.py +51 -52
snowflake/ml/modeling/covariance/oas.py +51 -52
snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -52
snowflake/ml/modeling/decomposition/dictionary_learning.py +53 -52
snowflake/ml/modeling/decomposition/factor_analysis.py +53 -52
snowflake/ml/modeling/decomposition/fast_ica.py +53 -52
snowflake/ml/modeling/decomposition/incremental_pca.py +53 -52
snowflake/ml/modeling/decomposition/kernel_pca.py +53 -52
snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +53 -52
snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +53 -52
snowflake/ml/modeling/decomposition/pca.py +53 -52
snowflake/ml/modeling/decomposition/sparse_pca.py +53 -52
snowflake/ml/modeling/decomposition/truncated_svd.py +53 -52
snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +53 -52
snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -52
snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -52
snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -52
snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -52
snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -52
snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -52
snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -52
snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -52
snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -52
snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -52
snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -52
snowflake/ml/modeling/ensemble/isolation_forest.py +51 -52
snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -52
snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -52
snowflake/ml/modeling/ensemble/stacking_regressor.py +53 -52
snowflake/ml/modeling/ensemble/voting_classifier.py +53 -52
snowflake/ml/modeling/ensemble/voting_regressor.py +53 -52
snowflake/ml/modeling/feature_selection/generic_univariate_select.py +53 -52
snowflake/ml/modeling/feature_selection/select_fdr.py +53 -52
snowflake/ml/modeling/feature_selection/select_fpr.py +53 -52
snowflake/ml/modeling/feature_selection/select_fwe.py +53 -52
snowflake/ml/modeling/feature_selection/select_k_best.py +53 -52
snowflake/ml/modeling/feature_selection/select_percentile.py +53 -52
snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +53 -52
snowflake/ml/modeling/feature_selection/variance_threshold.py +53 -52
snowflake/ml/modeling/framework/base.py +64 -36
snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -52
snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -52
snowflake/ml/modeling/impute/iterative_imputer.py +53 -52
snowflake/ml/modeling/impute/knn_imputer.py +53 -52
snowflake/ml/modeling/impute/missing_indicator.py +53 -52
snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +53 -52
snowflake/ml/modeling/kernel_approximation/nystroem.py +53 -52
snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +53 -52
snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +53 -52
snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +53 -52
snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -52
snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -52
snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -52
snowflake/ml/modeling/linear_model/ard_regression.py +51 -52
snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -52
snowflake/ml/modeling/linear_model/elastic_net.py +51 -52
snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -52
snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -52
snowflake/ml/modeling/linear_model/huber_regressor.py +51 -52
snowflake/ml/modeling/linear_model/lars.py +51 -52
snowflake/ml/modeling/linear_model/lars_cv.py +51 -52
snowflake/ml/modeling/linear_model/lasso.py +51 -52
snowflake/ml/modeling/linear_model/lasso_cv.py +51 -52
snowflake/ml/modeling/linear_model/lasso_lars.py +51 -52
snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -52
snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -52
snowflake/ml/modeling/linear_model/linear_regression.py +51 -52
snowflake/ml/modeling/linear_model/logistic_regression.py +51 -52
snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -52
snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -52
snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -52
snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -52
snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -52
snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -52
snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -52
snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -52
snowflake/ml/modeling/linear_model/perceptron.py +51 -52
snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -52
snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -52
snowflake/ml/modeling/linear_model/ridge.py +51 -52
snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -52
snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -52
snowflake/ml/modeling/linear_model/ridge_cv.py +51 -52
snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -52
snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -52
snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -52
snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -52
snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -52
snowflake/ml/modeling/manifold/isomap.py +53 -52
snowflake/ml/modeling/manifold/mds.py +53 -52
snowflake/ml/modeling/manifold/spectral_embedding.py +53 -52
snowflake/ml/modeling/manifold/tsne.py +53 -52
snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -52
snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -52
snowflake/ml/modeling/model_selection/grid_search_cv.py +21 -23
snowflake/ml/modeling/model_selection/randomized_search_cv.py +38 -20
snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -52
snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -52
snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -52
snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -52
snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -52
snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -52
snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -52
snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -52
snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -52
snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -52
snowflake/ml/modeling/neighbors/kernel_density.py +51 -52
snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -52
snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -52
snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -52
snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +53 -52
snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -52
snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -52
snowflake/ml/modeling/neural_network/bernoulli_rbm.py +53 -52
snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -52
snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -52
snowflake/ml/modeling/pipeline/pipeline.py +538 -36
snowflake/ml/modeling/preprocessing/one_hot_encoder.py +12 -0
snowflake/ml/modeling/preprocessing/polynomial_features.py +53 -52
snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -52
snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -52
snowflake/ml/modeling/svm/linear_svc.py +51 -52
snowflake/ml/modeling/svm/linear_svr.py +51 -52
snowflake/ml/modeling/svm/nu_svc.py +51 -52
snowflake/ml/modeling/svm/nu_svr.py +51 -52
snowflake/ml/modeling/svm/svc.py +51 -52
snowflake/ml/modeling/svm/svr.py +51 -52
snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -52
snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -52
snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -52
snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -52
snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -52
snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -52
snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -52
snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -52
snowflake/ml/registry/_manager/model_manager.py +36 -7
snowflake/ml/registry/model_registry.py +3 -149
snowflake/ml/version.py +1 -1
{snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/METADATA +112 -7
{snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/RECORD +216 -206
snowflake/ml/registry/_artifact_manager.py +0 -156
snowflake/ml/registry/artifact.py +0 -46
{snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/LICENSE.txt +0 -0
{snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/WHEEL +0 -0
{snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/top_level.txt +0 -0

snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py CHANGED Viewed

@@ -72,24 +72,40 @@ class MLRuntimeTransformHandlers:
         """
-        handler = SnowparkTransformHandlers(
-            dataset=self.dataset,
-            estimator=self.estimator,
-            class_name=self._class_name,
-            subproject=self._subproject,
-            autogenerated=self._autogenerated,
-        )
-        return handler.batch_inference(
-            inference_method,
-            input_cols,
-            expected_output_cols,
-            session,
-            dependencies,
-            drop_input_cols,
-            expected_output_cols_type,
-            *args,
-            **kwargs,
-        )
+        mlrs_inference_methods = ["predict", "predict_proba", "predict_log_proba"]
+        if inference_method in mlrs_inference_methods:
+            result_df = self.client.inference(
+                estimator=self.estimator,
+                dataset=self.dataset,
+                inference_method=inference_method,
+                input_cols=input_cols,
+                output_cols=expected_output_cols,
+                drop_input_cols=drop_input_cols,
+            )
+        else:
+            handler = SnowparkTransformHandlers(
+                dataset=self.dataset,
+                estimator=self.estimator,
+                class_name=self._class_name,
+                subproject=self._subproject,
+                autogenerated=self._autogenerated,
+            )
+            result_df = handler.batch_inference(
+                inference_method,
+                input_cols,
+                expected_output_cols,
+                session,
+                dependencies,
+                drop_input_cols,
+                expected_output_cols_type,
+                *args,
+                **kwargs,
+            )
+        assert isinstance(result_df, DataFrame)  # mypy - The MLRS return types are annotated as `object`.
+        return result_df
     def score(
         self,

snowflake/ml/modeling/_internal/model_trainer.py CHANGED Viewed

@@ -22,3 +22,10 @@ class ModelTrainer(Protocol):
         drop_input_cols: Optional[bool] = False,
     ) -> Tuple[Union[DataFrame, pd.DataFrame], object]:
         raise NotImplementedError
+    def train_fit_transform(
+        self,
+        expected_output_cols_list: List[str],
+        drop_input_cols: Optional[bool] = False,
+    ) -> Tuple[Union[DataFrame, pd.DataFrame], object]:
+        raise NotImplementedError

snowflake/ml/modeling/_internal/model_trainer_builder.py CHANGED Viewed

@@ -138,21 +138,13 @@ class ModelTrainerBuilder:
         cls,
         estimator: object,
         dataset: Union[DataFrame, pd.DataFrame],
-        input_cols: Optional[List[str]] = None,
+        input_cols: List[str],
         autogenerated: bool = False,
         subproject: str = "",
     ) -> ModelTrainer:
         """
         Builder method that creates an appropriate ModelTrainer instance based on the given params.
         """
-        if input_cols is None:
-            raise exceptions.SnowflakeMLException(
-                error_code=error_codes.NOT_FOUND,
-                original_exception=ValueError(
-                    "The input column names (input_cols) is None.\n"
-                    "Please put your input_cols when initializing the estimator\n"
-                ),
-            )
         if isinstance(dataset, pd.DataFrame):
             return PandasModelTrainer(
                 estimator=estimator,
@@ -179,3 +171,44 @@ class ModelTrainerBuilder:
                 f"Unexpected dataset type: {type(dataset)}."
                 "Supported dataset types: snowpark.DataFrame, pandas.DataFrame."
             )
+    @classmethod
+    def build_fit_transform(
+        cls,
+        estimator: object,
+        dataset: Union[DataFrame, pd.DataFrame],
+        input_cols: List[str],
+        label_cols: Optional[List[str]] = None,
+        sample_weight_col: Optional[str] = None,
+        autogenerated: bool = False,
+        subproject: str = "",
+    ) -> ModelTrainer:
+        """
+        Builder method that creates an appropriate ModelTrainer instance based on the given params.
+        """
+        if isinstance(dataset, pd.DataFrame):
+            return PandasModelTrainer(
+                estimator=estimator,
+                dataset=dataset,
+                input_cols=input_cols,
+                label_cols=label_cols,
+                sample_weight_col=sample_weight_col,
+            )
+        elif isinstance(dataset, DataFrame):
+            trainer_klass = SnowparkModelTrainer
+            init_args = {
+                "estimator": estimator,
+                "dataset": dataset,
+                "session": dataset._session,
+                "input_cols": input_cols,
+                "label_cols": label_cols,
+                "sample_weight_col": sample_weight_col,
+                "autogenerated": autogenerated,
+                "subproject": subproject,
+            }
+            return trainer_klass(**init_args)  # type: ignore[arg-type]
+        else:
+            raise TypeError(
+                f"Unexpected dataset type: {type(dataset)}."
+                "Supported dataset types: snowpark.DataFrame, pandas.DataFrame."
+            )

snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py CHANGED Viewed

@@ -4,7 +4,7 @@ import io
 import os
 import posixpath
 import sys
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 import cloudpickle as cp
 import numpy as np
@@ -154,7 +154,7 @@ def construct_cv_results(
     return multimetric, estimator._format_results(param_grid, n_split, out)
-def construct_cv_results_new_implementation(
+def construct_cv_results_memory_efficient_version(
     estimator: Union[GridSearchCV, RandomizedSearchCV],
     n_split: int,
     param_grid: List[Dict[str, Any]],
@@ -205,12 +205,35 @@ def construct_cv_results_new_implementation(
         with io.BytesIO(hex_str) as f_reload:
             out = cp.load(f_reload)
             all_out.extend(out)
+    # because original SearchCV is ranked by parameter first and cv second,
+    # to make the memory efficient, we implemented by fitting on cv first and parameter second
+    # when retrieving the results back, the ordering should revert back to remain the same result as original SearchCV
+    def generate_the_order_by_parameter_index(all_combination_length: int) -> List[int]:
+        pattern = []
+        for i in range(all_combination_length):
+            if i % parameter_grid_length == 0:
+                pattern.append(i)
+        for i in range(1, parameter_grid_length):
+            for j in range(all_combination_length):
+                if j % parameter_grid_length == i:
+                    pattern.append(j)
+        return pattern
+    def rerank_array(original_array: List[Any], pattern: List[int]) -> List[Any]:
+        reranked_array = []
+        for index in pattern:
+            reranked_array.append(original_array[index])
+        return reranked_array
+    pattern = generate_the_order_by_parameter_index(len(all_out))
+    reranked_all_out = rerank_array(all_out, pattern)
     first_test_score = all_out[0]["test_scores"]
-    return first_test_score, estimator._format_results(param_grid, n_split, all_out)
+    return first_test_score, estimator._format_results(param_grid, n_split, reranked_all_out)
 cp.register_pickle_by_value(inspect.getmodule(construct_cv_results))
-cp.register_pickle_by_value(inspect.getmodule(construct_cv_results_new_implementation))
+cp.register_pickle_by_value(inspect.getmodule(construct_cv_results_memory_efficient_version))
 class DistributedHPOTrainer(SnowparkModelTrainer):
@@ -661,7 +684,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         return fit_estimator
-    def fit_search_snowpark_new_implementation(
+    def fit_search_snowpark_enable_efficient_memory_usage(
         self,
         param_grid: Union[model_selection.ParameterGrid, model_selection.ParameterSampler],
         dataset: DataFrame,
@@ -718,7 +741,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 inspect.currentframe(), self.__class__.__name__
             ),
             api_calls=[udtf],
-            custom_tags=dict([("hpo_udtf", True)]),
+            custom_tags=dict([("hpo_memory_efficient", True)]),
         )
         # Put locally serialized estimator on stage.
@@ -960,22 +983,26 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                     self.base_estimator = base_estimator
                     self.fit_and_score_kwargs = fit_and_score_kwargs
                     self.fit_score_params: List[Any] = []
-                    self.cached_train_test_indices = []
-                    # Calculate the full index here to avoid duplicate calculation (which consumes a lot of memory)
-                    full_index = np.arange(DATA_LENGTH)
-                    for i in range(n_splits):
-                        self.cached_train_test_indices.extend(
-                            [[np.setdiff1d(full_index, self.test_indices[i]), self.test_indices[i]]]
-                        )
+                    self.cv_indices_set: Set[int] = set()
                 def process(self, idx: int, params_idx: int, cv_idx: int) -> None:
                     self.fit_score_params.extend([[idx, params_idx, cv_idx]])
+                    self.cv_indices_set.add(cv_idx)
                 def end_partition(self) -> Iterator[Tuple[int, str]]:
                     from sklearn.base import clone
                     from sklearn.model_selection._validation import _fit_and_score
                     from sklearn.utils.parallel import Parallel, delayed
+                    cached_train_test_indices = {}
+                    # Calculate the full index here to avoid duplicate calculation (which consumes a lot of memory)
+                    full_index = np.arange(DATA_LENGTH)
+                    for i in self.cv_indices_set:
+                        cached_train_test_indices[i] = [
+                            np.setdiff1d(full_index, self.test_indices[i]),
+                            self.test_indices[i],
+                        ]
                     parallel = Parallel(n_jobs=_N_JOBS, pre_dispatch=_PRE_DISPATCH)
                     out = parallel(
@@ -983,8 +1010,8 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                             clone(self.base_estimator),
                             self.X,
                             self.y,
-                            train=self.cached_train_test_indices[split_idx][0],
-                            test=self.cached_train_test_indices[split_idx][1],
+                            train=cached_train_test_indices[split_idx][0],
+                            test=cached_train_test_indices[split_idx][1],
                             parameters=self.params_to_evaluate[cand_idx],
                             split_progress=(split_idx, n_splits),
                             candidate_progress=(cand_idx, n_candidates),
@@ -1005,7 +1032,9 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             session.udtf.register(
                 SearchCV,
-                output_schema=StructType([StructField("IDX", IntegerType()), StructField("CV_RESULTS", StringType())]),
+                output_schema=StructType(
+                    [StructField("FIRST_IDX", IntegerType()), StructField("EACH_CV_RESULTS", StringType())]
+                ),
                 input_types=[IntegerType(), IntegerType(), IntegerType()],
                 name=random_udtf_name,
                 packages=required_deps,  # type: ignore[arg-type]
@@ -1020,8 +1049,8 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             # param_indices is for the index for each parameter grid;
             # cv_indices is for the index for each cross_validator's fold;
             # param_cv_indices is for the index for the product of (len(param_indices) * len(cv_indices))
-            param_indices, cv_indices = zip(
-                *product(range(parameter_grid_length), range(cross_validator_indices_length))
+            cv_indices, param_indices = zip(
+                *product(range(cross_validator_indices_length), range(parameter_grid_length))
             )
             indices_info_pandas = pd.DataFrame(
@@ -1042,11 +1071,11 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 ),
             )
-            first_test_score, cv_results_ = construct_cv_results_new_implementation(
+            first_test_score, cv_results_ = construct_cv_results_memory_efficient_version(
                 estimator,
                 n_splits,
                 list(param_grid),
-                HP_raw_results.select("CV_RESULTS").sort(F.col("IDX")).collect(),
+                HP_raw_results.select("EACH_CV_RESULTS").sort(F.col("FIRST_IDX")).collect(),
                 cross_validator_indices_length,
                 parameter_grid_length,
             )
@@ -1163,7 +1192,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             pkg_versions=model_spec.pkgDependencies, session=self.session
         )
         if ENABLE_EFFICIENT_MEMORY_USAGE:
-            return self.fit_search_snowpark_new_implementation(
+            return self.fit_search_snowpark_enable_efficient_memory_usage(
                 param_grid=param_grid,
                 dataset=self.dataset,
                 session=self.session,

snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py CHANGED Viewed

@@ -9,7 +9,11 @@ import cloudpickle as cp
 import pandas as pd
 from snowflake.ml._internal import telemetry
-from snowflake.ml._internal.utils import identifier, snowpark_dataframe_utils
+from snowflake.ml._internal.utils import (
+    identifier,
+    pkg_version_utils,
+    snowpark_dataframe_utils,
+)
 from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator
 from snowflake.ml._internal.utils.temp_file_utils import (
     cleanup_temp_files,
@@ -91,6 +95,7 @@ class SnowparkTransformHandlers:
             A new dataset of the same type as the input dataset.
         """
+        dependencies = self._get_validated_snowpark_dependencies(session, dependencies)
         dataset = self.dataset
         estimator = self.estimator
         # Register vectorized UDF for batch inference
@@ -210,7 +215,8 @@ class SnowparkTransformHandlers:
         Returns:
             An accuracy score for the model on the given test data.
         """
+        dependencies = self._get_validated_snowpark_dependencies(session, dependencies)
+        dependencies.append("snowflake-snowpark-python")
         dataset = self.dataset
         estimator = self.estimator
         dataset = snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
@@ -335,3 +341,19 @@ class SnowparkTransformHandlers:
         cleanup_temp_files([local_score_file_name])
         return score
+    def _get_validated_snowpark_dependencies(self, session: Session, dependencies: List[str]) -> List[str]:
+        """A helper function to validate dependencies and return the available packages that exists
+        in the snowflake anaconda channel
+        Args:
+            session: the active snowpark Session
+            dependencies: unvalidated dependencies
+        Returns:
+            A list of packages present in the snoflake conda channel.
+        """
+        return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
+            pkg_versions=dependencies, session=session, subproject=self._subproject
+        )

snowflake-ml-python 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl

snowflake-ml-python 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl