snowflake-ml-python 1.3.1__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. snowflake/ml/_internal/env_utils.py +11 -1
  2. snowflake/ml/_internal/human_readable_id/adjectives.txt +128 -0
  3. snowflake/ml/_internal/human_readable_id/animals.txt +128 -0
  4. snowflake/ml/_internal/human_readable_id/hrid_generator.py +40 -0
  5. snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +135 -0
  6. snowflake/ml/_internal/utils/formatting.py +1 -1
  7. snowflake/ml/_internal/utils/identifier.py +3 -1
  8. snowflake/ml/_internal/utils/sql_identifier.py +2 -6
  9. snowflake/ml/feature_store/feature_store.py +166 -184
  10. snowflake/ml/feature_store/feature_view.py +12 -24
  11. snowflake/ml/fileset/sfcfs.py +56 -50
  12. snowflake/ml/fileset/stage_fs.py +48 -13
  13. snowflake/ml/model/_client/model/model_version_impl.py +6 -49
  14. snowflake/ml/model/_client/ops/model_ops.py +78 -29
  15. snowflake/ml/model/_client/sql/model.py +23 -2
  16. snowflake/ml/model/_client/sql/model_version.py +22 -1
  17. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +1 -3
  18. snowflake/ml/model/_deploy_client/snowservice/deploy.py +5 -2
  19. snowflake/ml/model/_model_composer/model_composer.py +7 -5
  20. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
  21. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
  22. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +1 -1
  23. snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
  24. snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
  25. snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
  26. snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
  27. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +13 -1
  28. snowflake/ml/model/_packager/model_handlers/xgboost.py +1 -1
  29. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  30. snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
  31. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
  32. snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
  33. snowflake/ml/model/_packager/model_packager.py +2 -2
  34. snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
  35. snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
  36. snowflake/ml/model/custom_model.py +3 -1
  37. snowflake/ml/model/type_hints.py +21 -2
  38. snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
  39. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
  40. snowflake/ml/modeling/_internal/model_specifications.py +3 -1
  41. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +545 -0
  42. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +8 -5
  43. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
  44. snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
  45. snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
  46. snowflake/ml/modeling/cluster/birch.py +195 -123
  47. snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
  48. snowflake/ml/modeling/cluster/dbscan.py +195 -123
  49. snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
  50. snowflake/ml/modeling/cluster/k_means.py +195 -123
  51. snowflake/ml/modeling/cluster/mean_shift.py +195 -123
  52. snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
  53. snowflake/ml/modeling/cluster/optics.py +195 -123
  54. snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
  55. snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
  56. snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
  57. snowflake/ml/modeling/compose/column_transformer.py +195 -123
  58. snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
  59. snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
  60. snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
  61. snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
  62. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
  63. snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
  64. snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
  65. snowflake/ml/modeling/covariance/oas.py +195 -123
  66. snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
  67. snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
  68. snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
  69. snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
  70. snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
  71. snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
  72. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
  73. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
  74. snowflake/ml/modeling/decomposition/pca.py +195 -123
  75. snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
  76. snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
  77. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
  78. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
  79. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
  80. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
  81. snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
  82. snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
  83. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
  84. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
  85. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
  86. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
  87. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
  88. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
  89. snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
  90. snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
  91. snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
  92. snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
  93. snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
  94. snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
  95. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
  96. snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
  97. snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
  98. snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
  99. snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
  100. snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
  101. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
  102. snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
  103. snowflake/ml/modeling/framework/_utils.py +8 -1
  104. snowflake/ml/modeling/framework/base.py +24 -6
  105. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
  106. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
  107. snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
  108. snowflake/ml/modeling/impute/knn_imputer.py +195 -123
  109. snowflake/ml/modeling/impute/missing_indicator.py +195 -123
  110. snowflake/ml/modeling/impute/simple_imputer.py +4 -15
  111. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
  112. snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
  113. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
  114. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
  115. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
  116. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
  117. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +198 -125
  118. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +198 -125
  119. snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
  120. snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
  121. snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
  122. snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
  123. snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
  124. snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
  125. snowflake/ml/modeling/linear_model/lars.py +195 -123
  126. snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
  127. snowflake/ml/modeling/linear_model/lasso.py +195 -123
  128. snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
  129. snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
  130. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
  131. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
  132. snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
  133. snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
  134. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
  135. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
  136. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
  137. snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
  138. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
  139. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
  140. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
  141. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
  142. snowflake/ml/modeling/linear_model/perceptron.py +195 -123
  143. snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
  144. snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
  145. snowflake/ml/modeling/linear_model/ridge.py +195 -123
  146. snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
  147. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
  148. snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
  149. snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
  150. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
  151. snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
  152. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
  153. snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
  154. snowflake/ml/modeling/manifold/isomap.py +195 -123
  155. snowflake/ml/modeling/manifold/mds.py +195 -123
  156. snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
  157. snowflake/ml/modeling/manifold/tsne.py +195 -123
  158. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
  159. snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
  160. snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
  161. snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
  162. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
  163. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
  164. snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
  165. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
  166. snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
  167. snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
  168. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
  169. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
  170. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
  171. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
  172. snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
  173. snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
  174. snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
  175. snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
  176. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
  177. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
  178. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
  179. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
  180. snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
  181. snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
  182. snowflake/ml/modeling/pipeline/pipeline.py +4 -4
  183. snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
  184. snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
  185. snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
  186. snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
  187. snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
  188. snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
  189. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
  190. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
  191. snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
  192. snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
  193. snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
  194. snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
  195. snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
  196. snowflake/ml/modeling/svm/linear_svc.py +195 -123
  197. snowflake/ml/modeling/svm/linear_svr.py +195 -123
  198. snowflake/ml/modeling/svm/nu_svc.py +195 -123
  199. snowflake/ml/modeling/svm/nu_svr.py +195 -123
  200. snowflake/ml/modeling/svm/svc.py +195 -123
  201. snowflake/ml/modeling/svm/svr.py +195 -123
  202. snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
  203. snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
  204. snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
  205. snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
  206. snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
  207. snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
  208. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
  209. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
  210. snowflake/ml/registry/_manager/model_manager.py +5 -1
  211. snowflake/ml/registry/model_registry.py +99 -26
  212. snowflake/ml/registry/registry.py +3 -2
  213. snowflake/ml/version.py +1 -1
  214. {snowflake_ml_python-1.3.1.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +94 -55
  215. {snowflake_ml_python-1.3.1.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +218 -212
  216. snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
  217. {snowflake_ml_python-1.3.1.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
  218. {snowflake_ml_python-1.3.1.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
  219. {snowflake_ml_python-1.3.1.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -8,6 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 
 import cloudpickle as cp
 import numpy as np
+import numpy.typing as npt
 from sklearn import model_selection
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 
@@ -38,9 +39,11 @@ from snowflake.snowpark.types import IntegerType, StringType, StructField, StructType
 
 cp.register_pickle_by_value(inspect.getmodule(get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
+cp.register_pickle_by_value(inspect.getmodule(snowpark_dataframe_utils.cast_snowpark_dataframe))
 
 _PROJECT = "ModelDevelopment"
 DEFAULT_UDTF_NJOBS = 3
+ENABLE_EFFICIENT_MEMORY_USAGE = False
 
 
 def construct_cv_results(
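The newly registered module matters for the new HPO path: `register_pickle_by_value` makes cloudpickle embed the helper's code in the serialized payload instead of referencing it by import path, so the sproc/UDTF sandbox can unpickle it without having that module installed. A minimal sketch of the mechanism, using a hypothetical local stand-in helper so the snippet is self-contained (the real registration targets `snowpark_dataframe_utils.cast_snowpark_dataframe`):

```python
import inspect
import io

import cloudpickle as cp


def cast_helper(x: float) -> float:
    # Hypothetical stand-in for snowpark_dataframe_utils.cast_snowpark_dataframe.
    return float(x)


# Mirror the pattern above: mark the helper's module for pickle-by-value so its
# code ships inside the pickle instead of being re-imported at load time.
cp.register_pickle_by_value(inspect.getmodule(cast_helper))

buf = io.BytesIO()
cp.dump(cast_helper, buf)
buf.seek(0)
restored = cp.load(buf)
assert restored(1) == 1.0
```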
@@ -151,7 +154,63 @@ def construct_cv_results(
     return multimetric, estimator._format_results(param_grid, n_split, out)
 
 
+def construct_cv_results_new_implementation(
+    estimator: Union[GridSearchCV, RandomizedSearchCV],
+    n_split: int,
+    param_grid: List[Dict[str, Any]],
+    cv_results_raw_hex: List[Row],
+    cross_validator_indices_length: int,
+    parameter_grid_length: int,
+) -> Tuple[Any, Dict[str, Any]]:
+    """Construct the cross validation results from the UDTF output.
+
+    The output is a list of raw dictionaries generated by _fit_and_score, encoded as hex binary.
+    This function decodes the strings and then calls _format_results to stitch them back together,
+    matching the original sklearn result.
+
+    Args:
+        estimator (Union[GridSearchCV, RandomizedSearchCV]): The sklearn estimator object,
+            either GridSearchCV or RandomizedSearchCV.
+        n_split (int): The number of splits, as determined by build_cross_validator.get_n_splits(X, y, groups).
+        param_grid (List[Dict[str, Any]]): The list of candidates from the parameter grid or parameter sampler.
+        cv_results_raw_hex (List[Row]): The list of cv_results from each CV and parameter grid combination.
+            Because a UDxF can only return strings, and numpy arrays/masked arrays cannot be encoded as
+            JSON, each cv_result is encoded into a hex string.
+        cross_validator_indices_length (int): The length of the cross validator indices.
+        parameter_grid_length (int): The length of the parameter grid combination.
+
+    Raises:
+        ValueError: Retrieved empty cross validation results.
+        ValueError: Cross validator index length is 0.
+        ValueError: Parameter index length is 0.
+
+    Returns:
+        Tuple[Any, Dict[str, Any]]: first_test_score, cv_results_
+    """
+    # Filter corner cases: either the snowpark dataframe result is empty, or an index length is 0.
+    if len(cv_results_raw_hex) == 0:
+        raise ValueError(
+            "Retrieved empty cross validation results from snowpark. Please retry or contact snowflake support."
+        )
+    if cross_validator_indices_length == 0:
+        raise ValueError("Cross validator index length is 0. Was the CV iterator empty?")
+    if parameter_grid_length == 0:
+        raise ValueError("Parameter index length is 0. Were there no candidates?")
+
+    all_out = []
+
+    for each_cv_result_hex in cv_results_raw_hex:
+        # Convert the hex string back to cv_results_.
+        hex_str = bytes.fromhex(each_cv_result_hex[0])
+        with io.BytesIO(hex_str) as f_reload:
+            out = cp.load(f_reload)
+        all_out.extend(out)
+    first_test_score = all_out[0]["test_scores"]
+    return first_test_score, estimator._format_results(param_grid, n_split, all_out)
+
+
 cp.register_pickle_by_value(inspect.getmodule(construct_cv_results))
+cp.register_pickle_by_value(inspect.getmodule(construct_cv_results_new_implementation))
 
 
 class DistributedHPOTrainer(SnowparkModelTrainer):
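The decode loop above is the receiving half of a hex round-trip: each UDTF partition cloudpickles its list of `_fit_and_score` result dicts and returns the bytes as a hex string, because UDTFs can only return supported SQL types and the numpy/masked arrays inside the results are not JSON-serializable. A small self-contained sketch of both halves, with made-up result data:

```python
import io

import cloudpickle as cp

# Made-up stand-in for one partition's list of _fit_and_score result dicts.
fake_out = [{"test_scores": 0.92, "fit_time": 0.1, "score_time": 0.01}]

# UDTF side: pickle to bytes, then encode as a hex string for the CV_RESULTS column.
with io.BytesIO() as f:
    cp.dump(fake_out, f)
    f.seek(0)
    hex_payload = f.getvalue().hex()

# Driver side (construct_cv_results_new_implementation): decode and unpickle.
with io.BytesIO(bytes.fromhex(hex_payload)) as f_reload:
    restored = cp.load(f_reload)

assert restored[0]["test_scores"] == 0.92
```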
602
661
 
603
662
  return fit_estimator
604
663
 
664
+ def fit_search_snowpark_new_implementation(
665
+ self,
666
+ param_grid: Union[model_selection.ParameterGrid, model_selection.ParameterSampler],
667
+ dataset: DataFrame,
668
+ session: Session,
669
+ estimator: Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV],
670
+ dependencies: List[str],
671
+ udf_imports: List[str],
672
+ input_cols: List[str],
673
+ label_cols: Optional[List[str]],
674
+ sample_weight_col: Optional[str],
675
+ ) -> Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV]:
676
+ from itertools import product
677
+
678
+ import cachetools
679
+ from sklearn.base import clone, is_classifier
680
+ from sklearn.calibration import check_cv
681
+
682
+ # Create one stage for data and for estimators.
683
+ temp_stage_name = random_name_for_temp_object(TempObjectType.STAGE)
684
+ temp_stage_creation_query = f"CREATE OR REPLACE TEMP STAGE {temp_stage_name};"
685
+ session.sql(temp_stage_creation_query).collect()
686
+
687
+ # Stage data as parquet file
688
+ dataset = snowpark_dataframe_utils.cast_snowpark_dataframe(dataset)
689
+ dataset_file_name = "dataset"
690
+ remote_file_path = f"{temp_stage_name}/{dataset_file_name}.parquet"
691
+ dataset.write.copy_into_location( # type:ignore[call-overload]
692
+ remote_file_path, file_format_type="parquet", header=True, overwrite=True
693
+ )
694
+ imports = [f"@{row.name}" for row in session.sql(f"LIST @{temp_stage_name}/{dataset_file_name}").collect()]
695
+
696
+ # Create a temp file and dump the estimator to that file.
697
+ estimator_file_name = get_temp_file_path()
698
+ params_to_evaluate = list(param_grid)
699
+ n_candidates = len(params_to_evaluate)
700
+ _N_JOBS = estimator.n_jobs
701
+ _PRE_DISPATCH = estimator.pre_dispatch
702
+
703
+ with open(estimator_file_name, mode="w+b") as local_estimator_file_obj:
704
+ cp.dump(dict(estimator=estimator, param_grid=params_to_evaluate), local_estimator_file_obj)
705
+ stage_estimator_file_name = posixpath.join(temp_stage_name, os.path.basename(estimator_file_name))
706
+ sproc_statement_params = telemetry.get_function_usage_statement_params(
707
+ project=_PROJECT,
708
+ subproject=self._subproject,
709
+ function_name=telemetry.get_statement_params_full_func_name(
710
+ inspect.currentframe(), self.__class__.__name__
711
+ ),
712
+ api_calls=[sproc],
713
+ )
714
+ udtf_statement_params = telemetry.get_function_usage_statement_params(
715
+ project=_PROJECT,
716
+ subproject=self._subproject,
717
+ function_name=telemetry.get_statement_params_full_func_name(
718
+ inspect.currentframe(), self.__class__.__name__
719
+ ),
720
+ api_calls=[udtf],
721
+ custom_tags=dict([("hpo_udtf", True)]),
722
+ )
723
+
724
+ # Put locally serialized estimator on stage.
725
+ session.file.put(
726
+ estimator_file_name,
727
+ temp_stage_name,
728
+ auto_compress=False,
729
+ overwrite=True,
730
+ )
731
+ estimator_location = os.path.basename(estimator_file_name)
732
+ imports.append(f"@{temp_stage_name}/{estimator_location}")
733
+
734
+ search_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
735
+ random_udtf_name = random_name_for_temp_object(TempObjectType.FUNCTION)
736
+
737
+ required_deps = dependencies + [
738
+ "snowflake-snowpark-python<2",
739
+ "fastparquet<2023.11",
740
+ "pyarrow<14",
741
+ "cachetools<6",
742
+ ]
743
+
744
+ @sproc( # type: ignore[misc]
745
+ is_permanent=False,
746
+ name=search_sproc_name,
747
+ packages=required_deps, # type: ignore[arg-type]
748
+ replace=True,
749
+ session=session,
750
+ anonymous=True,
751
+ imports=imports, # type: ignore[arg-type]
752
+ statement_params=sproc_statement_params,
753
+ )
754
+ def _distributed_search(
755
+ session: Session,
756
+ imports: List[str],
757
+ stage_estimator_file_name: str,
758
+ input_cols: List[str],
759
+ label_cols: Optional[List[str]],
760
+ ) -> str:
761
+ import os
762
+ import time
763
+ from typing import Iterator
764
+
765
+ import cloudpickle as cp
766
+ import pandas as pd
767
+ import pyarrow.parquet as pq
768
+ from sklearn.metrics import check_scoring
769
+ from sklearn.metrics._scorer import _check_multimetric_scoring
770
+ from sklearn.utils.validation import _check_fit_params, indexable
771
+
772
+ # import packages in sproc
773
+ for import_name in udf_imports:
774
+ importlib.import_module(import_name)
775
+
776
+ # os.cpu_count() returns the number of logical CPUs in the system. Returns None if undetermined.
777
+ _NUM_CPUs = os.cpu_count() or 1
778
+
779
+ # load dataset
780
+ data_files = [
781
+ filename
782
+ for filename in os.listdir(sys._xoptions["snowflake_import_directory"])
783
+ if filename.startswith(dataset_file_name)
784
+ ]
785
+ partial_df = [
786
+ pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas()
787
+ for file_name in data_files
788
+ ]
789
+ df = pd.concat(partial_df, ignore_index=True)
790
+ df.columns = [identifier.get_inferred_name(col_) for col_ in df.columns]
791
+
792
+ X = df[input_cols]
793
+ y = df[label_cols].squeeze() if label_cols else None
794
+ DATA_LENGTH = len(df)
795
+ fit_params = {}
796
+ if sample_weight_col:
797
+ fit_params["sample_weight"] = df[sample_weight_col].squeeze()
798
+
799
+ local_estimator_file_folder_name = get_temp_file_path()
800
+ session.file.get(stage_estimator_file_name, local_estimator_file_folder_name)
801
+
802
+ local_estimator_file_path = os.path.join(
803
+ local_estimator_file_folder_name, os.listdir(local_estimator_file_folder_name)[0]
804
+ )
805
+ with open(local_estimator_file_path, mode="r+b") as local_estimator_file_obj:
806
+ estimator = cp.load(local_estimator_file_obj)["estimator"]
807
+
808
+ # preprocess the attributes - (1) scorer
809
+ refit_metric = "score"
810
+ if callable(estimator.scoring):
811
+ scorers = estimator.scoring
812
+ elif estimator.scoring is None or isinstance(estimator.scoring, str):
813
+ scorers = check_scoring(estimator.estimator, estimator.scoring)
814
+ else:
815
+ scorers = _check_multimetric_scoring(estimator.estimator, estimator.scoring)
816
+ estimator._check_refit_for_multimetric(scorers)
817
+ refit_metric = estimator.refit
818
+
819
+ # preprocess the attributes - (2) check fit_params
820
+ groups = None
821
+ X, y, _ = indexable(X, y, groups)
822
+ fit_params = _check_fit_params(X, fit_params)
823
+
824
+ # preprocess the attributes - (3) safe clone base estimator
825
+ base_estimator = clone(estimator.estimator)
826
+
827
+ # preprocess the attributes - (4) check cv
828
+ build_cross_validator = check_cv(estimator.cv, y, classifier=is_classifier(estimator.estimator))
829
+ n_splits = build_cross_validator.get_n_splits(X, y, groups)
830
+
831
+ # preprocess the attributes - (5) generate fit_and_score_kwargs
832
+ fit_and_score_kwargs = dict(
833
+ scorer=scorers,
834
+ fit_params=fit_params,
835
+ return_train_score=estimator.return_train_score,
836
+ return_n_test_samples=True,
837
+ return_times=True,
838
+ return_parameters=False,
839
+ error_score=estimator.error_score,
840
+ verbose=estimator.verbose,
841
+ )
842
+
843
+ # (1) store the cross_validator's test indices only to save space
844
+ cross_validator_indices = [test for _, test in build_cross_validator.split(X, y, None)]
845
+ local_indices_file_name = get_temp_file_path()
846
+ with open(local_indices_file_name, mode="w+b") as local_indices_file_obj:
847
+ cp.dump(cross_validator_indices, local_indices_file_obj)
848
+
849
+ # Put locally serialized indices on stage.
850
+ session.file.put(
851
+ local_indices_file_name,
852
+ temp_stage_name,
853
+ auto_compress=False,
854
+ overwrite=True,
855
+ )
856
+ indices_location = os.path.basename(local_indices_file_name)
857
+ imports.append(f"@{temp_stage_name}/{indices_location}")
858
+
859
+ # (2) store the base estimator
860
+ local_base_estimator_file_name = get_temp_file_path()
861
+ with open(local_base_estimator_file_name, mode="w+b") as local_base_estimator_file_obj:
862
+ cp.dump(base_estimator, local_base_estimator_file_obj)
863
+ session.file.put(
864
+ local_base_estimator_file_name,
865
+ temp_stage_name,
866
+ auto_compress=False,
867
+ overwrite=True,
868
+ )
869
+ base_estimator_location = os.path.basename(local_base_estimator_file_name)
870
+ imports.append(f"@{temp_stage_name}/{base_estimator_location}")
871
+
872
+ # (3) store the fit_and_score_kwargs
873
+ local_fit_and_score_kwargs_file_name = get_temp_file_path()
874
+ with open(local_fit_and_score_kwargs_file_name, mode="w+b") as local_fit_and_score_kwargs_file_obj:
875
+ cp.dump(fit_and_score_kwargs, local_fit_and_score_kwargs_file_obj)
876
+ session.file.put(
877
+ local_fit_and_score_kwargs_file_name,
878
+ temp_stage_name,
879
+ auto_compress=False,
880
+ overwrite=True,
881
+ )
882
+ fit_and_score_kwargs_location = os.path.basename(local_fit_and_score_kwargs_file_name)
883
+ imports.append(f"@{temp_stage_name}/{fit_and_score_kwargs_location}")
884
+
885
+ cross_validator_indices_length = int(len(cross_validator_indices))
886
+ parameter_grid_length = len(param_grid)
887
+
888
+ assert estimator is not None
889
+
890
+ @cachetools.cached(cache={})
891
+ def _load_data_into_udf() -> Tuple[
892
+ npt.NDArray[Any],
893
+ npt.NDArray[Any],
894
+ List[List[int]],
895
+ List[Dict[str, Any]],
896
+ object,
897
+ Dict[str, Any],
898
+ ]:
899
+ import pyarrow.parquet as pq
900
+
901
+ data_files = [
902
+ filename
903
+ for filename in os.listdir(sys._xoptions["snowflake_import_directory"])
904
+ if filename.startswith(dataset_file_name)
905
+ ]
906
+ partial_df = [
907
+ pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas()
908
+ for file_name in data_files
909
+ ]
910
+ df = pd.concat(partial_df, ignore_index=True)
911
+ df.columns = [identifier.get_inferred_name(col_) for col_ in df.columns]
912
+
913
+ # load parameter grid
914
+ local_estimator_file_path = os.path.join(
915
+ sys._xoptions["snowflake_import_directory"], f"{estimator_location}"
916
+ )
917
+ with open(local_estimator_file_path, mode="rb") as local_estimator_file_obj:
918
+ estimator_objects = cp.load(local_estimator_file_obj)
919
+ params_to_evaluate = estimator_objects["param_grid"]
920
+
921
+ # load indices
922
+ local_indices_file_path = os.path.join(
923
+ sys._xoptions["snowflake_import_directory"], f"{indices_location}"
924
+ )
925
+ with open(local_indices_file_path, mode="rb") as local_indices_file_obj:
926
+ indices = cp.load(local_indices_file_obj)
927
+
928
+ # load base estimator
929
+ local_base_estimator_file_path = os.path.join(
930
+ sys._xoptions["snowflake_import_directory"], f"{base_estimator_location}"
931
+ )
932
+ with open(local_base_estimator_file_path, mode="rb") as local_base_estimator_file_obj:
933
+ base_estimator = cp.load(local_base_estimator_file_obj)
934
+
935
+ # load fit_and_score_kwargs
936
+ local_fit_and_score_kwargs_file_path = os.path.join(
937
+ sys._xoptions["snowflake_import_directory"], f"{fit_and_score_kwargs_location}"
938
+ )
939
+ with open(local_fit_and_score_kwargs_file_path, mode="rb") as local_fit_and_score_kwargs_file_obj:
940
+ fit_and_score_kwargs = cp.load(local_fit_and_score_kwargs_file_obj)
941
+
942
+ # convert dataframe to numpy would save memory consumption
943
+ return (
944
+ df[input_cols].to_numpy(),
945
+ df[label_cols].squeeze().to_numpy(),
946
+ indices,
947
+ params_to_evaluate,
948
+ base_estimator,
949
+ fit_and_score_kwargs,
950
+ )
951
+
952
+ # Note Table functions (UDTFs) have a limit of 500 input arguments and 500 output columns.
953
+ class SearchCV:
954
+ def __init__(self) -> None:
955
+ X, y, indices, params_to_evaluate, base_estimator, fit_and_score_kwargs = _load_data_into_udf()
956
+ self.X = X
957
+ self.y = y
958
+ self.test_indices = indices
959
+ self.params_to_evaluate = params_to_evaluate
960
+ self.base_estimator = base_estimator
961
+ self.fit_and_score_kwargs = fit_and_score_kwargs
962
+ self.fit_score_params: List[Any] = []
963
+ self.cached_train_test_indices = []
964
+ # Calculate the full index here to avoid duplicate calculation (which consumes a lot of memory)
965
+ full_index = np.arange(DATA_LENGTH)
966
+ for i in range(n_splits):
967
+ self.cached_train_test_indices.extend(
968
+ [[np.setdiff1d(full_index, self.test_indices[i]), self.test_indices[i]]]
969
+ )
970
+
971
+ def process(self, idx: int, params_idx: int, cv_idx: int) -> None:
972
+ self.fit_score_params.extend([[idx, params_idx, cv_idx]])
973
+
974
+ def end_partition(self) -> Iterator[Tuple[int, str]]:
975
+ from sklearn.base import clone
976
+ from sklearn.model_selection._validation import _fit_and_score
977
+ from sklearn.utils.parallel import Parallel, delayed
978
+
979
+ parallel = Parallel(n_jobs=_N_JOBS, pre_dispatch=_PRE_DISPATCH)
980
+
981
+ out = parallel(
982
+ delayed(_fit_and_score)(
983
+ clone(self.base_estimator),
984
+ self.X,
985
+ self.y,
986
+ train=self.cached_train_test_indices[split_idx][0],
987
+ test=self.cached_train_test_indices[split_idx][1],
988
+ parameters=self.params_to_evaluate[cand_idx],
989
+ split_progress=(split_idx, n_splits),
990
+ candidate_progress=(cand_idx, n_candidates),
991
+ **self.fit_and_score_kwargs, # load sample weight here
992
+ )
993
+ for _, cand_idx, split_idx in self.fit_score_params
994
+ )
995
+
996
+ binary_cv_results = None
997
+ with io.BytesIO() as f:
998
+ cp.dump(out, f)
999
+ f.seek(0)
1000
+ binary_cv_results = f.getvalue().hex()
1001
+ yield (
1002
+ self.fit_score_params[0][0],
1003
+ binary_cv_results,
1004
+ )
1005
+
1006
+ session.udtf.register(
1007
+ SearchCV,
1008
+ output_schema=StructType([StructField("IDX", IntegerType()), StructField("CV_RESULTS", StringType())]),
1009
+ input_types=[IntegerType(), IntegerType(), IntegerType()],
1010
+ name=random_udtf_name,
1011
+ packages=required_deps, # type: ignore[arg-type]
1012
+ replace=True,
1013
+ is_permanent=False,
1014
+ imports=imports, # type: ignore[arg-type]
1015
+ statement_params=udtf_statement_params,
1016
+ )
1017
+
1018
+ HP_TUNING = F.table_function(random_udtf_name)
1019
+
1020
+ # param_indices is for the index for each parameter grid;
1021
+ # cv_indices is for the index for each cross_validator's fold;
1022
+ # param_cv_indices is for the index for the product of (len(param_indices) * len(cv_indices))
1023
+ param_indices, cv_indices = zip(
1024
+ *product(range(parameter_grid_length), range(cross_validator_indices_length))
1025
+ )
1026
+
1027
+ indices_info_pandas = pd.DataFrame(
1028
+ {
1029
+ "IDX": [i // _NUM_CPUs for i in range(parameter_grid_length * cross_validator_indices_length)],
1030
+ "PARAM_IND": param_indices,
1031
+ "CV_IND": cv_indices,
1032
+ }
1033
+ )
1034
+
1035
+ indices_info_sp = session.create_dataframe(indices_info_pandas)
1036
+ # execute udtf by querying HP_TUNING table
1037
+ HP_raw_results = indices_info_sp.select(
1038
+ (
1039
+ HP_TUNING(indices_info_sp["IDX"], indices_info_sp["PARAM_IND"], indices_info_sp["CV_IND"]).over(
1040
+ partition_by="IDX"
1041
+ )
1042
+ ),
1043
+ )
1044
+
1045
+ first_test_score, cv_results_ = construct_cv_results_new_implementation(
1046
+ estimator,
1047
+ n_splits,
1048
+ list(param_grid),
1049
+ HP_raw_results.select("CV_RESULTS").sort(F.col("IDX")).collect(),
1050
+ cross_validator_indices_length,
1051
+ parameter_grid_length,
1052
+ )
1053
+
1054
+ estimator.cv_results_ = cv_results_
1055
+ estimator.multimetric_ = isinstance(first_test_score, dict)
1056
+
1057
+ # check refit_metric now for a callable scorer that is multimetric
1058
+ if callable(estimator.scoring) and estimator.multimetric_:
1059
+ estimator._check_refit_for_multimetric(first_test_score)
1060
+ refit_metric = estimator.refit
1061
+
1062
+ # For multi-metric evaluation, store the best_index_, best_params_ and
1063
+ # best_score_ iff refit is one of the scorer names
1064
+ # In single metric evaluation, refit_metric is "score"
1065
+ if estimator.refit or not estimator.multimetric_:
1066
+ estimator.best_index_ = estimator._select_best_index(estimator.refit, refit_metric, cv_results_)
1067
+ if not callable(estimator.refit):
1068
+ # With a non-custom callable, we can select the best score
1069
+ # based on the best index
1070
+ estimator.best_score_ = cv_results_[f"mean_test_{refit_metric}"][estimator.best_index_]
1071
+ estimator.best_params_ = cv_results_["params"][estimator.best_index_]
1072
+
1073
+ if estimator.refit:
1074
+ estimator.best_estimator_ = clone(base_estimator).set_params(
1075
+ **clone(estimator.best_params_, safe=False)
1076
+ )
1077
+
1078
+ # Let the sproc use all cores to refit.
1079
+ estimator.n_jobs = estimator.n_jobs or -1
1080
+
1081
+ # process the input as args
1082
+ argspec = inspect.getfullargspec(estimator.fit)
1083
+ args = {"X": X}
1084
+ if label_cols:
1085
+ label_arg_name = "Y" if "Y" in argspec.args else "y"
1086
+ args[label_arg_name] = y
1087
+ if sample_weight_col is not None and "sample_weight" in argspec.args:
1088
+ args["sample_weight"] = df[sample_weight_col].squeeze()
1089
+ # estimator.refit = original_refit
1090
+ refit_start_time = time.time()
1091
+ estimator.best_estimator_.fit(**args)
1092
+ refit_end_time = time.time()
1093
+ estimator.refit_time_ = refit_end_time - refit_start_time
1094
+
1095
+ if hasattr(estimator.best_estimator_, "feature_names_in_"):
1096
+ estimator.feature_names_in_ = estimator.best_estimator_.feature_names_in_
1097
+
1098
+ # Store the only scorer not as a dict for single metric evaluation
1099
+ estimator.scorer_ = scorers
1100
+ estimator.n_splits_ = n_splits
1101
+
1102
+ local_result_file_name = get_temp_file_path()
1103
+
1104
+ with open(local_result_file_name, mode="w+b") as local_result_file_obj:
1105
+ cp.dump(estimator, local_result_file_obj)
1106
+
1107
+ session.file.put(
1108
+ local_result_file_name,
1109
+ temp_stage_name,
1110
+ auto_compress=False,
1111
+ overwrite=True,
1112
+ )
1113
+
1114
+ return str(os.path.basename(local_result_file_name))
1115
+
1116
+ sproc_export_file_name = _distributed_search(
1117
+ session,
1118
+ imports,
1119
+ stage_estimator_file_name,
1120
+ input_cols,
1121
+ label_cols,
1122
+ )
1123
+
1124
+ local_estimator_path = get_temp_file_path()
1125
+ session.file.get(
1126
+ posixpath.join(temp_stage_name, sproc_export_file_name),
1127
+ local_estimator_path,
1128
+ )
1129
+
1130
+ with open(os.path.join(local_estimator_path, sproc_export_file_name), mode="r+b") as result_file_obj:
1131
+ fit_estimator = cp.load(result_file_obj)
1132
+
1133
+ cleanup_temp_files(local_estimator_path)
1134
+
1135
+ return fit_estimator
1136
+
605
1137
  def train(self) -> object:
606
1138
  """
607
1139
  Runs hyper parameter optimization by distributing the tasks across warehouse.
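Two space-saving ideas in the body above are easy to miss: only each fold's test indices are serialized (train indices are re-derived as the complement of the full index), and each (parameter, fold) combination becomes one UDTF input row, with `IDX` grouping roughly `_NUM_CPUs` combinations per partition so one partition's `Parallel` call can keep every core busy. An illustrative sketch with assumed sizes (none of these names are the library's API):

```python
from itertools import product

import numpy as np

data_length = 10                                          # stand-in for DATA_LENGTH
test_folds = [np.array([0, 1, 2]), np.array([3, 4, 5])]   # only test indices are shipped
full_index = np.arange(data_length)
# Re-derive each fold's train indices as the complement of its test indices.
train_test = [[np.setdiff1d(full_index, test), test] for test in test_folds]

n_candidates, n_folds, num_cpus = 4, len(test_folds), 8   # assumed sizes
param_ind, cv_ind = zip(*product(range(n_candidates), range(n_folds)))
idx = [i // num_cpus for i in range(n_candidates * n_folds)]
# Rows sharing an IDX value are processed by the same UDTF partition.
print(list(zip(idx, param_ind, cv_ind)))
```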
@@ -630,6 +1162,19 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         relaxed_dependencies = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
             pkg_versions=model_spec.pkgDependencies, session=self.session
         )
+        if ENABLE_EFFICIENT_MEMORY_USAGE:
+            return self.fit_search_snowpark_new_implementation(
+                param_grid=param_grid,
+                dataset=self.dataset,
+                session=self.session,
+                estimator=self.estimator,
+                dependencies=relaxed_dependencies,
+                udf_imports=["sklearn"],
+                input_cols=self.input_cols,
+                label_cols=self.label_cols,
+                sample_weight_col=self.sample_weight_col,
+            )
+
         return self.fit_search_snowpark(
             param_grid=param_grid,
             dataset=self.dataset,
@@ -131,9 +131,12 @@ class SnowparkTransformHandlers:
 
         input_df.columns = snowpark_cols
 
+        if hasattr(estimator, "n_jobs"):
+            # Vectorized UDFs cannot handle joblib multiprocessing right now, so deactivate n_jobs.
+            estimator.n_jobs = 1
         inference_res = getattr(estimator, inference_method)(input_df, *args, **kwargs)
 
-        transformed_numpy_array, output_cols = handle_inference_result(
+        transformed_numpy_array, _ = handle_inference_result(
             inference_res=inference_res,
             output_cols=expected_output_cols,
             inference_method=inference_method,
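A quick illustration of the `hasattr` guard added above, using two stock sklearn estimators (the rationale is the diff's own comment: vectorized UDFs currently cannot host joblib multiprocessing, so parallelism is disabled before inference):

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

estimators = [RandomForestClassifier(n_jobs=4), DecisionTreeClassifier()]
for est in estimators:
    if hasattr(est, "n_jobs"):
        # Force single-process inference; estimators without n_jobs pass through untouched.
        est.n_jobs = 1

print([getattr(est, "n_jobs", None) for est in estimators])  # [1, None]
```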
@@ -141,13 +144,13 @@
         )
 
         if len(transformed_numpy_array.shape) > 1:
-            if transformed_numpy_array.shape[1] != len(output_cols):
+            if transformed_numpy_array.shape[1] != len(expected_output_cols):
                 series = pd.Series(transformed_numpy_array.tolist())
-                transformed_pandas_df = pd.DataFrame(series, columns=output_cols)
+                transformed_pandas_df = pd.DataFrame(series, columns=expected_output_cols)
             else:
-                transformed_pandas_df = pd.DataFrame(transformed_numpy_array.tolist(), columns=output_cols)
+                transformed_pandas_df = pd.DataFrame(transformed_numpy_array.tolist(), columns=expected_output_cols)
         else:
-            transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=output_cols)
+            transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=expected_output_cols)
 
         return transformed_pandas_df.to_dict("records")  # type: ignore[no-any-return]
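The fix above keys every branch on the caller-supplied `expected_output_cols` instead of the locally inferred `output_cols`. A minimal sketch of the branch it touches, with made-up data: when the 2-D inference result is wider than the expected column list, each row is packed into a single list-valued column:

```python
import numpy as np
import pandas as pd

expected_output_cols = ["OUT"]
transformed_numpy_array = np.array([[0.1, 0.9], [0.8, 0.2]])  # e.g. predict_proba output

if len(transformed_numpy_array.shape) > 1:
    if transformed_numpy_array.shape[1] != len(expected_output_cols):
        # Width mismatch: pack each row into one list-valued cell.
        series = pd.Series(transformed_numpy_array.tolist())
        transformed_pandas_df = pd.DataFrame(series, columns=expected_output_cols)
    else:
        transformed_pandas_df = pd.DataFrame(transformed_numpy_array.tolist(), columns=expected_output_cols)
else:
    transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=expected_output_cols)

print(transformed_pandas_df.to_dict("records"))  # [{'OUT': [0.1, 0.9]}, {'OUT': [0.8, 0.2]}]
```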