PyPI - snowflake-ml-python - Versions diffs - 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl - Mend

snowflake-ml-python 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (225) hide show

snowflake/cortex/_complete.py +1 -1
snowflake/cortex/_extract_answer.py +1 -1
snowflake/cortex/_sentiment.py +1 -1
snowflake/cortex/_summarize.py +1 -1
snowflake/cortex/_translate.py +1 -1
snowflake/ml/_internal/env_utils.py +68 -6
snowflake/ml/_internal/file_utils.py +34 -4
snowflake/ml/_internal/telemetry.py +79 -91
snowflake/ml/_internal/utils/identifier.py +78 -72
snowflake/ml/_internal/utils/retryable_http.py +16 -4
snowflake/ml/_internal/utils/spcs_attribution_utils.py +122 -0
snowflake/ml/dataset/dataset.py +1 -1
snowflake/ml/model/_api.py +21 -14
snowflake/ml/model/_client/model/model_impl.py +176 -0
snowflake/ml/model/_client/model/model_method_info.py +19 -0
snowflake/ml/model/_client/model/model_version_impl.py +291 -0
snowflake/ml/model/_client/ops/metadata_ops.py +107 -0
snowflake/ml/model/_client/ops/model_ops.py +308 -0
snowflake/ml/model/_client/sql/model.py +75 -0
snowflake/ml/model/_client/sql/model_version.py +213 -0
snowflake/ml/model/_client/sql/stage.py +40 -0
snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -4
snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template +24 -8
snowflake/ml/model/_deploy_client/image_builds/templates/kaniko_shell_script_template +23 -0
snowflake/ml/model/_deploy_client/snowservice/deploy.py +14 -2
snowflake/ml/model/_deploy_client/utils/constants.py +1 -0
snowflake/ml/model/_deploy_client/warehouse/deploy.py +2 -2
snowflake/ml/model/_model_composer/model_composer.py +31 -9
snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +25 -10
snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +2 -2
snowflake/ml/model/_model_composer/model_method/infer_function.py_template +2 -1
snowflake/ml/model/_model_composer/model_method/model_method.py +34 -3
snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +1 -1
snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +3 -1
snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +10 -28
snowflake/ml/model/_packager/model_meta/model_meta.py +18 -16
snowflake/ml/model/_signatures/snowpark_handler.py +1 -1
snowflake/ml/model/model_signature.py +108 -53
snowflake/ml/model/type_hints.py +1 -0
snowflake/ml/modeling/_internal/distributed_hpo_trainer.py +554 -0
snowflake/ml/modeling/_internal/estimator_protocols.py +1 -60
snowflake/ml/modeling/_internal/model_specifications.py +146 -0
snowflake/ml/modeling/_internal/model_trainer.py +13 -0
snowflake/ml/modeling/_internal/model_trainer_builder.py +78 -0
snowflake/ml/modeling/_internal/pandas_trainer.py +54 -0
snowflake/ml/modeling/_internal/snowpark_handlers.py +6 -760
snowflake/ml/modeling/_internal/snowpark_trainer.py +331 -0
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +108 -135
snowflake/ml/modeling/cluster/affinity_propagation.py +106 -135
snowflake/ml/modeling/cluster/agglomerative_clustering.py +106 -135
snowflake/ml/modeling/cluster/birch.py +106 -135
snowflake/ml/modeling/cluster/bisecting_k_means.py +106 -135
snowflake/ml/modeling/cluster/dbscan.py +106 -135
snowflake/ml/modeling/cluster/feature_agglomeration.py +106 -135
snowflake/ml/modeling/cluster/k_means.py +105 -135
snowflake/ml/modeling/cluster/mean_shift.py +106 -135
snowflake/ml/modeling/cluster/mini_batch_k_means.py +105 -135
snowflake/ml/modeling/cluster/optics.py +106 -135
snowflake/ml/modeling/cluster/spectral_biclustering.py +106 -135
snowflake/ml/modeling/cluster/spectral_clustering.py +106 -135
snowflake/ml/modeling/cluster/spectral_coclustering.py +106 -135
snowflake/ml/modeling/compose/column_transformer.py +106 -135
snowflake/ml/modeling/compose/transformed_target_regressor.py +108 -135
snowflake/ml/modeling/covariance/elliptic_envelope.py +106 -135
snowflake/ml/modeling/covariance/empirical_covariance.py +99 -128
snowflake/ml/modeling/covariance/graphical_lasso.py +106 -135
snowflake/ml/modeling/covariance/graphical_lasso_cv.py +106 -135
snowflake/ml/modeling/covariance/ledoit_wolf.py +104 -133
snowflake/ml/modeling/covariance/min_cov_det.py +106 -135
snowflake/ml/modeling/covariance/oas.py +99 -128
snowflake/ml/modeling/covariance/shrunk_covariance.py +103 -132
snowflake/ml/modeling/decomposition/dictionary_learning.py +106 -135
snowflake/ml/modeling/decomposition/factor_analysis.py +106 -135
snowflake/ml/modeling/decomposition/fast_ica.py +106 -135
snowflake/ml/modeling/decomposition/incremental_pca.py +106 -135
snowflake/ml/modeling/decomposition/kernel_pca.py +106 -135
snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +106 -135
snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +106 -135
snowflake/ml/modeling/decomposition/pca.py +106 -135
snowflake/ml/modeling/decomposition/sparse_pca.py +106 -135
snowflake/ml/modeling/decomposition/truncated_svd.py +106 -135
snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +108 -135
snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +108 -135
snowflake/ml/modeling/ensemble/ada_boost_classifier.py +108 -135
snowflake/ml/modeling/ensemble/ada_boost_regressor.py +108 -135
snowflake/ml/modeling/ensemble/bagging_classifier.py +108 -135
snowflake/ml/modeling/ensemble/bagging_regressor.py +108 -135
snowflake/ml/modeling/ensemble/extra_trees_classifier.py +108 -135
snowflake/ml/modeling/ensemble/extra_trees_regressor.py +108 -135
snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +108 -135
snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +108 -135
snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +108 -135
snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +108 -135
snowflake/ml/modeling/ensemble/isolation_forest.py +106 -135
snowflake/ml/modeling/ensemble/random_forest_classifier.py +108 -135
snowflake/ml/modeling/ensemble/random_forest_regressor.py +108 -135
snowflake/ml/modeling/ensemble/stacking_regressor.py +108 -135
snowflake/ml/modeling/ensemble/voting_classifier.py +108 -135
snowflake/ml/modeling/ensemble/voting_regressor.py +108 -135
snowflake/ml/modeling/feature_selection/generic_univariate_select.py +101 -128
snowflake/ml/modeling/feature_selection/select_fdr.py +99 -126
snowflake/ml/modeling/feature_selection/select_fpr.py +99 -126
snowflake/ml/modeling/feature_selection/select_fwe.py +99 -126
snowflake/ml/modeling/feature_selection/select_k_best.py +100 -127
snowflake/ml/modeling/feature_selection/select_percentile.py +99 -126
snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +106 -135
snowflake/ml/modeling/feature_selection/variance_threshold.py +95 -124
snowflake/ml/modeling/framework/base.py +83 -1
snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +108 -135
snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +108 -135
snowflake/ml/modeling/impute/iterative_imputer.py +106 -135
snowflake/ml/modeling/impute/knn_imputer.py +106 -135
snowflake/ml/modeling/impute/missing_indicator.py +106 -135
snowflake/ml/modeling/impute/simple_imputer.py +9 -1
snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +96 -125
snowflake/ml/modeling/kernel_approximation/nystroem.py +106 -135
snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +106 -135
snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +105 -134
snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +103 -132
snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +108 -135
snowflake/ml/modeling/lightgbm/lgbm_classifier.py +90 -118
snowflake/ml/modeling/lightgbm/lgbm_regressor.py +90 -118
snowflake/ml/modeling/linear_model/ard_regression.py +108 -135
snowflake/ml/modeling/linear_model/bayesian_ridge.py +108 -135
snowflake/ml/modeling/linear_model/elastic_net.py +108 -135
snowflake/ml/modeling/linear_model/elastic_net_cv.py +108 -135
snowflake/ml/modeling/linear_model/gamma_regressor.py +108 -135
snowflake/ml/modeling/linear_model/huber_regressor.py +108 -135
snowflake/ml/modeling/linear_model/lars.py +108 -135
snowflake/ml/modeling/linear_model/lars_cv.py +108 -135
snowflake/ml/modeling/linear_model/lasso.py +108 -135
snowflake/ml/modeling/linear_model/lasso_cv.py +108 -135
snowflake/ml/modeling/linear_model/lasso_lars.py +108 -135
snowflake/ml/modeling/linear_model/lasso_lars_cv.py +108 -135
snowflake/ml/modeling/linear_model/lasso_lars_ic.py +108 -135
snowflake/ml/modeling/linear_model/linear_regression.py +108 -135
snowflake/ml/modeling/linear_model/logistic_regression.py +108 -135
snowflake/ml/modeling/linear_model/logistic_regression_cv.py +108 -135
snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +108 -135
snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +108 -135
snowflake/ml/modeling/linear_model/multi_task_lasso.py +108 -135
snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +108 -135
snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +108 -135
snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +108 -135
snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +107 -135
snowflake/ml/modeling/linear_model/perceptron.py +107 -135
snowflake/ml/modeling/linear_model/poisson_regressor.py +108 -135
snowflake/ml/modeling/linear_model/ransac_regressor.py +108 -135
snowflake/ml/modeling/linear_model/ridge.py +108 -135
snowflake/ml/modeling/linear_model/ridge_classifier.py +108 -135
snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +108 -135
snowflake/ml/modeling/linear_model/ridge_cv.py +108 -135
snowflake/ml/modeling/linear_model/sgd_classifier.py +108 -135
snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +106 -135
snowflake/ml/modeling/linear_model/sgd_regressor.py +108 -135
snowflake/ml/modeling/linear_model/theil_sen_regressor.py +108 -135
snowflake/ml/modeling/linear_model/tweedie_regressor.py +108 -135
snowflake/ml/modeling/manifold/isomap.py +106 -135
snowflake/ml/modeling/manifold/mds.py +106 -135
snowflake/ml/modeling/manifold/spectral_embedding.py +106 -135
snowflake/ml/modeling/manifold/tsne.py +106 -135
snowflake/ml/modeling/metrics/classification.py +196 -55
snowflake/ml/modeling/metrics/correlation.py +4 -2
snowflake/ml/modeling/metrics/covariance.py +7 -4
snowflake/ml/modeling/metrics/ranking.py +32 -16
snowflake/ml/modeling/metrics/regression.py +60 -32
snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +106 -135
snowflake/ml/modeling/mixture/gaussian_mixture.py +106 -135
snowflake/ml/modeling/model_selection/grid_search_cv.py +91 -148
snowflake/ml/modeling/model_selection/randomized_search_cv.py +93 -154
snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +105 -132
snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +108 -135
snowflake/ml/modeling/multiclass/output_code_classifier.py +108 -135
snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +108 -135
snowflake/ml/modeling/naive_bayes/categorical_nb.py +108 -135
snowflake/ml/modeling/naive_bayes/complement_nb.py +108 -135
snowflake/ml/modeling/naive_bayes/gaussian_nb.py +98 -125
snowflake/ml/modeling/naive_bayes/multinomial_nb.py +107 -134
snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +108 -135
snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +108 -135
snowflake/ml/modeling/neighbors/kernel_density.py +106 -135
snowflake/ml/modeling/neighbors/local_outlier_factor.py +106 -135
snowflake/ml/modeling/neighbors/nearest_centroid.py +108 -135
snowflake/ml/modeling/neighbors/nearest_neighbors.py +106 -135
snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +108 -135
snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +108 -135
snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +108 -135
snowflake/ml/modeling/neural_network/bernoulli_rbm.py +106 -135
snowflake/ml/modeling/neural_network/mlp_classifier.py +108 -135
snowflake/ml/modeling/neural_network/mlp_regressor.py +108 -135
snowflake/ml/modeling/parameters/disable_distributed_hpo.py +2 -6
snowflake/ml/modeling/preprocessing/binarizer.py +25 -8
snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +9 -4
snowflake/ml/modeling/preprocessing/label_encoder.py +31 -11
snowflake/ml/modeling/preprocessing/max_abs_scaler.py +27 -9
snowflake/ml/modeling/preprocessing/min_max_scaler.py +42 -14
snowflake/ml/modeling/preprocessing/normalizer.py +9 -4
snowflake/ml/modeling/preprocessing/one_hot_encoder.py +26 -10
snowflake/ml/modeling/preprocessing/ordinal_encoder.py +37 -13
snowflake/ml/modeling/preprocessing/polynomial_features.py +106 -135
snowflake/ml/modeling/preprocessing/robust_scaler.py +39 -13
snowflake/ml/modeling/preprocessing/standard_scaler.py +36 -12
snowflake/ml/modeling/semi_supervised/label_propagation.py +108 -135
snowflake/ml/modeling/semi_supervised/label_spreading.py +108 -135
snowflake/ml/modeling/svm/linear_svc.py +108 -135
snowflake/ml/modeling/svm/linear_svr.py +108 -135
snowflake/ml/modeling/svm/nu_svc.py +108 -135
snowflake/ml/modeling/svm/nu_svr.py +108 -135
snowflake/ml/modeling/svm/svc.py +108 -135
snowflake/ml/modeling/svm/svr.py +108 -135
snowflake/ml/modeling/tree/decision_tree_classifier.py +108 -135
snowflake/ml/modeling/tree/decision_tree_regressor.py +108 -135
snowflake/ml/modeling/tree/extra_tree_classifier.py +108 -135
snowflake/ml/modeling/tree/extra_tree_regressor.py +108 -135
snowflake/ml/modeling/xgboost/xgb_classifier.py +108 -136
snowflake/ml/modeling/xgboost/xgb_regressor.py +108 -136
snowflake/ml/modeling/xgboost/xgbrf_classifier.py +108 -136
snowflake/ml/modeling/xgboost/xgbrf_regressor.py +108 -136
snowflake/ml/registry/model_registry.py +2 -0
snowflake/ml/registry/registry.py +215 -0
snowflake/ml/version.py +1 -1
{snowflake_ml_python-1.1.0.dist-info → snowflake_ml_python-1.1.2.dist-info}/METADATA +34 -1
snowflake_ml_python-1.1.2.dist-info/RECORD +347 -0
snowflake_ml_python-1.1.0.dist-info/RECORD +0 -331
{snowflake_ml_python-1.1.0.dist-info → snowflake_ml_python-1.1.2.dist-info}/WHEEL +0 -0

snowflake/ml/modeling/_internal/distributed_hpo_trainer.py ADDED Viewed

@@ -0,0 +1,554 @@
+import importlib
+import inspect
+import io
+import os
+import posixpath
+import sys
+from typing import Any, Dict, List, Optional, Tuple, Union
+import cloudpickle as cp
+import numpy as np
+from scipy.stats import rankdata
+from sklearn import model_selection
+from snowflake.ml._internal import telemetry
+from snowflake.ml._internal.utils import identifier, snowpark_dataframe_utils
+from snowflake.ml._internal.utils.temp_file_utils import (
+    cleanup_temp_files,
+    get_temp_file_path,
+)
+from snowflake.ml.modeling._internal.model_specifications import (
+    ModelSpecificationsBuilder,
+)
+from snowflake.ml.modeling._internal.snowpark_trainer import SnowparkModelTrainer
+from snowflake.snowpark import DataFrame, Session, functions as F
+from snowflake.snowpark._internal.utils import (
+    TempObjectType,
+    random_name_for_temp_object,
+)
+from snowflake.snowpark.functions import col, sproc, udtf
+from snowflake.snowpark.types import IntegerType, StringType, StructField, StructType
+# Register the modules that define get_temp_file_path and identifier.get_inferred_name for
+# pickle-by-value with cloudpickle.
+# NOTE(review): presumably so the stored procedure / UDTF closures defined below can be
+# deserialized where these modules are not importable — confirm.
+cp.register_pickle_by_value(inspect.getmodule(get_temp_file_path))
+cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
+# Project name passed as `project=` when building telemetry statement params.
+_PROJECT = "ModelDevelopment"
+# NOTE(review): not referenced in the visible part of this file — confirm where it is used.
+DEFAULT_UDTF_NJOBS = 3
+class DistributedHPOTrainer(SnowparkModelTrainer):
+    """
+    A class for performing distributed hyperparameter optimization (HPO) using Snowpark.
+    This class inherits from SnowparkModelTrainer and extends its functionality
+    to support distributed HPO for machine learning models. It enables optimization
+    of hyperparameters by distributing the tasks across the warehouse using Snowpark.
+    """
+    def __init__(
+        self,
+        estimator: object,
+        dataset: DataFrame,
+        session: Session,
+        input_cols: List[str],
+        label_cols: Optional[List[str]],
+        sample_weight_col: Optional[str],
+        autogenerated: bool = False,
+        subproject: str = "",
+    ) -> None:
+        """
+        Initializes the DistributedHPOTrainer with a model, a Snowpark DataFrame, and feature and label column names.
+        This constructor adds no state of its own: every argument is forwarded unchanged, by keyword, to
+        SnowparkModelTrainer.__init__.
+        Args:
+            estimator: SKLearn compatible estimator or transformer object.
+            dataset: The dataset used for training the model.
+            session: Snowflake session object to be used for training.
+            input_cols: The name(s) of one or more columns in a DataFrame containing a feature to be used for training.
+            label_cols: The name(s) of one or more columns in a DataFrame representing the target variable(s) to learn.
+                May be None.
+            sample_weight_col: The column name representing the weight of training examples. May be None.
+            autogenerated: A boolean denoting if the trainer is being used by autogenerated code or not.
+            subproject: subproject name to be used in telemetry.
+        """
+        # Pure pass-through to the parent trainer; no HPO-specific setup happens here.
+        super().__init__(
+            estimator=estimator,
+            dataset=dataset,
+            session=session,
+            input_cols=input_cols,
+            label_cols=label_cols,
+            sample_weight_col=sample_weight_col,
+            autogenerated=autogenerated,
+            subproject=subproject,
+        )
+    # TODO(snandamuri): Copied this code as it is from the snowpark_handler.
+    #   Update it to improve the readability.
+    def fit_search_snowpark(
+        self,
+        param_grid: Union[model_selection.ParameterGrid, model_selection.ParameterSampler],
+        dataset: DataFrame,
+        session: Session,
+        estimator: Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV],
+        dependencies: List[str],
+        udf_imports: List[str],
+        input_cols: List[str],
+        label_cols: Optional[List[str]],
+        sample_weight_col: Optional[str],
+    ) -> Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV]:
+        from itertools import product
+        import cachetools
+        from sklearn.base import clone, is_classifier
+        from sklearn.calibration import check_cv
+        # Create one stage for data and for estimators.
+        temp_stage_name = random_name_for_temp_object(TempObjectType.STAGE)
+        temp_stage_creation_query = f"CREATE OR REPLACE TEMP STAGE {temp_stage_name};"
+        session.sql(temp_stage_creation_query).collect()
+        # Stage data.
+        dataset = snowpark_dataframe_utils.cast_snowpark_dataframe(dataset)
+        remote_file_path = f"{temp_stage_name}/{temp_stage_name}.parquet"
+        dataset.write.copy_into_location(  # type:ignore[call-overload]
+            remote_file_path, file_format_type="parquet", header=True, overwrite=True
+        )
+        imports = [f"@{row.name}" for row in session.sql(f"LIST @{temp_stage_name}").collect()]
+        # Store GridSearchCV's refit variable. If user set it as False, we don't need to refit it again
+        original_refit = estimator.refit
+        # Create a temp file and dump the estimator to that file.
+        estimator_file_name = get_temp_file_path()
+        params_to_evaluate = []
+        for param_to_eval in list(param_grid):
+            for k, v in param_to_eval.items():
+                param_to_eval[k] = [v]
+            params_to_evaluate.append([param_to_eval])
+        with open(estimator_file_name, mode="w+b") as local_estimator_file_obj:
+            # Set GridSearchCV refit as False and fit it again after retrieving the best param
+            estimator.refit = False
+            cp.dump(dict(estimator=estimator, param_grid=params_to_evaluate), local_estimator_file_obj)
+        stage_estimator_file_name = posixpath.join(temp_stage_name, os.path.basename(estimator_file_name))
+        sproc_statement_params = telemetry.get_function_usage_statement_params(
+            project=_PROJECT,
+            subproject=self._subproject,
+            function_name=telemetry.get_statement_params_full_func_name(
+                inspect.currentframe(), self.__class__.__name__
+            ),
+            api_calls=[sproc],
+            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+        )
+        udtf_statement_params = telemetry.get_function_usage_statement_params(
+            project=_PROJECT,
+            subproject=self._subproject,
+            function_name=telemetry.get_statement_params_full_func_name(
+                inspect.currentframe(), self.__class__.__name__
+            ),
+            api_calls=[udtf],
+            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+        )
+        # Put locally serialized estimator on stage.
+        put_result = session.file.put(
+            estimator_file_name,
+            temp_stage_name,
+            auto_compress=False,
+            overwrite=True,
+        )
+        estimator_location = put_result[0].target
+        imports.append(f"@{temp_stage_name}/{estimator_location}")
+        search_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
+        random_udtf_name = random_name_for_temp_object(TempObjectType.FUNCTION)
+        required_deps = dependencies + [
+            "snowflake-snowpark-python<2",
+            "fastparquet<2023.11",
+            "pyarrow<14",
+            "cachetools<5",
+        ]
+        @sproc(  # type: ignore[misc]
+            is_permanent=False,
+            name=search_sproc_name,
+            packages=required_deps,  # type: ignore[arg-type]
+            replace=True,
+            session=session,
+            anonymous=True,
+            imports=imports,  # type: ignore[arg-type]
+            statement_params=sproc_statement_params,
+        )
+        def _distributed_search(
+            session: Session,
+            imports: List[str],
+            stage_estimator_file_name: str,
+            input_cols: List[str],
+            label_cols: Optional[List[str]],
+        ) -> str:
+            import os
+            import time
+            from typing import Iterator
+            import cloudpickle as cp
+            import pandas as pd
+            import pyarrow.parquet as pq
+            from sklearn.metrics import check_scoring
+            from sklearn.metrics._scorer import _check_multimetric_scoring
+            for import_name in udf_imports:
+                importlib.import_module(import_name)
+            data_files = [
+                filename
+                for filename in os.listdir(sys._xoptions["snowflake_import_directory"])
+                if filename.startswith(temp_stage_name)
+            ]
+            partial_df = [
+                pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas()
+                for file_name in data_files
+            ]
+            df = pd.concat(partial_df, ignore_index=True)
+            df.columns = [identifier.get_inferred_name(col) for col in df.columns]
+            X = df[input_cols]
+            y = df[label_cols].squeeze() if label_cols else None
+            local_estimator_file_name = get_temp_file_path()
+            session.file.get(stage_estimator_file_name, local_estimator_file_name)
+            local_estimator_file_path = os.path.join(
+                local_estimator_file_name, os.listdir(local_estimator_file_name)[0]
+            )
+            with open(local_estimator_file_path, mode="r+b") as local_estimator_file_obj:
+                estimator = cp.load(local_estimator_file_obj)["estimator"]
+            cv_orig = check_cv(estimator.cv, y, classifier=is_classifier(estimator.estimator))
+            indices = [test for _, test in cv_orig.split(X, y)]
+            local_indices_file_name = get_temp_file_path()
+            with open(local_indices_file_name, mode="w+b") as local_indices_file_obj:
+                cp.dump(indices, local_indices_file_obj)
+            # Put locally serialized indices on stage.
+            put_result = session.file.put(
+                local_indices_file_name,
+                temp_stage_name,
+                auto_compress=False,
+                overwrite=True,
+            )
+            indices_location = put_result[0].target
+            imports.append(f"@{temp_stage_name}/{indices_location}")
+            indices_len = len(indices)
+            assert estimator is not None
+            @cachetools.cached(cache={})
+            def _load_data_into_udf() -> Tuple[
+                Dict[str, pd.DataFrame],
+                Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV],
+                pd.DataFrame,
+                int,
+                List[Dict[str, Any]],
+            ]:
+                import pyarrow.parquet as pq
+                data_files = [
+                    filename
+                    for filename in os.listdir(sys._xoptions["snowflake_import_directory"])
+                    if filename.startswith(temp_stage_name)
+                ]
+                partial_df = [
+                    pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas()
+                    for file_name in data_files
+                ]
+                df = pd.concat(partial_df, ignore_index=True)
+                df.columns = [identifier.get_inferred_name(col) for col in df.columns]
+                # load estimator
+                local_estimator_file_path = os.path.join(
+                    sys._xoptions["snowflake_import_directory"], f"{estimator_location}"
+                )
+                with open(local_estimator_file_path, mode="rb") as local_estimator_file_obj:
+                    estimator_objects = cp.load(local_estimator_file_obj)
+                    estimator = estimator_objects["estimator"]
+                    params_to_evaluate = estimator_objects["param_grid"]
+                # load indices
+                local_indices_file_path = os.path.join(
+                    sys._xoptions["snowflake_import_directory"], f"{indices_location}"
+                )
+                with open(local_indices_file_path, mode="rb") as local_indices_file_obj:
+                    indices = cp.load(local_indices_file_obj)
+                argspec = inspect.getfullargspec(estimator.fit)
+                args = {"X": df[input_cols]}
+                if label_cols:
+                    label_arg_name = "Y" if "Y" in argspec.args else "y"
+                    args[label_arg_name] = df[label_cols].squeeze()
+                if sample_weight_col is not None and "sample_weight" in argspec.args:
+                    args["sample_weight"] = df[sample_weight_col].squeeze()
+                return args, estimator, indices, len(df), params_to_evaluate
+            class SearchCV:
+                def __init__(self) -> None:
+                    args, estimator, indices, data_length, params_to_evaluate = _load_data_into_udf()
+                    self.args = args
+                    self.estimator = estimator
+                    self.indices = indices
+                    self.data_length = data_length
+                    self.params_to_evaluate = params_to_evaluate
+                def process(self, params_idx: int, idx: int) -> Iterator[Tuple[str]]:
+                    if hasattr(estimator, "param_grid"):
+                        self.estimator.param_grid = self.params_to_evaluate[params_idx]
+                    else:
+                        self.estimator.param_distributions = self.params_to_evaluate[params_idx]
+                    full_indices = np.array([i for i in range(self.data_length)])
+                    test_indice = self.indices[idx]
+                    train_indice = np.setdiff1d(full_indices, test_indice)
+                    self.estimator.cv = [(train_indice, test_indice)]
+                    self.estimator.fit(**self.args)
+                    binary_cv_results = None
+                    with io.BytesIO() as f:
+                        cp.dump(self.estimator.cv_results_, f)
+                        f.seek(0)
+                        binary_cv_results = f.getvalue().hex()
+                    yield (binary_cv_results,)
+                def end_partition(self) -> None:
+                    ...
+            session.udtf.register(
+                SearchCV,
+                output_schema=StructType([StructField("CV_RESULTS", StringType())]),
+                input_types=[IntegerType(), IntegerType()],
+                name=random_udtf_name,
+                packages=required_deps,  # type: ignore[arg-type]
+                replace=True,
+                is_permanent=False,
+                imports=imports,  # type: ignore[arg-type]
+                statement_params=udtf_statement_params,
+            )
+            HP_TUNING = F.table_function(random_udtf_name)
+            idx_length = int(indices_len)
+            params_length = len(param_grid)
+            idxs = [i for i in range(idx_length)]
+            param_indices, training_indices = [], []
+            for param_idx, cv_idx in product([param_index for param_index in range(params_length)], idxs):
+                param_indices.append(param_idx)
+                training_indices.append(cv_idx)
+            pd_df = pd.DataFrame(
+                {
+                    "PARAMS": param_indices,
+                    "TRAIN_IND": training_indices,
+                    "PARAM_INDEX": [i for i in range(idx_length * params_length)],
+                }
+            )
+            df = session.create_dataframe(pd_df)
+            results = df.select(
+                F.cast(df["PARAM_INDEX"], IntegerType()).as_("PARAM_INDEX"),
+                (HP_TUNING(df["PARAMS"], df["TRAIN_IND"]).over(partition_by=df["PARAM_INDEX"])),
+            )
+            # cv_result maintains the original order
+            multimetric = False
+            cv_results_ = dict()
+            scorers = set()
+            for i, val in enumerate(results.select("CV_RESULTS").sort(col("PARAM_INDEX")).collect()):
+                # retrieved string had one more double quote in the front and end of the string.
+                # use [1:-1] to remove the extra double quotes
+                hex_str = bytes.fromhex(val[0])
+                with io.BytesIO(hex_str) as f_reload:
+                    each_cv_result = cp.load(f_reload)
+                    for k, v in each_cv_result.items():
+                        cur_cv = i % idx_length
+                        key = k
+                        if "split0_test_" in k:
+                            # For multi-metric evaluation, the scores for all the scorers are available in the
+                            # cv_results_ dict at the keys ending with that scorer’s name ('_<scorer_name>')
+                            # instead of '_score'.
+                            scorers.add(k[len("split0_test_") :])
+                            key = k.replace("split0_test", f"split{cur_cv}_test")
+                        elif k.startswith("param"):
+                            if cur_cv != 0:
+                                key = False
+                        if key:
+                            if key not in cv_results_:
+                                cv_results_[key] = v
+                            else:
+                                cv_results_[key] = np.concatenate([cv_results_[key], v])
+            multimetric = len(scorers) > 1
+            # Use numpy to re-calculate all the information in cv_results_ again
+            # Generally speaking, reshape all the results into the (scorers+2, idx_length, params_length) shape,
+            # and average them by the idx_length;
+            # idx_length is the number of cv folds; params_length is the number of parameter combinations
+            scores = [
+                np.reshape(
+                    np.concatenate([cv_results_[f"split{cur_cv}_test_{score}"] for cur_cv in range(idx_length)]),
+                    (idx_length, -1),
+                )
+                for score in scorers
+            ]
+            fit_score_test_matrix = np.stack(
+                [
+                    np.reshape(cv_results_["mean_fit_time"], (idx_length, -1)),
+                    np.reshape(cv_results_["mean_score_time"], (idx_length, -1)),
+                ]
+                + scores
+            )
+            mean_fit_score_test_matrix = np.mean(fit_score_test_matrix, axis=1)
+            std_fit_score_test_matrix = np.std(fit_score_test_matrix, axis=1)
+            cv_results_["std_fit_time"] = std_fit_score_test_matrix[0]
+            cv_results_["mean_fit_time"] = mean_fit_score_test_matrix[0]
+            cv_results_["std_score_time"] = std_fit_score_test_matrix[1]
+            cv_results_["mean_score_time"] = mean_fit_score_test_matrix[1]
+            for idx, score in enumerate(scorers):
+                cv_results_[f"std_test_{score}"] = std_fit_score_test_matrix[idx + 2]
+                cv_results_[f"mean_test_{score}"] = mean_fit_score_test_matrix[idx + 2]
+                # re-compute the ranking again with mean_test_<score>.
+                cv_results_[f"rank_test_{score}"] = rankdata(-cv_results_[f"mean_test_{score}"], method="min")
+                # The best param is the highest ranking (which is 1) and we choose the first time ranking 1 appeared.
+                # If all scores are `nan`, `rankdata` will also produce an array of `nan` values.
+                # In that case, default to first index.
+                best_param_index = (
+                    np.where(cv_results_[f"rank_test_{score}"] == 1)[0][0]
+                    if not np.isnan(cv_results_[f"rank_test_{score}"]).all()
+                    else 0
+                )
+            estimator.cv_results_ = cv_results_
+            estimator.multimetric_ = multimetric
+            # Reconstruct the sklearn estimator.
+            refit_metric = "score"
+            if callable(estimator.scoring):
+                scorers = estimator.scoring
+            elif estimator.scoring is None or isinstance(estimator.scoring, str):
+                scorers = check_scoring(estimator.estimator, estimator.scoring)
+            else:
+                scorers = _check_multimetric_scoring(estimator.estimator, estimator.scoring)
+                estimator._check_refit_for_multimetric(scorers)
+                refit_metric = original_refit
+            estimator.scorer_ = scorers
+            # check refit_metric now for a callable scorer that is multimetric
+            if callable(estimator.scoring) and estimator.multimetric_:
+                refit_metric = original_refit
+            # For multi-metric evaluation, store the best_index_, best_params_ and
+            # best_score_ iff refit is one of the scorer names
+            # In single metric evaluation, refit_metric is "score"
+            if original_refit or not estimator.multimetric_:
+                estimator.best_index_ = estimator._select_best_index(original_refit, refit_metric, cv_results_)
+                if not callable(original_refit):
+                    # With a non-custom callable, we can select the best score
+                    # based on the best index
+                    estimator.best_score_ = cv_results_[f"mean_test_{refit_metric}"][estimator.best_index_]
+                estimator.best_params_ = cv_results_["params"][best_param_index]
+            if original_refit:
+                estimator.best_estimator_ = clone(estimator.estimator).set_params(
+                    **clone(estimator.best_params_, safe=False)
+                )
+                # Let the sproc use all cores to refit.
+                estimator.n_jobs = -1 if not estimator.n_jobs else estimator.n_jobs
+                # process the input as args
+                argspec = inspect.getfullargspec(estimator.fit)
+                args = {"X": X}
+                if label_cols:
+                    label_arg_name = "Y" if "Y" in argspec.args else "y"
+                    args[label_arg_name] = y
+                if sample_weight_col is not None and "sample_weight" in argspec.args:
+                    args["sample_weight"] = df[sample_weight_col].squeeze()
+                estimator.refit = original_refit
+                refit_start_time = time.time()
+                estimator.best_estimator_.fit(**args)
+                refit_end_time = time.time()
+                estimator.refit_time_ = refit_end_time - refit_start_time
+                if hasattr(estimator.best_estimator_, "feature_names_in_"):
+                    estimator.feature_names_in_ = estimator.best_estimator_.feature_names_in_
+            local_result_file_name = get_temp_file_path()
+            with open(local_result_file_name, mode="w+b") as local_result_file_obj:
+                cp.dump(estimator, local_result_file_obj)
+            session.file.put(
+                local_result_file_name,
+                temp_stage_name,
+                auto_compress=False,
+                overwrite=True,
+            )
+            # Note: you can add something like  + "|" + str(df) to the return string
+            # to pass debug information to the caller.
+            return str(os.path.basename(local_result_file_name))
+        sproc_export_file_name = _distributed_search(
+            session,
+            imports,
+            stage_estimator_file_name,
+            input_cols,
+            label_cols,
+        )
+        local_estimator_path = get_temp_file_path()
+        session.file.get(
+            posixpath.join(temp_stage_name, sproc_export_file_name),
+            local_estimator_path,
+        )
+        with open(os.path.join(local_estimator_path, sproc_export_file_name), mode="r+b") as result_file_obj:
+            fit_estimator = cp.load(result_file_obj)
+        cleanup_temp_files([local_estimator_path])
+        return fit_estimator
+    def train(self) -> object:
+        """
+        Fits the hyper parameter search by handing the candidate parameter settings to ``fit_search_snowpark``.
+        Returns:
+            The fitted search estimator returned by ``fit_search_snowpark``.
+        """
+        search_cv = self.estimator
+        # Build the model specification; its pkgDependencies are forwarded to the search below.
+        spec = ModelSpecificationsBuilder.build(model=search_cv)
+        assert isinstance(search_cv, (model_selection.GridSearchCV, model_selection.RandomizedSearchCV))
+        base_estimator = search_cv.estimator
+        # When the wrapped estimator exposes n_jobs and it is None or -1, pin it to the UDTF default.
+        if hasattr(base_estimator, "n_jobs") and base_estimator.n_jobs in (None, -1):
+            base_estimator.n_jobs = DEFAULT_UDTF_NJOBS
+        # Enumerate the candidate parameter settings according to the kind of search.
+        if isinstance(search_cv, model_selection.GridSearchCV):
+            candidates = model_selection.ParameterGrid(search_cv.param_grid)
+        elif isinstance(search_cv, model_selection.RandomizedSearchCV):
+            candidates = model_selection.ParameterSampler(
+                search_cv.param_distributions,
+                n_iter=search_cv.n_iter,
+                random_state=search_cv.random_state,
+            )
+        fitted_search = self.fit_search_snowpark(
+            param_grid=candidates,
+            dataset=self.dataset,
+            session=self.session,
+            estimator=search_cv,
+            dependencies=spec.pkgDependencies,
+            udf_imports=["sklearn"],
+            input_cols=self.input_cols,
+            label_cols=self.label_cols,
+            sample_weight_col=self.sample_weight_col,
+        )
+        return fitted_search

snowflake/ml/modeling/_internal/estimator_protocols.py CHANGED Viewed

@@ -1,35 +1,12 @@
-from typing import List, Optional, Protocol, Union
+from typing import List, Optional, Protocol
 import pandas as pd
-from sklearn import model_selection
 from snowflake.snowpark import DataFrame, Session
 # TODO: Add more specific entities to type hint estimators instead of using `object`.
 class FitPredictHandlers(Protocol):
-    def fit_snowpark(
-        self,
-        dataset: DataFrame,
-        session: Session,
-        estimator: object,
-        dependencies: List[str],
-        input_cols: List[str],
-        label_cols: List[str],
-        sample_weight_col: Optional[str],
-    ) -> object:
-        raise NotImplementedError
-    def fit_pandas(
-        self,
-        dataset: pd.DataFrame,
-        estimator: object,
-        input_cols: List[str],
-        label_cols: Optional[List[str]],
-        sample_weight_col: Optional[str],
-    ) -> object:
-        raise NotImplementedError
     def batch_inference(
         self,
         dataset: DataFrame,
@@ -70,28 +47,6 @@ class FitPredictHandlers(Protocol):
 # TODO: Add more specific entities to type hint estimators instead of using `object`.
 class CVHandlers(Protocol):
-    def fit_snowpark(
-        self,
-        dataset: DataFrame,
-        session: Session,
-        estimator: object,
-        dependencies: List[str],
-        input_cols: List[str],
-        label_cols: List[str],
-        sample_weight_col: Optional[str],
-    ) -> object:
-        raise NotImplementedError
-    def fit_pandas(
-        self,
-        dataset: pd.DataFrame,
-        estimator: object,
-        input_cols: List[str],
-        label_cols: Optional[List[str]],
-        sample_weight_col: Optional[str],
-    ) -> object:
-        raise NotImplementedError
     def batch_inference(
         self,
         dataset: DataFrame,
@@ -128,17 +83,3 @@ class CVHandlers(Protocol):
         sample_weight_col: Optional[str],
     ) -> float:
         raise NotImplementedError
-    def fit_search_snowpark(
-        self,
-        param_grid: Union[model_selection.ParameterGrid, model_selection.ParameterSampler],
-        dataset: DataFrame,
-        session: Session,
-        estimator: Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV],
-        dependencies: List[str],
-        udf_imports: List[str],
-        input_cols: List[str],
-        label_cols: List[str],
-        sample_weight_col: Optional[str],
-    ) -> Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV]:
-        raise NotImplementedError

snowflake-ml-python 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

snowflake-ml-python 1.1.0py3-none-any.whl → 1.1.2py3-none-any.whl