snowflake-ml-python 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (225)
  1. snowflake/cortex/_complete.py +1 -1
  2. snowflake/cortex/_extract_answer.py +1 -1
  3. snowflake/cortex/_sentiment.py +1 -1
  4. snowflake/cortex/_summarize.py +1 -1
  5. snowflake/cortex/_translate.py +1 -1
  6. snowflake/ml/_internal/env_utils.py +68 -6
  7. snowflake/ml/_internal/file_utils.py +34 -4
  8. snowflake/ml/_internal/telemetry.py +79 -91
  9. snowflake/ml/_internal/utils/identifier.py +78 -72
  10. snowflake/ml/_internal/utils/retryable_http.py +16 -4
  11. snowflake/ml/_internal/utils/spcs_attribution_utils.py +122 -0
  12. snowflake/ml/dataset/dataset.py +1 -1
  13. snowflake/ml/model/_api.py +21 -14
  14. snowflake/ml/model/_client/model/model_impl.py +176 -0
  15. snowflake/ml/model/_client/model/model_method_info.py +19 -0
  16. snowflake/ml/model/_client/model/model_version_impl.py +291 -0
  17. snowflake/ml/model/_client/ops/metadata_ops.py +107 -0
  18. snowflake/ml/model/_client/ops/model_ops.py +308 -0
  19. snowflake/ml/model/_client/sql/model.py +75 -0
  20. snowflake/ml/model/_client/sql/model_version.py +213 -0
  21. snowflake/ml/model/_client/sql/stage.py +40 -0
  22. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -4
  23. snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template +24 -8
  24. snowflake/ml/model/_deploy_client/image_builds/templates/kaniko_shell_script_template +23 -0
  25. snowflake/ml/model/_deploy_client/snowservice/deploy.py +14 -2
  26. snowflake/ml/model/_deploy_client/utils/constants.py +1 -0
  27. snowflake/ml/model/_deploy_client/warehouse/deploy.py +2 -2
  28. snowflake/ml/model/_model_composer/model_composer.py +31 -9
  29. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +25 -10
  30. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +2 -2
  31. snowflake/ml/model/_model_composer/model_method/infer_function.py_template +2 -1
  32. snowflake/ml/model/_model_composer/model_method/model_method.py +34 -3
  33. snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +1 -1
  34. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +3 -1
  35. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +10 -28
  36. snowflake/ml/model/_packager/model_meta/model_meta.py +18 -16
  37. snowflake/ml/model/_signatures/snowpark_handler.py +1 -1
  38. snowflake/ml/model/model_signature.py +108 -53
  39. snowflake/ml/model/type_hints.py +1 -0
  40. snowflake/ml/modeling/_internal/distributed_hpo_trainer.py +554 -0
  41. snowflake/ml/modeling/_internal/estimator_protocols.py +1 -60
  42. snowflake/ml/modeling/_internal/model_specifications.py +146 -0
  43. snowflake/ml/modeling/_internal/model_trainer.py +13 -0
  44. snowflake/ml/modeling/_internal/model_trainer_builder.py +78 -0
  45. snowflake/ml/modeling/_internal/pandas_trainer.py +54 -0
  46. snowflake/ml/modeling/_internal/snowpark_handlers.py +6 -760
  47. snowflake/ml/modeling/_internal/snowpark_trainer.py +331 -0
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +108 -135
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +106 -135
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +106 -135
  51. snowflake/ml/modeling/cluster/birch.py +106 -135
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +106 -135
  53. snowflake/ml/modeling/cluster/dbscan.py +106 -135
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +106 -135
  55. snowflake/ml/modeling/cluster/k_means.py +105 -135
  56. snowflake/ml/modeling/cluster/mean_shift.py +106 -135
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +105 -135
  58. snowflake/ml/modeling/cluster/optics.py +106 -135
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +106 -135
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +106 -135
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +106 -135
  62. snowflake/ml/modeling/compose/column_transformer.py +106 -135
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +108 -135
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +106 -135
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +99 -128
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +106 -135
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +106 -135
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +104 -133
  69. snowflake/ml/modeling/covariance/min_cov_det.py +106 -135
  70. snowflake/ml/modeling/covariance/oas.py +99 -128
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +103 -132
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +106 -135
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +106 -135
  74. snowflake/ml/modeling/decomposition/fast_ica.py +106 -135
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +106 -135
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +106 -135
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +106 -135
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +106 -135
  79. snowflake/ml/modeling/decomposition/pca.py +106 -135
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +106 -135
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +106 -135
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +108 -135
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +108 -135
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +108 -135
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +108 -135
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +108 -135
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +108 -135
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +108 -135
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +108 -135
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +108 -135
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +108 -135
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +108 -135
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +108 -135
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +106 -135
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +108 -135
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +108 -135
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +108 -135
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +108 -135
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +108 -135
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +101 -128
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +99 -126
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +99 -126
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +99 -126
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +100 -127
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +99 -126
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +106 -135
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +95 -124
  108. snowflake/ml/modeling/framework/base.py +83 -1
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +108 -135
  110. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +108 -135
  111. snowflake/ml/modeling/impute/iterative_imputer.py +106 -135
  112. snowflake/ml/modeling/impute/knn_imputer.py +106 -135
  113. snowflake/ml/modeling/impute/missing_indicator.py +106 -135
  114. snowflake/ml/modeling/impute/simple_imputer.py +9 -1
  115. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +96 -125
  116. snowflake/ml/modeling/kernel_approximation/nystroem.py +106 -135
  117. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +106 -135
  118. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +105 -134
  119. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +103 -132
  120. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +108 -135
  121. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +90 -118
  122. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +90 -118
  123. snowflake/ml/modeling/linear_model/ard_regression.py +108 -135
  124. snowflake/ml/modeling/linear_model/bayesian_ridge.py +108 -135
  125. snowflake/ml/modeling/linear_model/elastic_net.py +108 -135
  126. snowflake/ml/modeling/linear_model/elastic_net_cv.py +108 -135
  127. snowflake/ml/modeling/linear_model/gamma_regressor.py +108 -135
  128. snowflake/ml/modeling/linear_model/huber_regressor.py +108 -135
  129. snowflake/ml/modeling/linear_model/lars.py +108 -135
  130. snowflake/ml/modeling/linear_model/lars_cv.py +108 -135
  131. snowflake/ml/modeling/linear_model/lasso.py +108 -135
  132. snowflake/ml/modeling/linear_model/lasso_cv.py +108 -135
  133. snowflake/ml/modeling/linear_model/lasso_lars.py +108 -135
  134. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +108 -135
  135. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +108 -135
  136. snowflake/ml/modeling/linear_model/linear_regression.py +108 -135
  137. snowflake/ml/modeling/linear_model/logistic_regression.py +108 -135
  138. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +108 -135
  139. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +108 -135
  140. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +108 -135
  141. snowflake/ml/modeling/linear_model/multi_task_lasso.py +108 -135
  142. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +108 -135
  143. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +108 -135
  144. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +108 -135
  145. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +107 -135
  146. snowflake/ml/modeling/linear_model/perceptron.py +107 -135
  147. snowflake/ml/modeling/linear_model/poisson_regressor.py +108 -135
  148. snowflake/ml/modeling/linear_model/ransac_regressor.py +108 -135
  149. snowflake/ml/modeling/linear_model/ridge.py +108 -135
  150. snowflake/ml/modeling/linear_model/ridge_classifier.py +108 -135
  151. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +108 -135
  152. snowflake/ml/modeling/linear_model/ridge_cv.py +108 -135
  153. snowflake/ml/modeling/linear_model/sgd_classifier.py +108 -135
  154. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +106 -135
  155. snowflake/ml/modeling/linear_model/sgd_regressor.py +108 -135
  156. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +108 -135
  157. snowflake/ml/modeling/linear_model/tweedie_regressor.py +108 -135
  158. snowflake/ml/modeling/manifold/isomap.py +106 -135
  159. snowflake/ml/modeling/manifold/mds.py +106 -135
  160. snowflake/ml/modeling/manifold/spectral_embedding.py +106 -135
  161. snowflake/ml/modeling/manifold/tsne.py +106 -135
  162. snowflake/ml/modeling/metrics/classification.py +196 -55
  163. snowflake/ml/modeling/metrics/correlation.py +4 -2
  164. snowflake/ml/modeling/metrics/covariance.py +7 -4
  165. snowflake/ml/modeling/metrics/ranking.py +32 -16
  166. snowflake/ml/modeling/metrics/regression.py +60 -32
  167. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +106 -135
  168. snowflake/ml/modeling/mixture/gaussian_mixture.py +106 -135
  169. snowflake/ml/modeling/model_selection/grid_search_cv.py +91 -148
  170. snowflake/ml/modeling/model_selection/randomized_search_cv.py +93 -154
  171. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +105 -132
  172. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +108 -135
  173. snowflake/ml/modeling/multiclass/output_code_classifier.py +108 -135
  174. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +108 -135
  175. snowflake/ml/modeling/naive_bayes/categorical_nb.py +108 -135
  176. snowflake/ml/modeling/naive_bayes/complement_nb.py +108 -135
  177. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +98 -125
  178. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +107 -134
  179. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +108 -135
  180. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +108 -135
  181. snowflake/ml/modeling/neighbors/kernel_density.py +106 -135
  182. snowflake/ml/modeling/neighbors/local_outlier_factor.py +106 -135
  183. snowflake/ml/modeling/neighbors/nearest_centroid.py +108 -135
  184. snowflake/ml/modeling/neighbors/nearest_neighbors.py +106 -135
  185. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +108 -135
  186. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +108 -135
  187. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +108 -135
  188. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +106 -135
  189. snowflake/ml/modeling/neural_network/mlp_classifier.py +108 -135
  190. snowflake/ml/modeling/neural_network/mlp_regressor.py +108 -135
  191. snowflake/ml/modeling/parameters/disable_distributed_hpo.py +2 -6
  192. snowflake/ml/modeling/preprocessing/binarizer.py +25 -8
  193. snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +9 -4
  194. snowflake/ml/modeling/preprocessing/label_encoder.py +31 -11
  195. snowflake/ml/modeling/preprocessing/max_abs_scaler.py +27 -9
  196. snowflake/ml/modeling/preprocessing/min_max_scaler.py +42 -14
  197. snowflake/ml/modeling/preprocessing/normalizer.py +9 -4
  198. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +26 -10
  199. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +37 -13
  200. snowflake/ml/modeling/preprocessing/polynomial_features.py +106 -135
  201. snowflake/ml/modeling/preprocessing/robust_scaler.py +39 -13
  202. snowflake/ml/modeling/preprocessing/standard_scaler.py +36 -12
  203. snowflake/ml/modeling/semi_supervised/label_propagation.py +108 -135
  204. snowflake/ml/modeling/semi_supervised/label_spreading.py +108 -135
  205. snowflake/ml/modeling/svm/linear_svc.py +108 -135
  206. snowflake/ml/modeling/svm/linear_svr.py +108 -135
  207. snowflake/ml/modeling/svm/nu_svc.py +108 -135
  208. snowflake/ml/modeling/svm/nu_svr.py +108 -135
  209. snowflake/ml/modeling/svm/svc.py +108 -135
  210. snowflake/ml/modeling/svm/svr.py +108 -135
  211. snowflake/ml/modeling/tree/decision_tree_classifier.py +108 -135
  212. snowflake/ml/modeling/tree/decision_tree_regressor.py +108 -135
  213. snowflake/ml/modeling/tree/extra_tree_classifier.py +108 -135
  214. snowflake/ml/modeling/tree/extra_tree_regressor.py +108 -135
  215. snowflake/ml/modeling/xgboost/xgb_classifier.py +108 -136
  216. snowflake/ml/modeling/xgboost/xgb_regressor.py +108 -136
  217. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +108 -136
  218. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +108 -136
  219. snowflake/ml/registry/model_registry.py +2 -0
  220. snowflake/ml/registry/registry.py +215 -0
  221. snowflake/ml/version.py +1 -1
  222. {snowflake_ml_python-1.1.0.dist-info → snowflake_ml_python-1.1.2.dist-info}/METADATA +34 -1
  223. snowflake_ml_python-1.1.2.dist-info/RECORD +347 -0
  224. snowflake_ml_python-1.1.0.dist-info/RECORD +0 -331
  225. {snowflake_ml_python-1.1.0.dist-info → snowflake_ml_python-1.1.2.dist-info}/WHEEL +0 -0
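The single source diff shown below is item 46, snowflake/ml/modeling/_internal/snowpark_handlers.py (+6 −760). The WrapperProvider hierarchy and the fit_pandas/fit_snowpark/fit_search_snowpark methods are deleted here; their responsibilities move into the new model_trainer.py, model_trainer_builder.py, pandas_trainer.py, snowpark_trainer.py, and distributed_hpo_trainer.py modules listed above. As rough orientation, the sketch below is a hypothetical illustration of that trainer split, assuming a builder that dispatches on the dataset type; it is not the package's actual API.

```python
# Hypothetical sketch of the new trainer split (names and signatures are
# assumptions for illustration, not the real snowflake.ml.modeling._internal API).
from typing import List, Optional, Protocol

import pandas as pd


class ModelTrainer(Protocol):
    """Anything with a train() method that fits and returns the estimator."""

    def train(self) -> object:
        ...


class PandasModelTrainer:
    """In-memory fit, mirroring the removed SnowparkHandlers.fit_pandas()."""

    def __init__(
        self,
        estimator: object,
        dataset: pd.DataFrame,
        input_cols: List[str],
        label_cols: Optional[List[str]] = None,
    ) -> None:
        self.estimator = estimator
        self.dataset = dataset
        self.input_cols = input_cols
        self.label_cols = label_cols

    def train(self) -> object:
        # Same argument handling as the removed fit_pandas(): X always,
        # y only when label columns are configured.
        args = {"X": self.dataset[self.input_cols]}
        if self.label_cols:
            args["y"] = self.dataset[self.label_cols].squeeze()
        return self.estimator.fit(**args)  # type: ignore[attr-defined]


def build_trainer(estimator: object, dataset: object, **kwargs) -> ModelTrainer:
    """Dispatch on dataset type, as a model trainer builder presumably does."""
    if isinstance(dataset, pd.DataFrame):
        return PandasModelTrainer(estimator, dataset, **kwargs)
    # A Snowpark DataFrame would dispatch to a sproc-backed trainer instead.
    raise NotImplementedError("Snowpark input is out of scope for this sketch.")
```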
@@ -1,51 +1,29 @@
 import importlib
 import inspect
-import io
 import os
 import posixpath
-import sys
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional
 from uuid import uuid4
 
 import cloudpickle as cp
-import numpy as np
 import pandas as pd
-import sklearn
-from scipy.stats import rankdata
-from sklearn import model_selection
 
 from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
-from snowflake.ml._internal.exceptions import (
-    error_codes,
-    exceptions,
-    modeling_error_messages,
-)
+from snowflake.ml._internal.exceptions import error_codes, exceptions
 from snowflake.ml._internal.utils import identifier, snowpark_dataframe_utils
 from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator
 from snowflake.ml._internal.utils.temp_file_utils import (
     cleanup_temp_files,
     get_temp_file_path,
 )
-from snowflake.snowpark import (
-    DataFrame,
-    Session,
-    exceptions as snowpark_exceptions,
-    functions as F,
-)
+from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.utils import (
     TempObjectType,
     random_name_for_temp_object,
 )
-from snowflake.snowpark.functions import col, pandas_udf, sproc, udtf
-from snowflake.snowpark.stored_procedure import StoredProcedure
-from snowflake.snowpark.types import (
-    IntegerType,
-    PandasSeries,
-    StringType,
-    StructField,
-    StructType,
-)
+from snowflake.snowpark.functions import pandas_udf, sproc
+from snowflake.snowpark.types import PandasSeries
 
 cp.register_pickle_by_value(inspect.getmodule(get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
@@ -53,144 +31,6 @@ cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
 _PROJECT = "ModelDevelopment"
 
 
-class WrapperProvider:
-    def __init__(self) -> None:
-        self.imports: List[str] = []
-        self.dependencies: List[str] = []
-
-    def get_fit_wrapper_function(
-        self,
-    ) -> Callable[[Any, List[str], str, str, List[str], List[str], Optional[str], Dict[str, str]], str]:
-        imports = self.imports  # In order for the sproc to not resolve this reference in snowflake.ml
-
-        def fit_wrapper_function(
-            session: Session,
-            sql_queries: List[str],
-            stage_transform_file_name: str,
-            stage_result_file_name: str,
-            input_cols: List[str],
-            label_cols: List[str],
-            sample_weight_col: Optional[str],
-            statement_params: Dict[str, str],
-        ) -> str:
-            import inspect
-            import os
-
-            import cloudpickle as cp
-            import pandas as pd
-
-            for import_name in imports:
-                importlib.import_module(import_name)
-
-            # Execute snowpark queries and obtain the results as pandas dataframe
-            # NB: this implies that the result data must fit into memory.
-            for query in sql_queries[:-1]:
-                _ = session.sql(query).collect(statement_params=statement_params)
-            sp_df = session.sql(sql_queries[-1])
-            df: pd.DataFrame = sp_df.to_pandas(statement_params=statement_params)
-            df.columns = sp_df.columns
-
-            local_transform_file_name = get_temp_file_path()
-
-            session.file.get(stage_transform_file_name, local_transform_file_name, statement_params=statement_params)
-
-            local_transform_file_path = os.path.join(
-                local_transform_file_name, os.listdir(local_transform_file_name)[0]
-            )
-            with open(local_transform_file_path, mode="r+b") as local_transform_file_obj:
-                estimator = cp.load(local_transform_file_obj)
-
-            argspec = inspect.getfullargspec(estimator.fit)
-            args = {"X": df[input_cols]}
-            if label_cols:
-                label_arg_name = "Y" if "Y" in argspec.args else "y"
-                args[label_arg_name] = df[label_cols].squeeze()
-
-            if sample_weight_col is not None and "sample_weight" in argspec.args:
-                args["sample_weight"] = df[sample_weight_col].squeeze()
-
-            estimator.fit(**args)
-
-            local_result_file_name = get_temp_file_path()
-
-            with open(local_result_file_name, mode="w+b") as local_result_file_obj:
-                cp.dump(estimator, local_result_file_obj)
-
-            session.file.put(
-                local_result_file_name,
-                stage_result_file_name,
-                auto_compress=False,
-                overwrite=True,
-                statement_params=statement_params,
-            )
-
-            # Note: you can add something like + "|" + str(df) to the return string
-            # to pass debug information to the caller.
-            return str(os.path.basename(local_result_file_name))
-
-        return fit_wrapper_function
-
-
-class SklearnWrapperProvider(WrapperProvider):
-    def __init__(self) -> None:
-        import sklearn
-
-        self.imports: List[str] = ["sklearn"]
-
-        # TODO(snandamuri): Replace cloudpickle with joblib after latest version of joblib is added to snowflake conda.
-        self.dependencies: List[str] = [
-            f"numpy=={np.__version__}",
-            f"scikit-learn=={sklearn.__version__}",
-            f"cloudpickle=={cp.__version__}",
-        ]
-
-
-class XGBoostWrapperProvider(WrapperProvider):
-    def __init__(self) -> None:
-        import xgboost
-
-        self.imports: List[str] = ["xgboost"]
-        self.dependencies = [
-            f"numpy=={np.__version__}",
-            f"xgboost=={xgboost.__version__}",
-            f"cloudpickle=={cp.__version__}",
-        ]
-
-
-class LightGBMWrapperProvider(WrapperProvider):
-    def __init__(self) -> None:
-        import lightgbm
-
-        self.imports: List[str] = ["lightgbm"]
-        self.dependencies = [
-            f"numpy=={np.__version__}",
-            f"lightgbm=={lightgbm.__version__}",
-            f"cloudpickle=={cp.__version__}",
-        ]
-
-
-class SklearnModelSelectionWrapperProvider(WrapperProvider):
-    def __init__(self) -> None:
-        import xgboost
-
-        self.imports: List[str] = ["sklearn", "xgboost"]
-        self.dependencies = [
-            f"numpy=={np.__version__}",
-            f"scikit-learn=={sklearn.__version__}",
-            f"cloudpickle=={cp.__version__}",
-            f"xgboost=={xgboost.__version__}",
-        ]
-
-        # Only include lightgbm in the dependencies if it is installed.
-        try:
-            import lightgbm
-        except ModuleNotFoundError:
-            pass
-        else:
-            self.imports.append("lightgbm")
-            self.dependencies.append(f"lightgbm=={lightgbm.__version__}")
-
-
 def _get_rand_id() -> str:
     """
     Generate random id to be used in sproc and stage names.
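Each removed *WrapperProvider subclass above pins the stored procedure's package list to the versions installed on the client (f"numpy=={np.__version__}" and so on), so the warehouse-side environment matches the client that pickled the estimator, with lightgbm pinned only when it is importable. A standalone illustration of that pinning pattern:

```python
# Build a version-pinned dependency list for a server-side fit, matching the
# removed SklearnModelSelectionWrapperProvider. Standalone illustration only.
import cloudpickle as cp
import numpy as np
import sklearn

dependencies = [
    f"numpy=={np.__version__}",
    f"scikit-learn=={sklearn.__version__}",
    f"cloudpickle=={cp.__version__}",
]

# Only pin lightgbm if it is installed locally, exactly as the removed code does.
try:
    import lightgbm
except ModuleNotFoundError:
    pass
else:
    dependencies.append(f"lightgbm=={lightgbm.__version__}")

print(dependencies)
```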
@@ -202,171 +42,11 @@ def _get_rand_id() -> str:
 
 
 class SnowparkHandlers:
-    def __init__(
-        self, class_name: str, subproject: str, wrapper_provider: WrapperProvider, autogenerated: Optional[bool] = False
-    ) -> None:
+    def __init__(self, class_name: str, subproject: str, autogenerated: Optional[bool] = False) -> None:
         self._class_name = class_name
         self._subproject = subproject
-        self._wrapper_provider = wrapper_provider
         self._autogenerated = autogenerated
 
-    def _get_fit_wrapper_sproc(
-        self, dependencies: List[str], session: Session, statement_params: Dict[str, str]
-    ) -> StoredProcedure:
-        # If the sproc already exists, don't register.
-        if not hasattr(session, "_FIT_WRAPPER_SPROCS"):
-            session._FIT_WRAPPER_SPROCS: Dict[str, StoredProcedure] = {}  # type: ignore[attr-defined, misc]
-
-        fit_sproc_key = self._wrapper_provider.__class__.__name__
-        if fit_sproc_key in session._FIT_WRAPPER_SPROCS:  # type: ignore[attr-defined]
-            fit_sproc: StoredProcedure = session._FIT_WRAPPER_SPROCS[fit_sproc_key]  # type: ignore[attr-defined]
-            return fit_sproc
-
-        fit_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
-
-        fit_wrapper_sproc = session.sproc.register(
-            func=self._wrapper_provider.get_fit_wrapper_function(),
-            is_permanent=False,
-            name=fit_sproc_name,
-            packages=dependencies,  # type: ignore[arg-type]
-            replace=True,
-            session=session,
-            statement_params=statement_params,
-        )
-
-        session._FIT_WRAPPER_SPROCS[fit_sproc_key] = fit_wrapper_sproc  # type: ignore[attr-defined]
-
-        return fit_wrapper_sproc
-
-    def fit_pandas(
-        self,
-        dataset: pd.DataFrame,
-        estimator: object,
-        input_cols: List[str],
-        label_cols: Optional[List[str]],
-        sample_weight_col: Optional[str],
-    ) -> object:
-        assert hasattr(estimator, "fit")  # Keep mypy happy
-        argspec = inspect.getfullargspec(estimator.fit)
-        args = {"X": dataset[input_cols]}
-
-        if label_cols:
-            label_arg_name = "Y" if "Y" in argspec.args else "y"
-            args[label_arg_name] = dataset[label_cols].squeeze()
-
-        if sample_weight_col is not None and "sample_weight" in argspec.args:
-            args["sample_weight"] = dataset[sample_weight_col].squeeze()
-
-        return estimator.fit(**args)
-
-    def fit_snowpark(
-        self,
-        dataset: DataFrame,
-        session: Session,
-        estimator: object,
-        dependencies: List[str],
-        input_cols: List[str],
-        label_cols: List[str],
-        sample_weight_col: Optional[str],
-    ) -> Any:
-        dataset = snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
-
-        # If we are already in a stored procedure, no need to kick off another one.
-        if SNOWML_SPROC_ENV in os.environ:
-            statement_params = telemetry.get_function_usage_statement_params(
-                project=_PROJECT,
-                subproject=self._subproject,
-                function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), self._class_name),
-                api_calls=[Session.call],
-                custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
-            )
-            pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
-            pd_df.columns = dataset.columns
-            return self.fit_pandas(pd_df, estimator, input_cols, label_cols, sample_weight_col)
-
-        # Extract query that generated the dataframe. We will need to pass it to the fit procedure.
-        queries = dataset.queries["queries"]
-
-        # Create a temp file and dump the transform to that file.
-        local_transform_file_name = get_temp_file_path()
-        with open(local_transform_file_name, mode="w+b") as local_transform_file:
-            cp.dump(estimator, local_transform_file)
-
-        # Create temp stage to run fit.
-        transform_stage_name = random_name_for_temp_object(TempObjectType.STAGE)
-        stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
-        SqlResultValidator(session=session, query=stage_creation_query).has_dimensions(
-            expected_rows=1, expected_cols=1
-        ).validate()
-
-        # Use posixpath to construct stage paths
-        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
-        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
-        local_result_file_name = get_temp_file_path()
-
-        statement_params = telemetry.get_function_usage_statement_params(
-            project=_PROJECT,
-            subproject=self._subproject,
-            function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), self._class_name),
-            api_calls=[sproc],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
-        )
-        # Put locally serialized transform on stage.
-        session.file.put(
-            local_transform_file_name,
-            stage_transform_file_name,
-            auto_compress=False,
-            overwrite=True,
-            statement_params=statement_params,
-        )
-
-        # Call fit sproc
-        statement_params = telemetry.get_function_usage_statement_params(
-            project=_PROJECT,
-            subproject=self._subproject,
-            function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), self._class_name),
-            api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
-        )
-
-        fit_wrapper_sproc = self._get_fit_wrapper_sproc(dependencies, session, statement_params)
-
-        try:
-            sproc_export_file_name: str = fit_wrapper_sproc(
-                session,
-                queries,
-                stage_transform_file_name,
-                stage_result_file_name,
-                input_cols,
-                label_cols,
-                sample_weight_col,
-                statement_params,
-            )
-        except snowpark_exceptions.SnowparkClientException as e:
-            if "fit() missing 1 required positional argument: 'y'" in str(e):
-                raise exceptions.SnowflakeMLException(
-                    error_code=error_codes.NOT_FOUND,
-                    original_exception=RuntimeError(modeling_error_messages.ATTRIBUTE_NOT_SET.format("label_cols")),
-                ) from e
-            raise e
-
-        if "|" in sproc_export_file_name:
-            fields = sproc_export_file_name.strip().split("|")
-            sproc_export_file_name = fields[0]
-
-        session.file.get(
-            posixpath.join(stage_result_file_name, sproc_export_file_name),
-            local_result_file_name,
-            statement_params=statement_params,
-        )
-
-        with open(os.path.join(local_result_file_name, sproc_export_file_name), mode="r+b") as result_file_obj:
-            fit_estimator = cp.load(result_file_obj)
-
-        cleanup_temp_files([local_transform_file_name, local_result_file_name])
-
-        return fit_estimator
-
     def batch_inference(
         self,
         dataset: DataFrame,
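The fit_snowpark() method removed above implements a pickle/PUT/sproc/GET round trip: the unfitted estimator is cloudpickled to a temp stage, a registered wrapper sproc re-runs the dataset's queries, fits server-side, and writes the fitted pickle back, and the client downloads and unpickles the result. A condensed sketch of that flow, reusing the helper names from the removed code (illustration only, not the packaged implementation):

```python
# Condensed sketch of the removed fit-in-a-sproc round trip. `session` is
# assumed to be a snowflake.snowpark.Session and `fit_wrapper_sproc` a
# registered stored procedure like the removed fit_wrapper_function.
import os
import posixpath
import tempfile

import cloudpickle as cp


def fit_via_sproc(session, estimator, sql_queries, stage_name, fit_wrapper_sproc, statement_params):
    # 1. Pickle the unfitted estimator locally and PUT it on the temp stage.
    local_file = tempfile.mktemp()
    with open(local_file, mode="w+b") as f:
        cp.dump(estimator, f)
    stage_file = posixpath.join(stage_name, os.path.basename(local_file))
    session.file.put(local_file, stage_file, auto_compress=False, overwrite=True)

    # 2. The sproc replays the dataset queries, pulls the result as pandas,
    #    calls estimator.fit(), and PUTs the fitted pickle back on the stage,
    #    returning the result file's basename.
    result_name = fit_wrapper_sproc(session, sql_queries, stage_file, stage_file, statement_params)

    # 3. GET the fitted pickle and load it back into the client process.
    local_dir = tempfile.mkdtemp()
    session.file.get(posixpath.join(stage_file, result_name), local_dir)
    with open(os.path.join(local_dir, result_name), mode="r+b") as f:
        return cp.load(f)
```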
@@ -690,437 +370,3 @@ class SnowparkHandlers:
         cleanup_temp_files([local_score_file_name])
 
         return score
-
-    def fit_search_snowpark(
-        self,
-        param_grid: Union[model_selection.ParameterGrid, model_selection.ParameterSampler],
-        dataset: DataFrame,
-        session: Session,
-        estimator: Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV],
-        dependencies: List[str],
-        udf_imports: List[str],
-        input_cols: List[str],
-        label_cols: List[str],
-        sample_weight_col: Optional[str],
-    ) -> Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV]:
-        from itertools import product
-
-        import cachetools
-        from sklearn.base import clone, is_classifier
-        from sklearn.calibration import check_cv
-
-        # Create one stage for data and for estimators.
-        temp_stage_name = random_name_for_temp_object(TempObjectType.STAGE)
-        temp_stage_creation_query = f"CREATE OR REPLACE TEMP STAGE {temp_stage_name};"
-        session.sql(temp_stage_creation_query).collect()
-
-        # Stage data.
-        dataset = snowpark_dataframe_utils.cast_snowpark_dataframe(dataset)
-        remote_file_path = f"{temp_stage_name}/{temp_stage_name}.parquet"
-        dataset.write.copy_into_location(  # type:ignore[call-overload]
-            remote_file_path, file_format_type="parquet", header=True, overwrite=True
-        )
-        imports = [f"@{row.name}" for row in session.sql(f"LIST @{temp_stage_name}").collect()]
-
-        # Store GridSearchCV's refit variable. If user set it as False, we don't need to refit it again
-        original_refit = estimator.refit
-
-        # Create a temp file and dump the estimator to that file.
-        estimator_file_name = get_temp_file_path()
-        params_to_evaluate = []
-        for param_to_eval in list(param_grid):
-            for k, v in param_to_eval.items():
-                param_to_eval[k] = [v]
-            params_to_evaluate.append([param_to_eval])
-
-        with open(estimator_file_name, mode="w+b") as local_estimator_file_obj:
-            # Set GridSearchCV refit as False and fit it again after retrieving the best param
-            estimator.refit = False
-            cp.dump(dict(estimator=estimator, param_grid=params_to_evaluate), local_estimator_file_obj)
-        stage_estimator_file_name = posixpath.join(temp_stage_name, os.path.basename(estimator_file_name))
-        sproc_statement_params = telemetry.get_function_usage_statement_params(
-            project=_PROJECT,
-            subproject=self._subproject,
-            function_name=telemetry.get_statement_params_full_func_name(
-                inspect.currentframe(), self.__class__.__name__
-            ),
-            api_calls=[sproc],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
-        )
-        udtf_statement_params = telemetry.get_function_usage_statement_params(
-            project=_PROJECT,
-            subproject=self._subproject,
-            function_name=telemetry.get_statement_params_full_func_name(
-                inspect.currentframe(), self.__class__.__name__
-            ),
-            api_calls=[udtf],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
-        )
-
-        # Put locally serialized estimator on stage.
-        put_result = session.file.put(
-            estimator_file_name,
-            temp_stage_name,
-            auto_compress=False,
-            overwrite=True,
-        )
-        estimator_location = put_result[0].target
-        imports.append(f"@{temp_stage_name}/{estimator_location}")
-
-        search_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
-        random_udtf_name = random_name_for_temp_object(TempObjectType.FUNCTION)
-
-        required_deps = dependencies + [
-            "snowflake-snowpark-python<2",
-            "fastparquet<2023.11",
-            "pyarrow<14",
-            "cachetools<5",
-        ]
-
-        @sproc(  # type: ignore[misc]
-            is_permanent=False,
-            name=search_sproc_name,
-            packages=required_deps,  # type: ignore[arg-type]
-            replace=True,
-            session=session,
-            anonymous=True,
-            imports=imports,  # type: ignore[arg-type]
-            statement_params=sproc_statement_params,
-        )
-        def _distributed_search(
-            session: Session,
-            imports: List[str],
-            stage_estimator_file_name: str,
-            input_cols: List[str],
-            label_cols: List[str],
-        ) -> str:
-            import os
-            import time
-            from typing import Iterator
-
-            import cloudpickle as cp
-            import pandas as pd
-            import pyarrow.parquet as pq
-            from sklearn.metrics import check_scoring
-            from sklearn.metrics._scorer import _check_multimetric_scoring
-
-            for import_name in udf_imports:
-                importlib.import_module(import_name)
-
-            data_files = [
-                filename
-                for filename in os.listdir(sys._xoptions["snowflake_import_directory"])
-                if filename.startswith(temp_stage_name)
-            ]
-            partial_df = [
-                pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas()
-                for file_name in data_files
-            ]
-            df = pd.concat(partial_df, ignore_index=True)
-            df.columns = [identifier.get_inferred_name(col) for col in df.columns]
-
-            X = df[input_cols]
-            y = df[label_cols].squeeze()
-
-            local_estimator_file_name = get_temp_file_path()
-            session.file.get(stage_estimator_file_name, local_estimator_file_name)
-
-            local_estimator_file_path = os.path.join(
-                local_estimator_file_name, os.listdir(local_estimator_file_name)[0]
-            )
-            with open(local_estimator_file_path, mode="r+b") as local_estimator_file_obj:
-                estimator = cp.load(local_estimator_file_obj)["estimator"]
-
-            cv_orig = check_cv(estimator.cv, y, classifier=is_classifier(estimator.estimator))
-            indices = [test for _, test in cv_orig.split(X, y)]
-            local_indices_file_name = get_temp_file_path()
-            with open(local_indices_file_name, mode="w+b") as local_indices_file_obj:
-                cp.dump(indices, local_indices_file_obj)
-
-            # Put locally serialized indices on stage.
-            put_result = session.file.put(
-                local_indices_file_name,
-                temp_stage_name,
-                auto_compress=False,
-                overwrite=True,
-            )
-            indices_location = put_result[0].target
-            imports.append(f"@{temp_stage_name}/{indices_location}")
-            indices_len = len(indices)
-
-            assert estimator is not None
-
-            @cachetools.cached(cache={})
-            def _load_data_into_udf() -> Tuple[
-                Dict[str, pd.DataFrame],
-                Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV],
-                pd.DataFrame,
-                int,
-                List[Dict[str, Any]],
-            ]:
-                import pyarrow.parquet as pq
-
-                data_files = [
-                    filename
-                    for filename in os.listdir(sys._xoptions["snowflake_import_directory"])
-                    if filename.startswith(temp_stage_name)
-                ]
-                partial_df = [
-                    pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas()
-                    for file_name in data_files
-                ]
-                df = pd.concat(partial_df, ignore_index=True)
-                df.columns = [identifier.get_inferred_name(col) for col in df.columns]
-
-                # load estimator
-                local_estimator_file_path = os.path.join(
-                    sys._xoptions["snowflake_import_directory"], f"{estimator_location}"
-                )
-                with open(local_estimator_file_path, mode="rb") as local_estimator_file_obj:
-                    estimator_objects = cp.load(local_estimator_file_obj)
-                    estimator = estimator_objects["estimator"]
-                    params_to_evaluate = estimator_objects["param_grid"]
-
-                # load indices
-                local_indices_file_path = os.path.join(
-                    sys._xoptions["snowflake_import_directory"], f"{indices_location}"
-                )
-                with open(local_indices_file_path, mode="rb") as local_indices_file_obj:
-                    indices = cp.load(local_indices_file_obj)
-
-                argspec = inspect.getfullargspec(estimator.fit)
-                args = {"X": df[input_cols]}
-
-                if label_cols:
-                    label_arg_name = "Y" if "Y" in argspec.args else "y"
-                    args[label_arg_name] = df[label_cols].squeeze()
-
-                if sample_weight_col is not None and "sample_weight" in argspec.args:
-                    args["sample_weight"] = df[sample_weight_col].squeeze()
-                return args, estimator, indices, len(df), params_to_evaluate
-
-            class SearchCV:
-                def __init__(self) -> None:
-                    args, estimator, indices, data_length, params_to_evaluate = _load_data_into_udf()
-                    self.args = args
-                    self.estimator = estimator
-                    self.indices = indices
-                    self.data_length = data_length
-                    self.params_to_evaluate = params_to_evaluate
-
-                def process(self, params_idx: int, idx: int) -> Iterator[Tuple[str]]:
-                    if hasattr(estimator, "param_grid"):
-                        self.estimator.param_grid = self.params_to_evaluate[params_idx]
-                    else:
-                        self.estimator.param_distributions = self.params_to_evaluate[params_idx]
-                    full_indices = np.array([i for i in range(self.data_length)])
-                    test_indice = self.indices[idx]
-                    train_indice = np.setdiff1d(full_indices, test_indice)
-                    self.estimator.cv = [(train_indice, test_indice)]
-                    self.estimator.fit(**self.args)
-                    binary_cv_results = None
-                    with io.BytesIO() as f:
-                        cp.dump(self.estimator.cv_results_, f)
-                        f.seek(0)
-                        binary_cv_results = f.getvalue().hex()
-                    yield (binary_cv_results,)
-
-                def end_partition(self) -> None:
-                    ...
-
-            session.udtf.register(
-                SearchCV,
-                output_schema=StructType([StructField("CV_RESULTS", StringType())]),
-                input_types=[IntegerType(), IntegerType()],
-                name=random_udtf_name,
-                packages=required_deps,  # type: ignore[arg-type]
-                replace=True,
-                is_permanent=False,
-                imports=imports,  # type: ignore[arg-type]
-                statement_params=udtf_statement_params,
-            )
-
-            HP_TUNING = F.table_function(random_udtf_name)
-
-            idx_length = int(indices_len)
-            params_length = len(param_grid)
-            idxs = [i for i in range(idx_length)]
-            param_indices, training_indices = [], []
-            for param_idx, cv_idx in product([param_index for param_index in range(params_length)], idxs):
-                param_indices.append(param_idx)
-                training_indices.append(cv_idx)
-
-            pd_df = pd.DataFrame(
-                {
-                    "PARAMS": param_indices,
-                    "TRAIN_IND": training_indices,
-                    "PARAM_INDEX": [i for i in range(idx_length * params_length)],
-                }
-            )
-            df = session.create_dataframe(pd_df)
-            results = df.select(
-                F.cast(df["PARAM_INDEX"], IntegerType()).as_("PARAM_INDEX"),
-                (HP_TUNING(df["PARAMS"], df["TRAIN_IND"]).over(partition_by=df["PARAM_INDEX"])),
-            )
-
-            # cv_result maintains the original order
-            multimetric = False
-            cv_results_ = dict()
-            scorers = set()
-            for i, val in enumerate(results.select("CV_RESULTS").sort(col("PARAM_INDEX")).collect()):
-                # retrieved string had one more double quote in the front and end of the string.
-                # use [1:-1] to remove the extra double quotes
-                hex_str = bytes.fromhex(val[0])
-                with io.BytesIO(hex_str) as f_reload:
-                    each_cv_result = cp.load(f_reload)
-                    for k, v in each_cv_result.items():
-                        cur_cv = i % idx_length
-                        key = k
-                        if "split0_test_" in k:
-                            # For multi-metric evaluation, the scores for all the scorers are available in the
-                            # cv_results_ dict at the keys ending with that scorer’s name ('_<scorer_name>')
-                            # instead of '_score'.
-                            scorers.add(k[len("split0_test_") :])
-                            key = k.replace("split0_test", f"split{cur_cv}_test")
-                        elif k.startswith("param"):
-                            if cur_cv != 0:
-                                key = False
-                        if key:
-                            if key not in cv_results_:
-                                cv_results_[key] = v
-                            else:
-                                cv_results_[key] = np.concatenate([cv_results_[key], v])
-
-            multimetric = len(scorers) > 1
-            # Use numpy to re-calculate all the information in cv_results_ again
-            # Generally speaking, reshape all the results into the (scorers+2, idx_length, params_length) shape,
-            # and average them by the idx_length;
-            # idx_length is the number of cv folds; params_length is the number of parameter combinations
-            scores = [
-                np.reshape(
-                    np.concatenate([cv_results_[f"split{cur_cv}_test_{score}"] for cur_cv in range(idx_length)]),
-                    (idx_length, -1),
-                )
-                for score in scorers
-            ]
-
-            fit_score_test_matrix = np.stack(
-                [
-                    np.reshape(cv_results_["mean_fit_time"], (idx_length, -1)),
-                    np.reshape(cv_results_["mean_score_time"], (idx_length, -1)),
-                ]
-                + scores
-            )
-
-            mean_fit_score_test_matrix = np.mean(fit_score_test_matrix, axis=1)
-            std_fit_score_test_matrix = np.std(fit_score_test_matrix, axis=1)
-            cv_results_["std_fit_time"] = std_fit_score_test_matrix[0]
-            cv_results_["mean_fit_time"] = mean_fit_score_test_matrix[0]
-            cv_results_["std_score_time"] = std_fit_score_test_matrix[1]
-            cv_results_["mean_score_time"] = mean_fit_score_test_matrix[1]
-            for idx, score in enumerate(scorers):
-                cv_results_[f"std_test_{score}"] = std_fit_score_test_matrix[idx + 2]
-                cv_results_[f"mean_test_{score}"] = mean_fit_score_test_matrix[idx + 2]
-                # re-compute the ranking again with mean_test_<score>.
-                cv_results_[f"rank_test_{score}"] = rankdata(-cv_results_[f"mean_test_{score}"], method="min")
-                # The best param is the highest ranking (which is 1) and we choose the first time ranking 1 appeared.
-                # If all scores are `nan`, `rankdata` will also produce an array of `nan` values.
-                # In that case, default to first index.
-                best_param_index = (
-                    np.where(cv_results_[f"rank_test_{score}"] == 1)[0][0]
-                    if not np.isnan(cv_results_[f"rank_test_{score}"]).all()
-                    else 0
-                )
-
-            estimator.cv_results_ = cv_results_
-            estimator.multimetric_ = multimetric
-
-            # Reconstruct the sklearn estimator.
-            refit_metric = "score"
-            if callable(estimator.scoring):
-                scorers = estimator.scoring
-            elif estimator.scoring is None or isinstance(estimator.scoring, str):
-                scorers = check_scoring(estimator.estimator, estimator.scoring)
-            else:
-                scorers = _check_multimetric_scoring(estimator.estimator, estimator.scoring)
-                estimator._check_refit_for_multimetric(scorers)
-                refit_metric = original_refit
-
-            estimator.scorer_ = scorers
-
-            # check refit_metric now for a callabe scorer that is multimetric
-            if callable(estimator.scoring) and estimator.multimetric_:
-                refit_metric = original_refit
-
-            # For multi-metric evaluation, store the best_index_, best_params_ and
-            # best_score_ iff refit is one of the scorer names
-            # In single metric evaluation, refit_metric is "score"
-            if original_refit or not estimator.multimetric_:
-                estimator.best_index_ = estimator._select_best_index(original_refit, refit_metric, cv_results_)
-                if not callable(original_refit):
-                    # With a non-custom callable, we can select the best score
-                    # based on the best index
-                    estimator.best_score_ = cv_results_[f"mean_test_{refit_metric}"][estimator.best_index_]
-                estimator.best_params_ = cv_results_["params"][best_param_index]
-
-            if original_refit:
-                estimator.best_estimator_ = clone(estimator.estimator).set_params(
-                    **clone(estimator.best_params_, safe=False)
-                )
-
-                # Let the sproc use all cores to refit.
-                estimator.n_jobs = -1 if not estimator.n_jobs else estimator.n_jobs
-
-                # process the input as args
-                argspec = inspect.getfullargspec(estimator.fit)
-                args = {"X": X}
-                if label_cols:
-                    label_arg_name = "Y" if "Y" in argspec.args else "y"
-                    args[label_arg_name] = y
-                if sample_weight_col is not None and "sample_weight" in argspec.args:
-                    args["sample_weight"] = df[sample_weight_col].squeeze()
-                estimator.refit = original_refit
-                refit_start_time = time.time()
-                estimator.best_estimator_.fit(**args)
-                refit_end_time = time.time()
-                estimator.refit_time_ = refit_end_time - refit_start_time
-
-                if hasattr(estimator.best_estimator_, "feature_names_in_"):
-                    estimator.feature_names_in_ = estimator.best_estimator_.feature_names_in_
-
-            local_result_file_name = get_temp_file_path()
-
-            with open(local_result_file_name, mode="w+b") as local_result_file_obj:
-                cp.dump(estimator, local_result_file_obj)
-
-            session.file.put(
-                local_result_file_name,
-                temp_stage_name,
-                auto_compress=False,
-                overwrite=True,
-            )
-
-            # Note: you can add something like + "|" + str(df) to the return string
-            # to pass debug information to the caller.
-            return str(os.path.basename(local_result_file_name))
-
-        sproc_export_file_name = _distributed_search(
-            session,
-            imports,
-            stage_estimator_file_name,
-            input_cols,
-            label_cols,
-        )
-
-        local_estimator_path = get_temp_file_path()
-        session.file.get(
-            posixpath.join(temp_stage_name, sproc_export_file_name),
-            local_estimator_path,
-        )
-
-        with open(os.path.join(local_estimator_path, sproc_export_file_name), mode="r+b") as result_file_obj:
-            fit_estimator = cp.load(result_file_obj)
-
-        cleanup_temp_files([local_estimator_path])
-
-        return fit_estimator
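The fit_search_snowpark() method removed in this final hunk distributes the grid or randomized search by flattening the cross product of parameter combinations and cross-validation folds into one row per (candidate, fold) task, then partitioning the SearchCV UDTF by PARAM_INDEX so each partition fits exactly one candidate on one fold; the hex-encoded per-fold cv_results_ are collected in order, concatenated, and re-averaged with numpy before the best candidate is refit inside the sproc. A minimal standalone illustration of that fan-out indexing:

```python
# Fan-out indexing used by the removed fit_search_snowpark(): one row per
# (parameter combination, cv fold), partitioned by PARAM_INDEX. The counts
# below are example values; the structure mirrors the removed code above.
from itertools import product

import pandas as pd

params_length = 4  # number of parameter combinations in the grid
idx_length = 5     # number of cross-validation folds

pairs = list(product(range(params_length), range(idx_length)))
pd_df = pd.DataFrame(
    {
        "PARAMS": [p for p, _ in pairs],     # which candidate to evaluate
        "TRAIN_IND": [f for _, f in pairs],  # which fold to hold out
        "PARAM_INDEX": range(params_length * idx_length),  # one UDTF partition per task
    }
)
print(pd_df)  # 20 rows: every candidate is scored on every fold
```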