PyPI - snowflake-ml-python - Versions diffs - 1.6.1__py3-none-any.whl → 1.6.2__py3-none-any.whl - Mend

snowflake-ml-python 1.6.1-py3-none-any.whl → 1.6.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (212)

snowflake/ml/_internal/telemetry.py +142 -20
snowflake/ml/_internal/utils/identifier.py +48 -11
snowflake/ml/_internal/utils/snowflake_env.py +23 -13
snowflake/ml/_internal/utils/sql_identifier.py +1 -1
snowflake/ml/_internal/utils/table_manager.py +19 -1
snowflake/ml/_internal/utils/uri.py +2 -2
snowflake/ml/data/data_connector.py +33 -7
snowflake/ml/data/torch_utils.py +68 -0
snowflake/ml/dataset/dataset.py +1 -3
snowflake/ml/feature_store/feature_store.py +41 -17
snowflake/ml/feature_store/feature_view.py +2 -2
snowflake/ml/fileset/embedded_stage_fs.py +1 -1
snowflake/ml/fileset/fileset.py +1 -1
snowflake/ml/fileset/sfcfs.py +9 -3
snowflake/ml/model/_client/model/model_version_impl.py +22 -7
snowflake/ml/model/_client/ops/model_ops.py +39 -3
snowflake/ml/model/_client/ops/service_ops.py +198 -7
snowflake/ml/model/_client/service/model_deployment_spec.py +4 -5
snowflake/ml/model/_client/service/model_deployment_spec_schema.py +1 -2
snowflake/ml/model/_client/sql/service.py +85 -18
snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +1 -1
snowflake/ml/model/_deploy_client/snowservice/deploy.py +3 -3
snowflake/ml/model/_model_composer/model_composer.py +2 -0
snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +3 -8
snowflake/ml/model/_packager/model_handlers/_utils.py +46 -14
snowflake/ml/model/_packager/model_handlers/catboost.py +17 -15
snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +23 -15
snowflake/ml/model/_packager/model_handlers/lightgbm.py +15 -57
snowflake/ml/model/_packager/model_handlers/llm.py +4 -2
snowflake/ml/model/_packager/model_handlers/model_objective_utils.py +116 -0
snowflake/ml/model/_packager/model_handlers/sklearn.py +36 -24
snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +119 -6
snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
snowflake/ml/model/_packager/model_handlers/xgboost.py +48 -48
snowflake/ml/model/_packager/model_meta/model_meta.py +10 -7
snowflake/ml/model/_packager/model_meta/model_meta_schema.py +0 -8
snowflake/ml/model/_packager/model_packager.py +2 -0
snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
snowflake/ml/model/_signatures/utils.py +9 -0
snowflake/ml/model/models/llm.py +3 -1
snowflake/ml/model/type_hints.py +9 -1
snowflake/ml/modeling/_internal/constants.py +1 -0
snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +5 -5
snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +9 -6
snowflake/ml/modeling/_internal/model_specifications.py +2 -0
snowflake/ml/modeling/_internal/model_trainer.py +1 -0
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +2 -2
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +113 -160
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +60 -21
snowflake/ml/modeling/cluster/affinity_propagation.py +60 -21
snowflake/ml/modeling/cluster/agglomerative_clustering.py +60 -21
snowflake/ml/modeling/cluster/birch.py +60 -21
snowflake/ml/modeling/cluster/bisecting_k_means.py +60 -21
snowflake/ml/modeling/cluster/dbscan.py +60 -21
snowflake/ml/modeling/cluster/feature_agglomeration.py +60 -21
snowflake/ml/modeling/cluster/k_means.py +60 -21
snowflake/ml/modeling/cluster/mean_shift.py +60 -21
snowflake/ml/modeling/cluster/mini_batch_k_means.py +60 -21
snowflake/ml/modeling/cluster/optics.py +60 -21
snowflake/ml/modeling/cluster/spectral_biclustering.py +60 -21
snowflake/ml/modeling/cluster/spectral_clustering.py +60 -21
snowflake/ml/modeling/cluster/spectral_coclustering.py +60 -21
snowflake/ml/modeling/compose/column_transformer.py +60 -21
snowflake/ml/modeling/compose/transformed_target_regressor.py +60 -21
snowflake/ml/modeling/covariance/elliptic_envelope.py +60 -21
snowflake/ml/modeling/covariance/empirical_covariance.py +60 -21
snowflake/ml/modeling/covariance/graphical_lasso.py +60 -21
snowflake/ml/modeling/covariance/graphical_lasso_cv.py +60 -21
snowflake/ml/modeling/covariance/ledoit_wolf.py +60 -21
snowflake/ml/modeling/covariance/min_cov_det.py +60 -21
snowflake/ml/modeling/covariance/oas.py +60 -21
snowflake/ml/modeling/covariance/shrunk_covariance.py +60 -21
snowflake/ml/modeling/decomposition/dictionary_learning.py +60 -21
snowflake/ml/modeling/decomposition/factor_analysis.py +60 -21
snowflake/ml/modeling/decomposition/fast_ica.py +60 -21
snowflake/ml/modeling/decomposition/incremental_pca.py +60 -21
snowflake/ml/modeling/decomposition/kernel_pca.py +60 -21
snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +60 -21
snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +60 -21
snowflake/ml/modeling/decomposition/pca.py +60 -21
snowflake/ml/modeling/decomposition/sparse_pca.py +60 -21
snowflake/ml/modeling/decomposition/truncated_svd.py +60 -21
snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +60 -21
snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +60 -21
snowflake/ml/modeling/ensemble/ada_boost_classifier.py +60 -21
snowflake/ml/modeling/ensemble/ada_boost_regressor.py +60 -21
snowflake/ml/modeling/ensemble/bagging_classifier.py +60 -21
snowflake/ml/modeling/ensemble/bagging_regressor.py +60 -21
snowflake/ml/modeling/ensemble/extra_trees_classifier.py +60 -21
snowflake/ml/modeling/ensemble/extra_trees_regressor.py +60 -21
snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +60 -21
snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +60 -21
snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +60 -21
snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +60 -21
snowflake/ml/modeling/ensemble/isolation_forest.py +60 -21
snowflake/ml/modeling/ensemble/random_forest_classifier.py +60 -21
snowflake/ml/modeling/ensemble/random_forest_regressor.py +60 -21
snowflake/ml/modeling/ensemble/stacking_regressor.py +60 -21
snowflake/ml/modeling/ensemble/voting_classifier.py +60 -21
snowflake/ml/modeling/ensemble/voting_regressor.py +60 -21
snowflake/ml/modeling/feature_selection/generic_univariate_select.py +60 -21
snowflake/ml/modeling/feature_selection/select_fdr.py +60 -21
snowflake/ml/modeling/feature_selection/select_fpr.py +60 -21
snowflake/ml/modeling/feature_selection/select_fwe.py +60 -21
snowflake/ml/modeling/feature_selection/select_k_best.py +60 -21
snowflake/ml/modeling/feature_selection/select_percentile.py +60 -21
snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +60 -21
snowflake/ml/modeling/feature_selection/variance_threshold.py +60 -21
snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +60 -21
snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +60 -21
snowflake/ml/modeling/impute/iterative_imputer.py +60 -21
snowflake/ml/modeling/impute/knn_imputer.py +60 -21
snowflake/ml/modeling/impute/missing_indicator.py +60 -21
snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +60 -21
snowflake/ml/modeling/kernel_approximation/nystroem.py +60 -21
snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +60 -21
snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +60 -21
snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +60 -21
snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +60 -21
snowflake/ml/modeling/lightgbm/lgbm_classifier.py +60 -21
snowflake/ml/modeling/lightgbm/lgbm_regressor.py +60 -21
snowflake/ml/modeling/linear_model/ard_regression.py +60 -21
snowflake/ml/modeling/linear_model/bayesian_ridge.py +60 -21
snowflake/ml/modeling/linear_model/elastic_net.py +60 -21
snowflake/ml/modeling/linear_model/elastic_net_cv.py +60 -21
snowflake/ml/modeling/linear_model/gamma_regressor.py +60 -21
snowflake/ml/modeling/linear_model/huber_regressor.py +60 -21
snowflake/ml/modeling/linear_model/lars.py +60 -21
snowflake/ml/modeling/linear_model/lars_cv.py +60 -21
snowflake/ml/modeling/linear_model/lasso.py +60 -21
snowflake/ml/modeling/linear_model/lasso_cv.py +60 -21
snowflake/ml/modeling/linear_model/lasso_lars.py +60 -21
snowflake/ml/modeling/linear_model/lasso_lars_cv.py +60 -21
snowflake/ml/modeling/linear_model/lasso_lars_ic.py +60 -21
snowflake/ml/modeling/linear_model/linear_regression.py +60 -21
snowflake/ml/modeling/linear_model/logistic_regression.py +60 -21
snowflake/ml/modeling/linear_model/logistic_regression_cv.py +60 -21
snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +60 -21
snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +60 -21
snowflake/ml/modeling/linear_model/multi_task_lasso.py +60 -21
snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +60 -21
snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +60 -21
snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +60 -21
snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +60 -21
snowflake/ml/modeling/linear_model/perceptron.py +60 -21
snowflake/ml/modeling/linear_model/poisson_regressor.py +60 -21
snowflake/ml/modeling/linear_model/ransac_regressor.py +60 -21
snowflake/ml/modeling/linear_model/ridge.py +60 -21
snowflake/ml/modeling/linear_model/ridge_classifier.py +60 -21
snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +60 -21
snowflake/ml/modeling/linear_model/ridge_cv.py +60 -21
snowflake/ml/modeling/linear_model/sgd_classifier.py +60 -21
snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +60 -21
snowflake/ml/modeling/linear_model/sgd_regressor.py +60 -21
snowflake/ml/modeling/linear_model/theil_sen_regressor.py +60 -21
snowflake/ml/modeling/linear_model/tweedie_regressor.py +60 -21
snowflake/ml/modeling/manifold/isomap.py +60 -21
snowflake/ml/modeling/manifold/mds.py +60 -21
snowflake/ml/modeling/manifold/spectral_embedding.py +60 -21
snowflake/ml/modeling/manifold/tsne.py +60 -21
snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +60 -21
snowflake/ml/modeling/mixture/gaussian_mixture.py +60 -21
snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +60 -21
snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +60 -21
snowflake/ml/modeling/multiclass/output_code_classifier.py +60 -21
snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +60 -21
snowflake/ml/modeling/naive_bayes/categorical_nb.py +60 -21
snowflake/ml/modeling/naive_bayes/complement_nb.py +60 -21
snowflake/ml/modeling/naive_bayes/gaussian_nb.py +60 -21
snowflake/ml/modeling/naive_bayes/multinomial_nb.py +60 -21
snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +60 -21
snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +60 -21
snowflake/ml/modeling/neighbors/kernel_density.py +60 -21
snowflake/ml/modeling/neighbors/local_outlier_factor.py +60 -21
snowflake/ml/modeling/neighbors/nearest_centroid.py +60 -21
snowflake/ml/modeling/neighbors/nearest_neighbors.py +60 -21
snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +60 -21
snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +60 -21
snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +60 -21
snowflake/ml/modeling/neural_network/bernoulli_rbm.py +60 -21
snowflake/ml/modeling/neural_network/mlp_classifier.py +60 -21
snowflake/ml/modeling/neural_network/mlp_regressor.py +60 -21
snowflake/ml/modeling/parameters/disable_model_tracer.py +5 -0
snowflake/ml/modeling/pipeline/pipeline.py +1 -12
snowflake/ml/modeling/preprocessing/polynomial_features.py +60 -21
snowflake/ml/modeling/semi_supervised/label_propagation.py +60 -21
snowflake/ml/modeling/semi_supervised/label_spreading.py +60 -21
snowflake/ml/modeling/svm/linear_svc.py +60 -21
snowflake/ml/modeling/svm/linear_svr.py +60 -21
snowflake/ml/modeling/svm/nu_svc.py +60 -21
snowflake/ml/modeling/svm/nu_svr.py +60 -21
snowflake/ml/modeling/svm/svc.py +60 -21
snowflake/ml/modeling/svm/svr.py +60 -21
snowflake/ml/modeling/tree/decision_tree_classifier.py +60 -21
snowflake/ml/modeling/tree/decision_tree_regressor.py +60 -21
snowflake/ml/modeling/tree/extra_tree_classifier.py +60 -21
snowflake/ml/modeling/tree/extra_tree_regressor.py +60 -21
snowflake/ml/modeling/xgboost/xgb_classifier.py +63 -23
snowflake/ml/modeling/xgboost/xgb_regressor.py +63 -23
snowflake/ml/modeling/xgboost/xgbrf_classifier.py +63 -23
snowflake/ml/modeling/xgboost/xgbrf_regressor.py +63 -23
snowflake/ml/registry/_manager/model_manager.py +4 -0
snowflake/ml/registry/model_registry.py +1 -1
snowflake/ml/registry/registry.py +1 -2
snowflake/ml/version.py +1 -1
{snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/METADATA +23 -4
{snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/RECORD +211 -209
{snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/WHEEL +1 -1
snowflake/ml/data/torch_dataset.py +0 -33
{snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/LICENSE.txt +0 -0
{snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/top_level.txt +0 -0

snowflake/ml/modeling/neural_network/mlp_classifier.py CHANGED Viewed

@@ -4,14 +4,12 @@
 #
 import inspect
 import os
-import posixpath
-from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
-from typing_extensions import TypeGuard
+from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
 from uuid import uuid4
 import cloudpickle as cp
-import pandas as pd
 import numpy as np
+import pandas as pd
 from numpy import typing as npt
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
 from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
 from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
-from snowflake.ml._internal.utils import pkg_version_utils, identifier
+from snowflake.ml._internal.utils import identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.transformer_protocols import (
-    ModelTransformHandlers,
     BatchInferenceKwargsTypedDict,
     ScoreKwargsTypedDict
 )
@@ -660,12 +657,23 @@ class MLPClassifier(BaseTransformer):
             autogenerated=self._autogenerated,
             subproject=_SUBPROJECT,
         )
-        output_result, fitted_estimator = model_trainer.train_fit_predict(
-            drop_input_cols=self._drop_input_cols,
-            expected_output_cols_list=(
-                self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
-            ),
+        expected_output_cols = (
+            self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
         )
+        if isinstance(dataset, DataFrame):
+            expected_output_cols, example_output_pd_df = self._align_expected_output(
+                "fit_predict", dataset, expected_output_cols, output_cols_prefix
+            )
+            output_result, fitted_estimator = model_trainer.train_fit_predict(
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_list=expected_output_cols,
+                example_output_pd_df=example_output_pd_df,
+            )
+        else:
+            output_result, fitted_estimator = model_trainer.train_fit_predict(
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_list=expected_output_cols,
+            )
         self._sklearn_object = fitted_estimator
         self._is_fitted = True
         return output_result
@@ -744,12 +752,41 @@ class MLPClassifier(BaseTransformer):
         return rv
-    def _align_expected_output_names(
-        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
-    ) -> List[str]:
+    def _align_expected_output(
+        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
+    ) -> Tuple[List[str], pd.DataFrame]:
+        """ Run 1 line of data with the desired method, and return one tuple that consists of the output column names
+        and output dataframe with 1 line.
+        If the method is fit_predict, run 2 lines of data.
+        """
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+        # For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
+        # so change the minimum of number of rows to 2
+        num_examples = 2
+        statement_params = telemetry.get_function_usage_statement_params(
+            project=_PROJECT,
+            subproject=_SUBPROJECT,
+            function_name=telemetry.get_statement_params_full_func_name(
+                inspect.currentframe(), MLPClassifier.__class__.__name__
+            ),
+            api_calls=[Session.call],
+            custom_tags={"autogen": True} if self._autogenerated else None,
+        )
+        if output_cols_prefix == "fit_predict_":
+            if hasattr(self._sklearn_object, "n_clusters"):
+                # cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
+                num_examples = self._sklearn_object.n_clusters
+            elif hasattr(self._sklearn_object, "min_samples"):
+                # OPTICS default min_samples 5, which requires at least 5 lines of data
+                num_examples = self._sklearn_object.min_samples
+            elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
+                # LocalOutlierFactor expects n_neighbors <= n_samples
+                num_examples = self._sklearn_object.n_neighbors
+            sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
+        else:
+            sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
         # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
         # seen during the fit.
@@ -761,12 +798,14 @@ class MLPClassifier(BaseTransformer):
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
             output_df_columns_set -= set(self.sample_weight_col)
         # if the dimension of inferred output column names is correct; use it
         if len(expected_output_cols_list) == len(output_df_columns_set):
-            return expected_output_cols_list
+            return expected_output_cols_list, output_df_pd
         # otherwise, use the sklearn estimator's output
         else:
-            return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
+            expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
+            return expected_output_cols_list, output_df_pd[expected_output_cols_list]
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -814,7 +853,7 @@ class MLPClassifier(BaseTransformer):
                 drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )
@@ -882,7 +921,7 @@ class MLPClassifier(BaseTransformer):
                 drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -945,7 +984,7 @@ class MLPClassifier(BaseTransformer):
                 drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )
@@ -1010,7 +1049,7 @@ class MLPClassifier(BaseTransformer):
                 drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )

snowflake/ml/modeling/neural_network/mlp_regressor.py CHANGED Viewed

@@ -4,14 +4,12 @@
 #
 import inspect
 import os
-import posixpath
-from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
-from typing_extensions import TypeGuard
+from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
 from uuid import uuid4
 import cloudpickle as cp
-import pandas as pd
 import numpy as np
+import pandas as pd
 from numpy import typing as npt
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
 from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
 from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
-from snowflake.ml._internal.utils import pkg_version_utils, identifier
+from snowflake.ml._internal.utils import identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.transformer_protocols import (
-    ModelTransformHandlers,
     BatchInferenceKwargsTypedDict,
     ScoreKwargsTypedDict
 )
@@ -656,12 +653,23 @@ class MLPRegressor(BaseTransformer):
             autogenerated=self._autogenerated,
             subproject=_SUBPROJECT,
         )
-        output_result, fitted_estimator = model_trainer.train_fit_predict(
-            drop_input_cols=self._drop_input_cols,
-            expected_output_cols_list=(
-                self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
-            ),
+        expected_output_cols = (
+            self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
         )
+        if isinstance(dataset, DataFrame):
+            expected_output_cols, example_output_pd_df = self._align_expected_output(
+                "fit_predict", dataset, expected_output_cols, output_cols_prefix
+            )
+            output_result, fitted_estimator = model_trainer.train_fit_predict(
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_list=expected_output_cols,
+                example_output_pd_df=example_output_pd_df,
+            )
+        else:
+            output_result, fitted_estimator = model_trainer.train_fit_predict(
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_list=expected_output_cols,
+            )
         self._sklearn_object = fitted_estimator
         self._is_fitted = True
         return output_result
@@ -740,12 +748,41 @@ class MLPRegressor(BaseTransformer):
         return rv
-    def _align_expected_output_names(
-        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
-    ) -> List[str]:
+    def _align_expected_output(
+        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
+    ) -> Tuple[List[str], pd.DataFrame]:
+        """ Run 1 line of data with the desired method, and return one tuple that consists of the output column names
+        and output dataframe with 1 line.
+        If the method is fit_predict, run 2 lines of data.
+        """
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+        # For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
+        # so change the minimum of number of rows to 2
+        num_examples = 2
+        statement_params = telemetry.get_function_usage_statement_params(
+            project=_PROJECT,
+            subproject=_SUBPROJECT,
+            function_name=telemetry.get_statement_params_full_func_name(
+                inspect.currentframe(), MLPRegressor.__class__.__name__
+            ),
+            api_calls=[Session.call],
+            custom_tags={"autogen": True} if self._autogenerated else None,
+        )
+        if output_cols_prefix == "fit_predict_":
+            if hasattr(self._sklearn_object, "n_clusters"):
+                # cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
+                num_examples = self._sklearn_object.n_clusters
+            elif hasattr(self._sklearn_object, "min_samples"):
+                # OPTICS default min_samples 5, which requires at least 5 lines of data
+                num_examples = self._sklearn_object.min_samples
+            elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
+                # LocalOutlierFactor expects n_neighbors <= n_samples
+                num_examples = self._sklearn_object.n_neighbors
+            sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
+        else:
+            sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
         # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
         # seen during the fit.
@@ -757,12 +794,14 @@ class MLPRegressor(BaseTransformer):
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
             output_df_columns_set -= set(self.sample_weight_col)
         # if the dimension of inferred output column names is correct; use it
         if len(expected_output_cols_list) == len(output_df_columns_set):
-            return expected_output_cols_list
+            return expected_output_cols_list, output_df_pd
         # otherwise, use the sklearn estimator's output
         else:
-            return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
+            expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
+            return expected_output_cols_list, output_df_pd[expected_output_cols_list]
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -808,7 +847,7 @@ class MLPRegressor(BaseTransformer):
                 drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )
@@ -874,7 +913,7 @@ class MLPRegressor(BaseTransformer):
                 drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -937,7 +976,7 @@ class MLPRegressor(BaseTransformer):
                 drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )
@@ -1002,7 +1041,7 @@ class MLPRegressor(BaseTransformer):
                 drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )

snowflake/ml/modeling/parameters/disable_model_tracer.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Disables the snowpark observability tracer when running modeling fit"""
+from snowflake.ml.modeling._internal.snowpark_implementations import snowpark_trainer
+snowpark_trainer._ENABLE_TRACER = False

snowflake/ml/modeling/pipeline/pipeline.py CHANGED Viewed

@@ -418,9 +418,6 @@ class Pipeline(base.BaseTransformer):
         Returns:
             Fitted pipeline.
-        Raises:
-            ValueError: A pipeline incompatible with sklearn is used on MLRS
         """
         self._validate_steps()
@@ -437,8 +434,6 @@ class Pipeline(base.BaseTransformer):
         lineage_utils.set_data_sources(self, data_sources)
         if self._can_be_trained_in_ml_runtime(dataset):
-            if not self._is_convertible_to_sklearn:
-                raise ValueError("This pipeline cannot be converted to an sklearn pipeline.")
             self._fit_ml_runtime(dataset)
         elif squash and isinstance(dataset, snowpark.DataFrame):
@@ -611,14 +606,8 @@ class Pipeline(base.BaseTransformer):
         Returns:
             Output dataset.
-        Raises:
-            ValueError: An sklearn object has not been fit and stored before calling this function.
         """
-        if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
-            if self._sklearn_object is None:
-                raise ValueError("Model must be fit before inference.")
+        if os.environ.get(IN_ML_RUNTIME_ENV_VAR) and self._sklearn_object is not None:
             expected_output_cols = self._infer_output_cols()
             handler = ModelTransformerBuilder.build(
                 dataset=dataset,

snowflake/ml/modeling/preprocessing/polynomial_features.py CHANGED Viewed

@@ -4,14 +4,12 @@
 #
 import inspect
 import os
-import posixpath
-from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
-from typing_extensions import TypeGuard
+from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
 from uuid import uuid4
 import cloudpickle as cp
-import pandas as pd
 import numpy as np
+import pandas as pd
 from numpy import typing as npt
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
 from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
 from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
-from snowflake.ml._internal.utils import pkg_version_utils, identifier
+from snowflake.ml._internal.utils import identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.transformer_protocols import (
-    ModelTransformHandlers,
     BatchInferenceKwargsTypedDict,
     ScoreKwargsTypedDict
 )
@@ -497,12 +494,23 @@ class PolynomialFeatures(BaseTransformer):
             autogenerated=self._autogenerated,
             subproject=_SUBPROJECT,
         )
-        output_result, fitted_estimator = model_trainer.train_fit_predict(
-            drop_input_cols=self._drop_input_cols,
-            expected_output_cols_list=(
-                self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
-            ),
+        expected_output_cols = (
+            self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
         )
+        if isinstance(dataset, DataFrame):
+            expected_output_cols, example_output_pd_df = self._align_expected_output(
+                "fit_predict", dataset, expected_output_cols, output_cols_prefix
+            )
+            output_result, fitted_estimator = model_trainer.train_fit_predict(
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_list=expected_output_cols,
+                example_output_pd_df=example_output_pd_df,
+            )
+        else:
+            output_result, fitted_estimator = model_trainer.train_fit_predict(
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_list=expected_output_cols,
+            )
         self._sklearn_object = fitted_estimator
         self._is_fitted = True
         return output_result
@@ -583,12 +591,41 @@ class PolynomialFeatures(BaseTransformer):
         return rv
-    def _align_expected_output_names(
-        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
-    ) -> List[str]:
+    def _align_expected_output(
+        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
+    ) -> Tuple[List[str], pd.DataFrame]:
+        """ Run 1 line of data with the desired method, and return one tuple that consists of the output column names
+        and output dataframe with 1 line.
+        If the method is fit_predict, run 2 lines of data.
+        """
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+        # For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
+        # so change the minimum of number of rows to 2
+        num_examples = 2
+        statement_params = telemetry.get_function_usage_statement_params(
+            project=_PROJECT,
+            subproject=_SUBPROJECT,
+            function_name=telemetry.get_statement_params_full_func_name(
+                inspect.currentframe(), PolynomialFeatures.__class__.__name__
+            ),
+            api_calls=[Session.call],
+            custom_tags={"autogen": True} if self._autogenerated else None,
+        )
+        if output_cols_prefix == "fit_predict_":
+            if hasattr(self._sklearn_object, "n_clusters"):
+                # cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
+                num_examples = self._sklearn_object.n_clusters
+            elif hasattr(self._sklearn_object, "min_samples"):
+                # OPTICS default min_samples 5, which requires at least 5 lines of data
+                num_examples = self._sklearn_object.min_samples
+            elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
+                # LocalOutlierFactor expects n_neighbors <= n_samples
+                num_examples = self._sklearn_object.n_neighbors
+            sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
+        else:
+            sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
         # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
         # seen during the fit.
@@ -600,12 +637,14 @@ class PolynomialFeatures(BaseTransformer):
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
             output_df_columns_set -= set(self.sample_weight_col)
         # if the dimension of inferred output column names is correct; use it
         if len(expected_output_cols_list) == len(output_df_columns_set):
-            return expected_output_cols_list
+            return expected_output_cols_list, output_df_pd
         # otherwise, use the sklearn estimator's output
         else:
-            return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
+            expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
+            return expected_output_cols_list, output_df_pd[expected_output_cols_list]
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -651,7 +690,7 @@ class PolynomialFeatures(BaseTransformer):
                 drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )
@@ -717,7 +756,7 @@ class PolynomialFeatures(BaseTransformer):
                 drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -780,7 +819,7 @@ class PolynomialFeatures(BaseTransformer):
                 drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )
@@ -845,7 +884,7 @@ class PolynomialFeatures(BaseTransformer):
                 drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )

snowflake/ml/modeling/semi_supervised/label_propagation.py CHANGED Viewed

@@ -4,14 +4,12 @@
 #
 import inspect
 import os
-import posixpath
-from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
-from typing_extensions import TypeGuard
+from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
 from uuid import uuid4
 import cloudpickle as cp
-import pandas as pd
 import numpy as np
+import pandas as pd
 from numpy import typing as npt
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
 from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
 from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
-from snowflake.ml._internal.utils import pkg_version_utils, identifier
+from snowflake.ml._internal.utils import identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.transformer_protocols import (
-    ModelTransformHandlers,
     BatchInferenceKwargsTypedDict,
     ScoreKwargsTypedDict
 )
@@ -501,12 +498,23 @@ class LabelPropagation(BaseTransformer):
             autogenerated=self._autogenerated,
             subproject=_SUBPROJECT,
         )
-        output_result, fitted_estimator = model_trainer.train_fit_predict(
-            drop_input_cols=self._drop_input_cols,
-            expected_output_cols_list=(
-                self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
-            ),
+        expected_output_cols = (
+            self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
         )
+        if isinstance(dataset, DataFrame):
+            expected_output_cols, example_output_pd_df = self._align_expected_output(
+                "fit_predict", dataset, expected_output_cols, output_cols_prefix
+            )
+            output_result, fitted_estimator = model_trainer.train_fit_predict(
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_list=expected_output_cols,
+                example_output_pd_df=example_output_pd_df,
+            )
+        else:
+            output_result, fitted_estimator = model_trainer.train_fit_predict(
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_list=expected_output_cols,
+            )
         self._sklearn_object = fitted_estimator
         self._is_fitted = True
         return output_result
@@ -585,12 +593,41 @@ class LabelPropagation(BaseTransformer):
         return rv
-    def _align_expected_output_names(
-        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
-    ) -> List[str]:
+    def _align_expected_output(
+        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
+    ) -> Tuple[List[str], pd.DataFrame]:
+        """ Run 1 line of data with the desired method, and return one tuple that consists of the output column names
+        and output dataframe with 1 line.
+        If the method is fit_predict, run 2 lines of data.
+        """
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+        # For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
+        # so change the minimum of number of rows to 2
+        num_examples = 2
+        statement_params = telemetry.get_function_usage_statement_params(
+            project=_PROJECT,
+            subproject=_SUBPROJECT,
+            function_name=telemetry.get_statement_params_full_func_name(
+                inspect.currentframe(), LabelPropagation.__class__.__name__
+            ),
+            api_calls=[Session.call],
+            custom_tags={"autogen": True} if self._autogenerated else None,
+        )
+        if output_cols_prefix == "fit_predict_":
+            if hasattr(self._sklearn_object, "n_clusters"):
+                # cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
+                num_examples = self._sklearn_object.n_clusters
+            elif hasattr(self._sklearn_object, "min_samples"):
+                # OPTICS default min_samples 5, which requires at least 5 lines of data
+                num_examples = self._sklearn_object.min_samples
+            elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
+                # LocalOutlierFactor expects n_neighbors <= n_samples
+                num_examples = self._sklearn_object.n_neighbors
+            sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
+        else:
+            sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
         # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
         # seen during the fit.
@@ -602,12 +639,14 @@ class LabelPropagation(BaseTransformer):
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
             output_df_columns_set -= set(self.sample_weight_col)
         # if the dimension of inferred output column names is correct; use it
         if len(expected_output_cols_list) == len(output_df_columns_set):
-            return expected_output_cols_list
+            return expected_output_cols_list, output_df_pd
         # otherwise, use the sklearn estimator's output
         else:
-            return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
+            expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
+            return expected_output_cols_list, output_df_pd[expected_output_cols_list]
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -655,7 +694,7 @@ class LabelPropagation(BaseTransformer):
                 drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )
@@ -723,7 +762,7 @@ class LabelPropagation(BaseTransformer):
                 drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -786,7 +825,7 @@ class LabelPropagation(BaseTransformer):
                 drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )
@@ -851,7 +890,7 @@ class LabelPropagation(BaseTransformer):
                 drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
-            expected_output_cols = self._align_expected_output_names(
+            expected_output_cols, _ = self._align_expected_output(
                 inference_method, dataset, expected_output_cols, output_cols_prefix
             )

snowflake-ml-python 1.6.1__py3-none-any.whl → 1.6.2__py3-none-any.whl

snowflake-ml-python 1.6.1__py3-none-any.whl → 1.6.2__py3-none-any.whl