snowflake-ml-python 1.6.1__py3-none-any.whl → 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/telemetry.py +142 -20
- snowflake/ml/_internal/utils/identifier.py +48 -11
- snowflake/ml/_internal/utils/snowflake_env.py +23 -13
- snowflake/ml/_internal/utils/sql_identifier.py +1 -1
- snowflake/ml/_internal/utils/table_manager.py +19 -1
- snowflake/ml/_internal/utils/uri.py +2 -2
- snowflake/ml/data/data_connector.py +33 -7
- snowflake/ml/data/torch_utils.py +68 -0
- snowflake/ml/dataset/dataset.py +1 -3
- snowflake/ml/feature_store/feature_store.py +41 -17
- snowflake/ml/feature_store/feature_view.py +2 -2
- snowflake/ml/fileset/embedded_stage_fs.py +1 -1
- snowflake/ml/fileset/fileset.py +1 -1
- snowflake/ml/fileset/sfcfs.py +9 -3
- snowflake/ml/model/_client/model/model_version_impl.py +22 -7
- snowflake/ml/model/_client/ops/model_ops.py +39 -3
- snowflake/ml/model/_client/ops/service_ops.py +198 -7
- snowflake/ml/model/_client/service/model_deployment_spec.py +4 -5
- snowflake/ml/model/_client/service/model_deployment_spec_schema.py +1 -2
- snowflake/ml/model/_client/sql/service.py +85 -18
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +1 -1
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +3 -3
- snowflake/ml/model/_model_composer/model_composer.py +2 -0
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +3 -8
- snowflake/ml/model/_packager/model_handlers/_utils.py +46 -14
- snowflake/ml/model/_packager/model_handlers/catboost.py +17 -15
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +23 -15
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +15 -57
- snowflake/ml/model/_packager/model_handlers/llm.py +4 -2
- snowflake/ml/model/_packager/model_handlers/model_objective_utils.py +116 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +36 -24
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +119 -6
- snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
- snowflake/ml/model/_packager/model_handlers/xgboost.py +48 -48
- snowflake/ml/model/_packager/model_meta/model_meta.py +10 -7
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +0 -8
- snowflake/ml/model/_packager/model_packager.py +2 -0
- snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
- snowflake/ml/model/_signatures/utils.py +9 -0
- snowflake/ml/model/models/llm.py +3 -1
- snowflake/ml/model/type_hints.py +9 -1
- snowflake/ml/modeling/_internal/constants.py +1 -0
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +5 -5
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +9 -6
- snowflake/ml/modeling/_internal/model_specifications.py +2 -0
- snowflake/ml/modeling/_internal/model_trainer.py +1 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +2 -2
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +113 -160
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +60 -21
- snowflake/ml/modeling/cluster/affinity_propagation.py +60 -21
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +60 -21
- snowflake/ml/modeling/cluster/birch.py +60 -21
- snowflake/ml/modeling/cluster/bisecting_k_means.py +60 -21
- snowflake/ml/modeling/cluster/dbscan.py +60 -21
- snowflake/ml/modeling/cluster/feature_agglomeration.py +60 -21
- snowflake/ml/modeling/cluster/k_means.py +60 -21
- snowflake/ml/modeling/cluster/mean_shift.py +60 -21
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +60 -21
- snowflake/ml/modeling/cluster/optics.py +60 -21
- snowflake/ml/modeling/cluster/spectral_biclustering.py +60 -21
- snowflake/ml/modeling/cluster/spectral_clustering.py +60 -21
- snowflake/ml/modeling/cluster/spectral_coclustering.py +60 -21
- snowflake/ml/modeling/compose/column_transformer.py +60 -21
- snowflake/ml/modeling/compose/transformed_target_regressor.py +60 -21
- snowflake/ml/modeling/covariance/elliptic_envelope.py +60 -21
- snowflake/ml/modeling/covariance/empirical_covariance.py +60 -21
- snowflake/ml/modeling/covariance/graphical_lasso.py +60 -21
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +60 -21
- snowflake/ml/modeling/covariance/ledoit_wolf.py +60 -21
- snowflake/ml/modeling/covariance/min_cov_det.py +60 -21
- snowflake/ml/modeling/covariance/oas.py +60 -21
- snowflake/ml/modeling/covariance/shrunk_covariance.py +60 -21
- snowflake/ml/modeling/decomposition/dictionary_learning.py +60 -21
- snowflake/ml/modeling/decomposition/factor_analysis.py +60 -21
- snowflake/ml/modeling/decomposition/fast_ica.py +60 -21
- snowflake/ml/modeling/decomposition/incremental_pca.py +60 -21
- snowflake/ml/modeling/decomposition/kernel_pca.py +60 -21
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +60 -21
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +60 -21
- snowflake/ml/modeling/decomposition/pca.py +60 -21
- snowflake/ml/modeling/decomposition/sparse_pca.py +60 -21
- snowflake/ml/modeling/decomposition/truncated_svd.py +60 -21
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +60 -21
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +60 -21
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/bagging_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/bagging_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/isolation_forest.py +60 -21
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/stacking_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/voting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/voting_regressor.py +60 -21
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fdr.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fpr.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fwe.py +60 -21
- snowflake/ml/modeling/feature_selection/select_k_best.py +60 -21
- snowflake/ml/modeling/feature_selection/select_percentile.py +60 -21
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +60 -21
- snowflake/ml/modeling/feature_selection/variance_threshold.py +60 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +60 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +60 -21
- snowflake/ml/modeling/impute/iterative_imputer.py +60 -21
- snowflake/ml/modeling/impute/knn_imputer.py +60 -21
- snowflake/ml/modeling/impute/missing_indicator.py +60 -21
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +60 -21
- snowflake/ml/modeling/kernel_approximation/nystroem.py +60 -21
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +60 -21
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +60 -21
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +60 -21
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +60 -21
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +60 -21
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ard_regression.py +60 -21
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +60 -21
- snowflake/ml/modeling/linear_model/elastic_net.py +60 -21
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +60 -21
- snowflake/ml/modeling/linear_model/gamma_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/huber_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/lars.py +60 -21
- snowflake/ml/modeling/linear_model/lars_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +60 -21
- snowflake/ml/modeling/linear_model/linear_regression.py +60 -21
- snowflake/ml/modeling/linear_model/logistic_regression.py +60 -21
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +60 -21
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +60 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/perceptron.py +60 -21
- snowflake/ml/modeling/linear_model/poisson_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ransac_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ridge.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_cv.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +60 -21
- snowflake/ml/modeling/manifold/isomap.py +60 -21
- snowflake/ml/modeling/manifold/mds.py +60 -21
- snowflake/ml/modeling/manifold/spectral_embedding.py +60 -21
- snowflake/ml/modeling/manifold/tsne.py +60 -21
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +60 -21
- snowflake/ml/modeling/mixture/gaussian_mixture.py +60 -21
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +60 -21
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +60 -21
- snowflake/ml/modeling/multiclass/output_code_classifier.py +60 -21
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/complement_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +60 -21
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +60 -21
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +60 -21
- snowflake/ml/modeling/neighbors/kernel_density.py +60 -21
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +60 -21
- snowflake/ml/modeling/neighbors/nearest_centroid.py +60 -21
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +60 -21
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +60 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +60 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +60 -21
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +60 -21
- snowflake/ml/modeling/neural_network/mlp_classifier.py +60 -21
- snowflake/ml/modeling/neural_network/mlp_regressor.py +60 -21
- snowflake/ml/modeling/parameters/disable_model_tracer.py +5 -0
- snowflake/ml/modeling/pipeline/pipeline.py +1 -12
- snowflake/ml/modeling/preprocessing/polynomial_features.py +60 -21
- snowflake/ml/modeling/semi_supervised/label_propagation.py +60 -21
- snowflake/ml/modeling/semi_supervised/label_spreading.py +60 -21
- snowflake/ml/modeling/svm/linear_svc.py +60 -21
- snowflake/ml/modeling/svm/linear_svr.py +60 -21
- snowflake/ml/modeling/svm/nu_svc.py +60 -21
- snowflake/ml/modeling/svm/nu_svr.py +60 -21
- snowflake/ml/modeling/svm/svc.py +60 -21
- snowflake/ml/modeling/svm/svr.py +60 -21
- snowflake/ml/modeling/tree/decision_tree_classifier.py +60 -21
- snowflake/ml/modeling/tree/decision_tree_regressor.py +60 -21
- snowflake/ml/modeling/tree/extra_tree_classifier.py +60 -21
- snowflake/ml/modeling/tree/extra_tree_regressor.py +60 -21
- snowflake/ml/modeling/xgboost/xgb_classifier.py +63 -23
- snowflake/ml/modeling/xgboost/xgb_regressor.py +63 -23
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +63 -23
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +63 -23
- snowflake/ml/registry/_manager/model_manager.py +4 -0
- snowflake/ml/registry/model_registry.py +1 -1
- snowflake/ml/registry/registry.py +1 -2
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/METADATA +23 -4
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/RECORD +211 -209
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/WHEEL +1 -1
- snowflake/ml/data/torch_dataset.py +0 -33
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/top_level.txt +0 -0
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -485,12 +482,23 @@ class RBFSampler(BaseTransformer):
|
|
485
482
|
autogenerated=self._autogenerated,
|
486
483
|
subproject=_SUBPROJECT,
|
487
484
|
)
|
488
|
-
|
489
|
-
|
490
|
-
expected_output_cols_list=(
|
491
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
492
|
-
),
|
485
|
+
expected_output_cols = (
|
486
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
493
487
|
)
|
488
|
+
if isinstance(dataset, DataFrame):
|
489
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
490
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
491
|
+
)
|
492
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
493
|
+
drop_input_cols=self._drop_input_cols,
|
494
|
+
expected_output_cols_list=expected_output_cols,
|
495
|
+
example_output_pd_df=example_output_pd_df,
|
496
|
+
)
|
497
|
+
else:
|
498
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
499
|
+
drop_input_cols=self._drop_input_cols,
|
500
|
+
expected_output_cols_list=expected_output_cols,
|
501
|
+
)
|
494
502
|
self._sklearn_object = fitted_estimator
|
495
503
|
self._is_fitted = True
|
496
504
|
return output_result
|
@@ -571,12 +579,41 @@ class RBFSampler(BaseTransformer):
|
|
571
579
|
|
572
580
|
return rv
|
573
581
|
|
574
|
-
def
|
575
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
576
|
-
) -> List[str]:
|
582
|
+
def _align_expected_output(
|
583
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
584
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
585
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
586
|
+
and output dataframe with 1 line.
|
587
|
+
If the method is fit_predict, run 2 lines of data.
|
588
|
+
"""
|
577
589
|
# in case the inferred output column names dimension is different
|
578
590
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
579
|
-
|
591
|
+
|
592
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
593
|
+
# so change the minimum of number of rows to 2
|
594
|
+
num_examples = 2
|
595
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
596
|
+
project=_PROJECT,
|
597
|
+
subproject=_SUBPROJECT,
|
598
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
599
|
+
inspect.currentframe(), RBFSampler.__class__.__name__
|
600
|
+
),
|
601
|
+
api_calls=[Session.call],
|
602
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
603
|
+
)
|
604
|
+
if output_cols_prefix == "fit_predict_":
|
605
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
606
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
607
|
+
num_examples = self._sklearn_object.n_clusters
|
608
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
609
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
610
|
+
num_examples = self._sklearn_object.min_samples
|
611
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
612
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
613
|
+
num_examples = self._sklearn_object.n_neighbors
|
614
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
615
|
+
else:
|
616
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
580
617
|
|
581
618
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
582
619
|
# seen during the fit.
|
@@ -588,12 +625,14 @@ class RBFSampler(BaseTransformer):
|
|
588
625
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
589
626
|
if self.sample_weight_col:
|
590
627
|
output_df_columns_set -= set(self.sample_weight_col)
|
628
|
+
|
591
629
|
# if the dimension of inferred output column names is correct; use it
|
592
630
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
593
|
-
return expected_output_cols_list
|
631
|
+
return expected_output_cols_list, output_df_pd
|
594
632
|
# otherwise, use the sklearn estimator's output
|
595
633
|
else:
|
596
|
-
|
634
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
635
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
597
636
|
|
598
637
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
599
638
|
@telemetry.send_api_usage_telemetry(
|
@@ -639,7 +678,7 @@ class RBFSampler(BaseTransformer):
|
|
639
678
|
drop_input_cols=self._drop_input_cols,
|
640
679
|
expected_output_cols_type="float",
|
641
680
|
)
|
642
|
-
expected_output_cols = self.
|
681
|
+
expected_output_cols, _ = self._align_expected_output(
|
643
682
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
644
683
|
)
|
645
684
|
|
@@ -705,7 +744,7 @@ class RBFSampler(BaseTransformer):
|
|
705
744
|
drop_input_cols=self._drop_input_cols,
|
706
745
|
expected_output_cols_type="float",
|
707
746
|
)
|
708
|
-
expected_output_cols = self.
|
747
|
+
expected_output_cols, _ = self._align_expected_output(
|
709
748
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
710
749
|
)
|
711
750
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -768,7 +807,7 @@ class RBFSampler(BaseTransformer):
|
|
768
807
|
drop_input_cols=self._drop_input_cols,
|
769
808
|
expected_output_cols_type="float",
|
770
809
|
)
|
771
|
-
expected_output_cols = self.
|
810
|
+
expected_output_cols, _ = self._align_expected_output(
|
772
811
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
773
812
|
)
|
774
813
|
|
@@ -833,7 +872,7 @@ class RBFSampler(BaseTransformer):
|
|
833
872
|
drop_input_cols = self._drop_input_cols,
|
834
873
|
expected_output_cols_type="float",
|
835
874
|
)
|
836
|
-
expected_output_cols = self.
|
875
|
+
expected_output_cols, _ = self._align_expected_output(
|
837
876
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
838
877
|
)
|
839
878
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -483,12 +480,23 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
483
480
|
autogenerated=self._autogenerated,
|
484
481
|
subproject=_SUBPROJECT,
|
485
482
|
)
|
486
|
-
|
487
|
-
|
488
|
-
expected_output_cols_list=(
|
489
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
490
|
-
),
|
483
|
+
expected_output_cols = (
|
484
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
491
485
|
)
|
486
|
+
if isinstance(dataset, DataFrame):
|
487
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
488
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
489
|
+
)
|
490
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
491
|
+
drop_input_cols=self._drop_input_cols,
|
492
|
+
expected_output_cols_list=expected_output_cols,
|
493
|
+
example_output_pd_df=example_output_pd_df,
|
494
|
+
)
|
495
|
+
else:
|
496
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
497
|
+
drop_input_cols=self._drop_input_cols,
|
498
|
+
expected_output_cols_list=expected_output_cols,
|
499
|
+
)
|
492
500
|
self._sklearn_object = fitted_estimator
|
493
501
|
self._is_fitted = True
|
494
502
|
return output_result
|
@@ -569,12 +577,41 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
569
577
|
|
570
578
|
return rv
|
571
579
|
|
572
|
-
def
|
573
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
574
|
-
) -> List[str]:
|
580
|
+
def _align_expected_output(
|
581
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
582
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
583
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
584
|
+
and output dataframe with 1 line.
|
585
|
+
If the method is fit_predict, run 2 lines of data.
|
586
|
+
"""
|
575
587
|
# in case the inferred output column names dimension is different
|
576
588
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
577
|
-
|
589
|
+
|
590
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
591
|
+
# so change the minimum of number of rows to 2
|
592
|
+
num_examples = 2
|
593
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
594
|
+
project=_PROJECT,
|
595
|
+
subproject=_SUBPROJECT,
|
596
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
597
|
+
inspect.currentframe(), SkewedChi2Sampler.__class__.__name__
|
598
|
+
),
|
599
|
+
api_calls=[Session.call],
|
600
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
601
|
+
)
|
602
|
+
if output_cols_prefix == "fit_predict_":
|
603
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
604
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
605
|
+
num_examples = self._sklearn_object.n_clusters
|
606
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
607
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
608
|
+
num_examples = self._sklearn_object.min_samples
|
609
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
610
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
611
|
+
num_examples = self._sklearn_object.n_neighbors
|
612
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
613
|
+
else:
|
614
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
578
615
|
|
579
616
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
580
617
|
# seen during the fit.
|
@@ -586,12 +623,14 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
586
623
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
587
624
|
if self.sample_weight_col:
|
588
625
|
output_df_columns_set -= set(self.sample_weight_col)
|
626
|
+
|
589
627
|
# if the dimension of inferred output column names is correct; use it
|
590
628
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
591
|
-
return expected_output_cols_list
|
629
|
+
return expected_output_cols_list, output_df_pd
|
592
630
|
# otherwise, use the sklearn estimator's output
|
593
631
|
else:
|
594
|
-
|
632
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
633
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
595
634
|
|
596
635
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
597
636
|
@telemetry.send_api_usage_telemetry(
|
@@ -637,7 +676,7 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
637
676
|
drop_input_cols=self._drop_input_cols,
|
638
677
|
expected_output_cols_type="float",
|
639
678
|
)
|
640
|
-
expected_output_cols = self.
|
679
|
+
expected_output_cols, _ = self._align_expected_output(
|
641
680
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
642
681
|
)
|
643
682
|
|
@@ -703,7 +742,7 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
703
742
|
drop_input_cols=self._drop_input_cols,
|
704
743
|
expected_output_cols_type="float",
|
705
744
|
)
|
706
|
-
expected_output_cols = self.
|
745
|
+
expected_output_cols, _ = self._align_expected_output(
|
707
746
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
708
747
|
)
|
709
748
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -766,7 +805,7 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
766
805
|
drop_input_cols=self._drop_input_cols,
|
767
806
|
expected_output_cols_type="float",
|
768
807
|
)
|
769
|
-
expected_output_cols = self.
|
808
|
+
expected_output_cols, _ = self._align_expected_output(
|
770
809
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
771
810
|
)
|
772
811
|
|
@@ -831,7 +870,7 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
831
870
|
drop_input_cols = self._drop_input_cols,
|
832
871
|
expected_output_cols_type="float",
|
833
872
|
)
|
834
|
-
expected_output_cols = self.
|
873
|
+
expected_output_cols, _ = self._align_expected_output(
|
835
874
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
836
875
|
)
|
837
876
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -517,12 +514,23 @@ class KernelRidge(BaseTransformer):
|
|
517
514
|
autogenerated=self._autogenerated,
|
518
515
|
subproject=_SUBPROJECT,
|
519
516
|
)
|
520
|
-
|
521
|
-
|
522
|
-
expected_output_cols_list=(
|
523
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
524
|
-
),
|
517
|
+
expected_output_cols = (
|
518
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
525
519
|
)
|
520
|
+
if isinstance(dataset, DataFrame):
|
521
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
522
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
523
|
+
)
|
524
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
525
|
+
drop_input_cols=self._drop_input_cols,
|
526
|
+
expected_output_cols_list=expected_output_cols,
|
527
|
+
example_output_pd_df=example_output_pd_df,
|
528
|
+
)
|
529
|
+
else:
|
530
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
531
|
+
drop_input_cols=self._drop_input_cols,
|
532
|
+
expected_output_cols_list=expected_output_cols,
|
533
|
+
)
|
526
534
|
self._sklearn_object = fitted_estimator
|
527
535
|
self._is_fitted = True
|
528
536
|
return output_result
|
@@ -601,12 +609,41 @@ class KernelRidge(BaseTransformer):
|
|
601
609
|
|
602
610
|
return rv
|
603
611
|
|
604
|
-
def
|
605
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
606
|
-
) -> List[str]:
|
612
|
+
def _align_expected_output(
|
613
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
614
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
615
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
616
|
+
and output dataframe with 1 line.
|
617
|
+
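If output_cols_prefix is "fit_predict_", run 2 lines of data by default; estimators exposing n_clusters, min_samples, or n_neighbors (with n_samples) use that value instead.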
|
618
|
+
"""
|
607
619
|
# in case the inferred output column names dimension is different
|
608
620
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
609
|
-
|
621
|
+
|
622
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
623
|
+
# so default the number of sampled rows to 2
|
624
|
+
num_examples = 2
|
625
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
626
|
+
project=_PROJECT,
|
627
|
+
subproject=_SUBPROJECT,
|
628
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
629
|
+
inspect.currentframe(), KernelRidge.__class__.__name__
|
630
|
+
),
|
631
|
+
api_calls=[Session.call],
|
632
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
633
|
+
)
|
634
|
+
if output_cols_prefix == "fit_predict_":
|
635
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
636
|
+
# cluster classes such as BisectingKMeans require # of examples >= n_clusters
|
637
|
+
num_examples = self._sklearn_object.n_clusters
|
638
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
639
|
+
# OPTICS defaults min_samples to 5, which requires at least 5 lines of data
|
640
|
+
num_examples = self._sklearn_object.min_samples
|
641
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
642
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
643
|
+
num_examples = self._sklearn_object.n_neighbors
|
644
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
645
|
+
else:
|
646
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
610
647
|
|
611
648
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
612
649
|
# seen during the fit.
|
@@ -618,12 +655,14 @@ class KernelRidge(BaseTransformer):
|
|
618
655
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
619
656
|
if self.sample_weight_col:
|
620
657
|
output_df_columns_set -= set(self.sample_weight_col)
|
658
|
+
|
621
659
|
# if the dimension of inferred output column names is correct, use it
|
622
660
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
623
|
-
return expected_output_cols_list
|
661
|
+
return expected_output_cols_list, output_df_pd
|
624
662
|
# otherwise, use the sklearn estimator's output
|
625
663
|
else:
|
626
|
-
|
664
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
665
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
627
666
|
|
628
667
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
629
668
|
@telemetry.send_api_usage_telemetry(
|
@@ -669,7 +708,7 @@ class KernelRidge(BaseTransformer):
|
|
669
708
|
drop_input_cols=self._drop_input_cols,
|
670
709
|
expected_output_cols_type="float",
|
671
710
|
)
|
672
|
-
expected_output_cols = self.
|
711
|
+
expected_output_cols, _ = self._align_expected_output(
|
673
712
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
674
713
|
)
|
675
714
|
|
@@ -735,7 +774,7 @@ class KernelRidge(BaseTransformer):
|
|
735
774
|
drop_input_cols=self._drop_input_cols,
|
736
775
|
expected_output_cols_type="float",
|
737
776
|
)
|
738
|
-
expected_output_cols = self.
|
777
|
+
expected_output_cols, _ = self._align_expected_output(
|
739
778
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
740
779
|
)
|
741
780
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -798,7 +837,7 @@ class KernelRidge(BaseTransformer):
|
|
798
837
|
drop_input_cols=self._drop_input_cols,
|
799
838
|
expected_output_cols_type="float",
|
800
839
|
)
|
801
|
-
expected_output_cols = self.
|
840
|
+
expected_output_cols, _ = self._align_expected_output(
|
802
841
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
803
842
|
)
|
804
843
|
|
@@ -863,7 +902,7 @@ class KernelRidge(BaseTransformer):
|
|
863
902
|
drop_input_cols = self._drop_input_cols,
|
864
903
|
expected_output_cols_type="float",
|
865
904
|
)
|
866
|
-
expected_output_cols = self.
|
905
|
+
expected_output_cols, _ = self._align_expected_output(
|
867
906
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
868
907
|
)
|
869
908
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -506,12 +503,23 @@ class LGBMClassifier(BaseTransformer):
|
|
506
503
|
autogenerated=self._autogenerated,
|
507
504
|
subproject=_SUBPROJECT,
|
508
505
|
)
|
509
|
-
|
510
|
-
|
511
|
-
expected_output_cols_list=(
|
512
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
513
|
-
),
|
506
|
+
expected_output_cols = (
|
507
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
514
508
|
)
|
509
|
+
if isinstance(dataset, DataFrame):
|
510
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
511
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
512
|
+
)
|
513
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
514
|
+
drop_input_cols=self._drop_input_cols,
|
515
|
+
expected_output_cols_list=expected_output_cols,
|
516
|
+
example_output_pd_df=example_output_pd_df,
|
517
|
+
)
|
518
|
+
else:
|
519
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
520
|
+
drop_input_cols=self._drop_input_cols,
|
521
|
+
expected_output_cols_list=expected_output_cols,
|
522
|
+
)
|
515
523
|
self._sklearn_object = fitted_estimator
|
516
524
|
self._is_fitted = True
|
517
525
|
return output_result
|
@@ -590,12 +598,41 @@ class LGBMClassifier(BaseTransformer):
|
|
590
598
|
|
591
599
|
return rv
|
592
600
|
|
593
|
-
def
|
594
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
595
|
-
) -> List[str]:
|
601
|
+
def _align_expected_output(
|
602
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
603
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
604
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
605
|
+
and output dataframe with 1 line.
|
606
|
+
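If output_cols_prefix is "fit_predict_", run 2 lines of data by default; estimators exposing n_clusters, min_samples, or n_neighbors (with n_samples) use that value instead.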
|
607
|
+
"""
|
596
608
|
# in case the inferred output column names dimension is different
|
597
609
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
598
|
-
|
610
|
+
|
611
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
612
|
+
# so default the number of sampled rows to 2
|
613
|
+
num_examples = 2
|
614
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
615
|
+
project=_PROJECT,
|
616
|
+
subproject=_SUBPROJECT,
|
617
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
618
|
+
inspect.currentframe(), LGBMClassifier.__class__.__name__
|
619
|
+
),
|
620
|
+
api_calls=[Session.call],
|
621
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
622
|
+
)
|
623
|
+
if output_cols_prefix == "fit_predict_":
|
624
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
625
|
+
# cluster classes such as BisectingKMeans require # of examples >= n_clusters
|
626
|
+
num_examples = self._sklearn_object.n_clusters
|
627
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
628
|
+
# OPTICS defaults min_samples to 5, which requires at least 5 lines of data
|
629
|
+
num_examples = self._sklearn_object.min_samples
|
630
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
631
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
632
|
+
num_examples = self._sklearn_object.n_neighbors
|
633
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
634
|
+
else:
|
635
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
599
636
|
|
600
637
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
601
638
|
# seen during the fit.
|
@@ -607,12 +644,14 @@ class LGBMClassifier(BaseTransformer):
|
|
607
644
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
608
645
|
if self.sample_weight_col:
|
609
646
|
output_df_columns_set -= set(self.sample_weight_col)
|
647
|
+
|
610
648
|
# if the dimension of inferred output column names is correct, use it
|
611
649
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
612
|
-
return expected_output_cols_list
|
650
|
+
return expected_output_cols_list, output_df_pd
|
613
651
|
# otherwise, use the sklearn estimator's output
|
614
652
|
else:
|
615
|
-
|
653
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
654
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
616
655
|
|
617
656
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
618
657
|
@telemetry.send_api_usage_telemetry(
|
@@ -660,7 +699,7 @@ class LGBMClassifier(BaseTransformer):
|
|
660
699
|
drop_input_cols=self._drop_input_cols,
|
661
700
|
expected_output_cols_type="float",
|
662
701
|
)
|
663
|
-
expected_output_cols = self.
|
702
|
+
expected_output_cols, _ = self._align_expected_output(
|
664
703
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
665
704
|
)
|
666
705
|
|
@@ -728,7 +767,7 @@ class LGBMClassifier(BaseTransformer):
|
|
728
767
|
drop_input_cols=self._drop_input_cols,
|
729
768
|
expected_output_cols_type="float",
|
730
769
|
)
|
731
|
-
expected_output_cols = self.
|
770
|
+
expected_output_cols, _ = self._align_expected_output(
|
732
771
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
733
772
|
)
|
734
773
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -791,7 +830,7 @@ class LGBMClassifier(BaseTransformer):
|
|
791
830
|
drop_input_cols=self._drop_input_cols,
|
792
831
|
expected_output_cols_type="float",
|
793
832
|
)
|
794
|
-
expected_output_cols = self.
|
833
|
+
expected_output_cols, _ = self._align_expected_output(
|
795
834
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
796
835
|
)
|
797
836
|
|
@@ -856,7 +895,7 @@ class LGBMClassifier(BaseTransformer):
|
|
856
895
|
drop_input_cols = self._drop_input_cols,
|
857
896
|
expected_output_cols_type="float",
|
858
897
|
)
|
859
|
-
expected_output_cols = self.
|
898
|
+
expected_output_cols, _ = self._align_expected_output(
|
860
899
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
861
900
|
)
|
862
901
|
|