snowflake-ml-python 1.5.0__py3-none-any.whl → 1.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/_sentiment.py +7 -4
- snowflake/ml/_internal/env_utils.py +6 -0
- snowflake/ml/_internal/lineage/lineage_utils.py +95 -0
- snowflake/ml/_internal/telemetry.py +1 -0
- snowflake/ml/_internal/utils/identifier.py +1 -1
- snowflake/ml/_internal/utils/sql_identifier.py +14 -1
- snowflake/ml/_internal/utils/temp_file_utils.py +5 -2
- snowflake/ml/dataset/__init__.py +2 -1
- snowflake/ml/dataset/dataset.py +4 -3
- snowflake/ml/dataset/dataset_reader.py +5 -8
- snowflake/ml/feature_store/__init__.py +6 -0
- snowflake/ml/feature_store/access_manager.py +283 -0
- snowflake/ml/feature_store/feature_store.py +160 -100
- snowflake/ml/feature_store/feature_view.py +30 -19
- snowflake/ml/fileset/embedded_stage_fs.py +15 -12
- snowflake/ml/fileset/snowfs.py +2 -30
- snowflake/ml/fileset/stage_fs.py +25 -7
- snowflake/ml/model/_client/model/model_impl.py +46 -39
- snowflake/ml/model/_client/model/model_version_impl.py +24 -2
- snowflake/ml/model/_client/ops/metadata_ops.py +27 -4
- snowflake/ml/model/_client/ops/model_ops.py +174 -16
- snowflake/ml/model/_client/sql/_base.py +34 -0
- snowflake/ml/model/_client/sql/model.py +32 -39
- snowflake/ml/model/_client/sql/model_version.py +111 -42
- snowflake/ml/model/_client/sql/stage.py +6 -32
- snowflake/ml/model/_client/sql/tag.py +32 -56
- snowflake/ml/model/_model_composer/model_composer.py +8 -4
- snowflake/ml/model/_packager/model_handlers/mlflow.py +2 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +1 -3
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -27
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +90 -142
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +159 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +81 -3
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +8 -1
- snowflake/ml/modeling/cluster/affinity_propagation.py +8 -1
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +8 -1
- snowflake/ml/modeling/cluster/birch.py +8 -1
- snowflake/ml/modeling/cluster/bisecting_k_means.py +8 -1
- snowflake/ml/modeling/cluster/dbscan.py +8 -1
- snowflake/ml/modeling/cluster/feature_agglomeration.py +8 -1
- snowflake/ml/modeling/cluster/k_means.py +8 -1
- snowflake/ml/modeling/cluster/mean_shift.py +8 -1
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +8 -1
- snowflake/ml/modeling/cluster/optics.py +8 -1
- snowflake/ml/modeling/cluster/spectral_biclustering.py +8 -1
- snowflake/ml/modeling/cluster/spectral_clustering.py +8 -1
- snowflake/ml/modeling/cluster/spectral_coclustering.py +8 -1
- snowflake/ml/modeling/compose/column_transformer.py +8 -1
- snowflake/ml/modeling/compose/transformed_target_regressor.py +8 -1
- snowflake/ml/modeling/covariance/elliptic_envelope.py +8 -1
- snowflake/ml/modeling/covariance/empirical_covariance.py +8 -1
- snowflake/ml/modeling/covariance/graphical_lasso.py +8 -1
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +8 -1
- snowflake/ml/modeling/covariance/ledoit_wolf.py +8 -1
- snowflake/ml/modeling/covariance/min_cov_det.py +8 -1
- snowflake/ml/modeling/covariance/oas.py +8 -1
- snowflake/ml/modeling/covariance/shrunk_covariance.py +8 -1
- snowflake/ml/modeling/decomposition/dictionary_learning.py +8 -1
- snowflake/ml/modeling/decomposition/factor_analysis.py +8 -1
- snowflake/ml/modeling/decomposition/fast_ica.py +8 -1
- snowflake/ml/modeling/decomposition/incremental_pca.py +8 -1
- snowflake/ml/modeling/decomposition/kernel_pca.py +8 -1
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +8 -1
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +8 -1
- snowflake/ml/modeling/decomposition/pca.py +8 -1
- snowflake/ml/modeling/decomposition/sparse_pca.py +8 -1
- snowflake/ml/modeling/decomposition/truncated_svd.py +8 -1
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +8 -1
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +8 -1
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +8 -1
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +8 -1
- snowflake/ml/modeling/ensemble/bagging_classifier.py +8 -1
- snowflake/ml/modeling/ensemble/bagging_regressor.py +8 -1
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +8 -1
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +8 -1
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +8 -1
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +8 -1
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +8 -1
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +8 -1
- snowflake/ml/modeling/ensemble/isolation_forest.py +8 -1
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +8 -1
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +8 -1
- snowflake/ml/modeling/ensemble/stacking_regressor.py +8 -1
- snowflake/ml/modeling/ensemble/voting_classifier.py +8 -1
- snowflake/ml/modeling/ensemble/voting_regressor.py +8 -1
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +8 -1
- snowflake/ml/modeling/feature_selection/select_fdr.py +8 -1
- snowflake/ml/modeling/feature_selection/select_fpr.py +8 -1
- snowflake/ml/modeling/feature_selection/select_fwe.py +8 -1
- snowflake/ml/modeling/feature_selection/select_k_best.py +8 -1
- snowflake/ml/modeling/feature_selection/select_percentile.py +8 -1
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +8 -1
- snowflake/ml/modeling/feature_selection/variance_threshold.py +8 -1
- snowflake/ml/modeling/framework/base.py +4 -3
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +8 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +8 -1
- snowflake/ml/modeling/impute/iterative_imputer.py +8 -1
- snowflake/ml/modeling/impute/knn_imputer.py +8 -1
- snowflake/ml/modeling/impute/missing_indicator.py +8 -1
- snowflake/ml/modeling/impute/simple_imputer.py +21 -2
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +8 -1
- snowflake/ml/modeling/kernel_approximation/nystroem.py +8 -1
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +8 -1
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +8 -1
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +8 -1
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +8 -1
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +8 -1
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/ard_regression.py +8 -1
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +8 -1
- snowflake/ml/modeling/linear_model/elastic_net.py +8 -1
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +8 -1
- snowflake/ml/modeling/linear_model/gamma_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/huber_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/lars.py +8 -1
- snowflake/ml/modeling/linear_model/lars_cv.py +8 -1
- snowflake/ml/modeling/linear_model/lasso.py +8 -1
- snowflake/ml/modeling/linear_model/lasso_cv.py +8 -1
- snowflake/ml/modeling/linear_model/lasso_lars.py +8 -1
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +8 -1
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +8 -1
- snowflake/ml/modeling/linear_model/linear_regression.py +8 -1
- snowflake/ml/modeling/linear_model/logistic_regression.py +8 -1
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +8 -1
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +8 -1
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +8 -1
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +8 -1
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +8 -1
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +8 -1
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +8 -1
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/perceptron.py +8 -1
- snowflake/ml/modeling/linear_model/poisson_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/ransac_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/ridge.py +8 -1
- snowflake/ml/modeling/linear_model/ridge_classifier.py +8 -1
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +8 -1
- snowflake/ml/modeling/linear_model/ridge_cv.py +8 -1
- snowflake/ml/modeling/linear_model/sgd_classifier.py +8 -1
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +8 -1
- snowflake/ml/modeling/linear_model/sgd_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +8 -1
- snowflake/ml/modeling/manifold/isomap.py +8 -1
- snowflake/ml/modeling/manifold/mds.py +8 -1
- snowflake/ml/modeling/manifold/spectral_embedding.py +8 -1
- snowflake/ml/modeling/manifold/tsne.py +8 -1
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +8 -1
- snowflake/ml/modeling/mixture/gaussian_mixture.py +8 -1
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +8 -1
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +8 -1
- snowflake/ml/modeling/multiclass/output_code_classifier.py +8 -1
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +8 -1
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +8 -1
- snowflake/ml/modeling/naive_bayes/complement_nb.py +8 -1
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +8 -1
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +8 -1
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +8 -1
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +8 -1
- snowflake/ml/modeling/neighbors/kernel_density.py +8 -1
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +8 -1
- snowflake/ml/modeling/neighbors/nearest_centroid.py +8 -1
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +8 -1
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +8 -1
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +8 -1
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +8 -1
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +8 -1
- snowflake/ml/modeling/neural_network/mlp_classifier.py +8 -1
- snowflake/ml/modeling/neural_network/mlp_regressor.py +8 -1
- snowflake/ml/modeling/parameters/enable_anonymous_sproc.py +5 -0
- snowflake/ml/modeling/pipeline/pipeline.py +27 -7
- snowflake/ml/modeling/preprocessing/polynomial_features.py +8 -1
- snowflake/ml/modeling/semi_supervised/label_propagation.py +8 -1
- snowflake/ml/modeling/semi_supervised/label_spreading.py +8 -1
- snowflake/ml/modeling/svm/linear_svc.py +8 -1
- snowflake/ml/modeling/svm/linear_svr.py +8 -1
- snowflake/ml/modeling/svm/nu_svc.py +8 -1
- snowflake/ml/modeling/svm/nu_svr.py +8 -1
- snowflake/ml/modeling/svm/svc.py +8 -1
- snowflake/ml/modeling/svm/svr.py +8 -1
- snowflake/ml/modeling/tree/decision_tree_classifier.py +8 -1
- snowflake/ml/modeling/tree/decision_tree_regressor.py +8 -1
- snowflake/ml/modeling/tree/extra_tree_classifier.py +8 -1
- snowflake/ml/modeling/tree/extra_tree_regressor.py +8 -1
- snowflake/ml/modeling/xgboost/xgb_classifier.py +8 -1
- snowflake/ml/modeling/xgboost/xgb_regressor.py +8 -1
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +8 -1
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +8 -1
- snowflake/ml/registry/_manager/model_manager.py +95 -8
- snowflake/ml/registry/registry.py +10 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/METADATA +66 -10
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/RECORD +196 -192
- snowflake/ml/_internal/lineage/dataset_dataframe.py +0 -44
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/top_level.txt +0 -0
@@ -638,7 +638,14 @@ class BaggingClassifier(BaseTransformer):
|
|
638
638
|
) -> List[str]:
|
639
639
|
# in case the inferred output column names dimension is different
|
640
640
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
641
|
-
|
641
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
642
|
+
|
643
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
644
|
+
# seen during the fit.
|
645
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
646
|
+
sample_pd_df.columns = snowpark_column_names
|
647
|
+
|
648
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
642
649
|
output_df_columns = list(output_df_pd.columns)
|
643
650
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
644
651
|
if self.sample_weight_col:
|
@@ -638,7 +638,14 @@ class BaggingRegressor(BaseTransformer):
|
|
638
638
|
) -> List[str]:
|
639
639
|
# in case the inferred output column names dimension is different
|
640
640
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
641
|
-
|
641
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
642
|
+
|
643
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
644
|
+
# seen during the fit.
|
645
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
646
|
+
sample_pd_df.columns = snowpark_column_names
|
647
|
+
|
648
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
642
649
|
output_df_columns = list(output_df_pd.columns)
|
643
650
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
644
651
|
if self.sample_weight_col:
|
@@ -741,7 +741,14 @@ class ExtraTreesClassifier(BaseTransformer):
|
|
741
741
|
) -> List[str]:
|
742
742
|
# in case the inferred output column names dimension is different
|
743
743
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
744
|
-
|
744
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
745
|
+
|
746
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
747
|
+
# seen during the fit.
|
748
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
749
|
+
sample_pd_df.columns = snowpark_column_names
|
750
|
+
|
751
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
745
752
|
output_df_columns = list(output_df_pd.columns)
|
746
753
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
747
754
|
if self.sample_weight_col:
|
@@ -720,7 +720,14 @@ class ExtraTreesRegressor(BaseTransformer):
|
|
720
720
|
) -> List[str]:
|
721
721
|
# in case the inferred output column names dimension is different
|
722
722
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
723
|
-
|
723
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
724
|
+
|
725
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
726
|
+
# seen during the fit.
|
727
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
728
|
+
sample_pd_df.columns = snowpark_column_names
|
729
|
+
|
730
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
724
731
|
output_df_columns = list(output_df_pd.columns)
|
725
732
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
726
733
|
if self.sample_weight_col:
|
@@ -753,7 +753,14 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
753
753
|
) -> List[str]:
|
754
754
|
# in case the inferred output column names dimension is different
|
755
755
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
756
|
-
|
756
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
757
|
+
|
758
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
759
|
+
# seen during the fit.
|
760
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
761
|
+
sample_pd_df.columns = snowpark_column_names
|
762
|
+
|
763
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
757
764
|
output_df_columns = list(output_df_pd.columns)
|
758
765
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
759
766
|
if self.sample_weight_col:
|
@@ -762,7 +762,14 @@ class GradientBoostingRegressor(BaseTransformer):
|
|
762
762
|
) -> List[str]:
|
763
763
|
# in case the inferred output column names dimension is different
|
764
764
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
765
|
-
|
765
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
766
|
+
|
767
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
768
|
+
# seen during the fit.
|
769
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
770
|
+
sample_pd_df.columns = snowpark_column_names
|
771
|
+
|
772
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
766
773
|
output_df_columns = list(output_df_pd.columns)
|
767
774
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
768
775
|
if self.sample_weight_col:
|
@@ -734,7 +734,14 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
734
734
|
) -> List[str]:
|
735
735
|
# in case the inferred output column names dimension is different
|
736
736
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
737
|
-
|
737
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
738
|
+
|
739
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
740
|
+
# seen during the fit.
|
741
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
742
|
+
sample_pd_df.columns = snowpark_column_names
|
743
|
+
|
744
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
738
745
|
output_df_columns = list(output_df_pd.columns)
|
739
746
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
740
747
|
if self.sample_weight_col:
|
@@ -725,7 +725,14 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
725
725
|
) -> List[str]:
|
726
726
|
# in case the inferred output column names dimension is different
|
727
727
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
728
|
-
|
728
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
729
|
+
|
730
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
731
|
+
# seen during the fit.
|
732
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
733
|
+
sample_pd_df.columns = snowpark_column_names
|
734
|
+
|
735
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
729
736
|
output_df_columns = list(output_df_pd.columns)
|
730
737
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
731
738
|
if self.sample_weight_col:
|
@@ -627,7 +627,14 @@ class IsolationForest(BaseTransformer):
|
|
627
627
|
) -> List[str]:
|
628
628
|
# in case the inferred output column names dimension is different
|
629
629
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
630
|
-
|
630
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
631
|
+
|
632
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
633
|
+
# seen during the fit.
|
634
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
635
|
+
sample_pd_df.columns = snowpark_column_names
|
636
|
+
|
637
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
631
638
|
output_df_columns = list(output_df_pd.columns)
|
632
639
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
633
640
|
if self.sample_weight_col:
|
@@ -737,7 +737,14 @@ class RandomForestClassifier(BaseTransformer):
|
|
737
737
|
) -> List[str]:
|
738
738
|
# in case the inferred output column names dimension is different
|
739
739
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
740
|
-
|
740
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
741
|
+
|
742
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
743
|
+
# seen during the fit.
|
744
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
745
|
+
sample_pd_df.columns = snowpark_column_names
|
746
|
+
|
747
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
741
748
|
output_df_columns = list(output_df_pd.columns)
|
742
749
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
743
750
|
if self.sample_weight_col:
|
@@ -716,7 +716,14 @@ class RandomForestRegressor(BaseTransformer):
|
|
716
716
|
) -> List[str]:
|
717
717
|
# in case the inferred output column names dimension is different
|
718
718
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
719
|
-
|
719
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
720
|
+
|
721
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
722
|
+
# seen during the fit.
|
723
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
724
|
+
sample_pd_df.columns = snowpark_column_names
|
725
|
+
|
726
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
720
727
|
output_df_columns = list(output_df_pd.columns)
|
721
728
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
722
729
|
if self.sample_weight_col:
|
@@ -621,7 +621,14 @@ class StackingRegressor(BaseTransformer):
|
|
621
621
|
) -> List[str]:
|
622
622
|
# in case the inferred output column names dimension is different
|
623
623
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
624
|
-
|
624
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
625
|
+
|
626
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
627
|
+
# seen during the fit.
|
628
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
629
|
+
sample_pd_df.columns = snowpark_column_names
|
630
|
+
|
631
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
625
632
|
output_df_columns = list(output_df_pd.columns)
|
626
633
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
627
634
|
if self.sample_weight_col:
|
@@ -603,7 +603,14 @@ class VotingClassifier(BaseTransformer):
|
|
603
603
|
) -> List[str]:
|
604
604
|
# in case the inferred output column names dimension is different
|
605
605
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
606
|
-
|
606
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
607
|
+
|
608
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
609
|
+
# seen during the fit.
|
610
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
611
|
+
sample_pd_df.columns = snowpark_column_names
|
612
|
+
|
613
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
607
614
|
output_df_columns = list(output_df_pd.columns)
|
608
615
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
609
616
|
if self.sample_weight_col:
|
@@ -585,7 +585,14 @@ class VotingRegressor(BaseTransformer):
|
|
585
585
|
) -> List[str]:
|
586
586
|
# in case the inferred output column names dimension is different
|
587
587
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
588
|
-
|
588
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
589
|
+
|
590
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
591
|
+
# seen during the fit.
|
592
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
593
|
+
sample_pd_df.columns = snowpark_column_names
|
594
|
+
|
595
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
589
596
|
output_df_columns = list(output_df_pd.columns)
|
590
597
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
591
598
|
if self.sample_weight_col:
|
@@ -573,7 +573,14 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
573
573
|
) -> List[str]:
|
574
574
|
# in case the inferred output column names dimension is different
|
575
575
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
576
|
-
|
576
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
577
|
+
|
578
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
579
|
+
# seen during the fit.
|
580
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
581
|
+
sample_pd_df.columns = snowpark_column_names
|
582
|
+
|
583
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
577
584
|
output_df_columns = list(output_df_pd.columns)
|
578
585
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
579
586
|
if self.sample_weight_col:
|
@@ -569,7 +569,14 @@ class SelectFdr(BaseTransformer):
|
|
569
569
|
) -> List[str]:
|
570
570
|
# in case the inferred output column names dimension is different
|
571
571
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
572
|
-
|
572
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
573
|
+
|
574
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
575
|
+
# seen during the fit.
|
576
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
577
|
+
sample_pd_df.columns = snowpark_column_names
|
578
|
+
|
579
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
573
580
|
output_df_columns = list(output_df_pd.columns)
|
574
581
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
575
582
|
if self.sample_weight_col:
|
@@ -569,7 +569,14 @@ class SelectFpr(BaseTransformer):
|
|
569
569
|
) -> List[str]:
|
570
570
|
# in case the inferred output column names dimension is different
|
571
571
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
572
|
-
|
572
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
573
|
+
|
574
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
575
|
+
# seen during the fit.
|
576
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
577
|
+
sample_pd_df.columns = snowpark_column_names
|
578
|
+
|
579
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
573
580
|
output_df_columns = list(output_df_pd.columns)
|
574
581
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
575
582
|
if self.sample_weight_col:
|
@@ -569,7 +569,14 @@ class SelectFwe(BaseTransformer):
|
|
569
569
|
) -> List[str]:
|
570
570
|
# in case the inferred output column names dimension is different
|
571
571
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
572
|
-
|
572
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
573
|
+
|
574
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
575
|
+
# seen during the fit.
|
576
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
577
|
+
sample_pd_df.columns = snowpark_column_names
|
578
|
+
|
579
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
573
580
|
output_df_columns = list(output_df_pd.columns)
|
574
581
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
575
582
|
if self.sample_weight_col:
|
@@ -570,7 +570,14 @@ class SelectKBest(BaseTransformer):
|
|
570
570
|
) -> List[str]:
|
571
571
|
# in case the inferred output column names dimension is different
|
572
572
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
573
|
-
|
573
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
574
|
+
|
575
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
576
|
+
# seen during the fit.
|
577
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
578
|
+
sample_pd_df.columns = snowpark_column_names
|
579
|
+
|
580
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
574
581
|
output_df_columns = list(output_df_pd.columns)
|
575
582
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
576
583
|
if self.sample_weight_col:
|
@@ -569,7 +569,14 @@ class SelectPercentile(BaseTransformer):
|
|
569
569
|
) -> List[str]:
|
570
570
|
# in case the inferred output column names dimension is different
|
571
571
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
572
|
-
|
572
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
573
|
+
|
574
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
575
|
+
# seen during the fit.
|
576
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
577
|
+
sample_pd_df.columns = snowpark_column_names
|
578
|
+
|
579
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
573
580
|
output_df_columns = list(output_df_pd.columns)
|
574
581
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
575
582
|
if self.sample_weight_col:
|
@@ -627,7 +627,14 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
627
627
|
) -> List[str]:
|
628
628
|
# in case the inferred output column names dimension is different
|
629
629
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
630
|
-
|
630
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
631
|
+
|
632
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
633
|
+
# seen during the fit.
|
634
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
635
|
+
sample_pd_df.columns = snowpark_column_names
|
636
|
+
|
637
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
631
638
|
output_df_columns = list(output_df_pd.columns)
|
632
639
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
633
640
|
if self.sample_weight_col:
|
@@ -560,7 +560,14 @@ class VarianceThreshold(BaseTransformer):
|
|
560
560
|
) -> List[str]:
|
561
561
|
# in case the inferred output column names dimension is different
|
562
562
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
563
|
-
|
563
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
564
|
+
|
565
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
566
|
+
# seen during the fit.
|
567
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
568
|
+
sample_pd_df.columns = snowpark_column_names
|
569
|
+
|
570
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
564
571
|
output_df_columns = list(output_df_pd.columns)
|
565
572
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
566
573
|
if self.sample_weight_col:
|
@@ -16,7 +16,7 @@ from snowflake.ml._internal.exceptions import (
|
|
16
16
|
exceptions,
|
17
17
|
modeling_error_messages,
|
18
18
|
)
|
19
|
-
from snowflake.ml._internal.lineage import data_source,
|
19
|
+
from snowflake.ml._internal.lineage import data_source, lineage_utils
|
20
20
|
from snowflake.ml._internal.utils import identifier, parallelize
|
21
21
|
from snowflake.ml.modeling.framework import _utils
|
22
22
|
from snowflake.snowpark import functions as F
|
@@ -430,8 +430,9 @@ class BaseEstimator(Base):
|
|
430
430
|
)
|
431
431
|
def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "BaseEstimator":
|
432
432
|
"""Runs universal logics for all fit implementations."""
|
433
|
-
|
434
|
-
|
433
|
+
self._data_sources = getattr(dataset, lineage_utils.DATA_SOURCES_ATTR, None)
|
434
|
+
if self._data_sources:
|
435
|
+
assert all(isinstance(ds, data_source.DataSource) for ds in self._data_sources)
|
435
436
|
return self._fit(dataset)
|
436
437
|
|
437
438
|
@abstractmethod
|
@@ -653,7 +653,14 @@ class GaussianProcessClassifier(BaseTransformer):
|
|
653
653
|
) -> List[str]:
|
654
654
|
# in case the inferred output column names dimension is different
|
655
655
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
656
|
-
|
656
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
657
|
+
|
658
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
659
|
+
# seen during the fit.
|
660
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
661
|
+
sample_pd_df.columns = snowpark_column_names
|
662
|
+
|
663
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
657
664
|
output_df_columns = list(output_df_pd.columns)
|
658
665
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
659
666
|
if self.sample_weight_col:
|
@@ -644,7 +644,14 @@ class GaussianProcessRegressor(BaseTransformer):
|
|
644
644
|
) -> List[str]:
|
645
645
|
# in case the inferred output column names dimension is different
|
646
646
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
647
|
-
|
647
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
648
|
+
|
649
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
650
|
+
# seen during the fit.
|
651
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
652
|
+
sample_pd_df.columns = snowpark_column_names
|
653
|
+
|
654
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
648
655
|
output_df_columns = list(output_df_pd.columns)
|
649
656
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
650
657
|
if self.sample_weight_col:
|
@@ -688,7 +688,14 @@ class IterativeImputer(BaseTransformer):
|
|
688
688
|
) -> List[str]:
|
689
689
|
# in case the inferred output column names dimension is different
|
690
690
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
691
|
-
|
691
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
692
|
+
|
693
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
694
|
+
# seen during the fit.
|
695
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
696
|
+
sample_pd_df.columns = snowpark_column_names
|
697
|
+
|
698
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
692
699
|
output_df_columns = list(output_df_pd.columns)
|
693
700
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
694
701
|
if self.sample_weight_col:
|
@@ -614,7 +614,14 @@ class KNNImputer(BaseTransformer):
|
|
614
614
|
) -> List[str]:
|
615
615
|
# in case the inferred output column names dimension is different
|
616
616
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
617
|
-
|
617
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
618
|
+
|
619
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
620
|
+
# seen during the fit.
|
621
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
622
|
+
sample_pd_df.columns = snowpark_column_names
|
623
|
+
|
624
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
618
625
|
output_df_columns = list(output_df_pd.columns)
|
619
626
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
620
627
|
if self.sample_weight_col:
|
@@ -588,7 +588,14 @@ class MissingIndicator(BaseTransformer):
|
|
588
588
|
) -> List[str]:
|
589
589
|
# in case the inferred output column names dimension is different
|
590
590
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
591
|
-
|
591
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
592
|
+
|
593
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
594
|
+
# seen during the fit.
|
595
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
596
|
+
sample_pd_df.columns = snowpark_column_names
|
597
|
+
|
598
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
592
599
|
output_df_columns = list(output_df_pd.columns)
|
593
600
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
594
601
|
if self.sample_weight_col:
|
@@ -158,6 +158,7 @@ class SimpleImputer(base.BaseTransformer):
|
|
158
158
|
|
159
159
|
self.fill_value = fill_value
|
160
160
|
self.missing_values = missing_values
|
161
|
+
self.statistics_: Dict[str, Any] = {}
|
161
162
|
# TODO(hayu): [SNOW-752265] Support SimpleImputer keep_empty_features.
|
162
163
|
# Add back when `keep_empty_features` is supported.
|
163
164
|
# self.keep_empty_features = keep_empty_features
|
@@ -229,8 +230,27 @@ class SimpleImputer(base.BaseTransformer):
|
|
229
230
|
|
230
231
|
return input_col_datatypes
|
231
232
|
|
233
|
+
def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "SimpleImputer":
|
234
|
+
if isinstance(dataset, snowpark.DataFrame):
|
235
|
+
return self._fit_snowpark(dataset)
|
236
|
+
else:
|
237
|
+
return self._fit_sklearn(dataset)
|
238
|
+
|
239
|
+
def _fit_sklearn(self, dataset: pd.DataFrame) -> "SimpleImputer":
|
240
|
+
dataset = self._use_input_cols_only(dataset)
|
241
|
+
sklearn_simple_imputer = self._create_sklearn_object()
|
242
|
+
sklearn_simple_imputer = sklearn_simple_imputer.fit(dataset)
|
243
|
+
self._sklearn_object = sklearn_simple_imputer
|
244
|
+
for input_col, fill_value in zip(self.input_cols, sklearn_simple_imputer.statistics_.tolist()):
|
245
|
+
self.statistics_[input_col] = fill_value
|
246
|
+
self._sklearn_fit_dtype = sklearn_simple_imputer._fit_dtype
|
247
|
+
self.n_features_in_ = len(self.input_cols)
|
248
|
+
self.feature_names_in_ = self.input_cols
|
249
|
+
self._is_fitted = True
|
250
|
+
return self
|
251
|
+
|
232
252
|
@telemetry.send_api_usage_telemetry(project=base.PROJECT, subproject=_SUBPROJECT)
|
233
|
-
def
|
253
|
+
def _fit_snowpark(self, dataset: snowpark.DataFrame) -> "SimpleImputer":
|
234
254
|
"""
|
235
255
|
Compute values to impute for the dataset according to the strategy.
|
236
256
|
|
@@ -245,7 +265,6 @@ class SimpleImputer(base.BaseTransformer):
|
|
245
265
|
# In order to fit, the input columns should have the same type.
|
246
266
|
input_col_datatypes = self._get_dataset_input_col_datatypes(dataset)
|
247
267
|
|
248
|
-
self.statistics_: Dict[str, Any] = {}
|
249
268
|
statement_params = telemetry.get_statement_params(base.PROJECT, _SUBPROJECT, self.__class__.__name__)
|
250
269
|
|
251
270
|
if self.strategy == "constant":
|
@@ -563,7 +563,14 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
563
563
|
) -> List[str]:
|
564
564
|
# in case the inferred output column names dimension is different
|
565
565
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
566
|
-
|
566
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
567
|
+
|
568
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
569
|
+
# seen during the fit.
|
570
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
571
|
+
sample_pd_df.columns = snowpark_column_names
|
572
|
+
|
573
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
567
574
|
output_df_columns = list(output_df_pd.columns)
|
568
575
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
569
576
|
if self.sample_weight_col:
|
@@ -611,7 +611,14 @@ class Nystroem(BaseTransformer):
|
|
611
611
|
) -> List[str]:
|
612
612
|
# in case the inferred output column names dimension is different
|
613
613
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
614
|
-
|
614
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
615
|
+
|
616
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
617
|
+
# seen during the fit.
|
618
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
619
|
+
sample_pd_df.columns = snowpark_column_names
|
620
|
+
|
621
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
615
622
|
output_df_columns = list(output_df_pd.columns)
|
616
623
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
617
624
|
if self.sample_weight_col:
|
@@ -587,7 +587,14 @@ class PolynomialCountSketch(BaseTransformer):
|
|
587
587
|
) -> List[str]:
|
588
588
|
# in case the inferred output column names dimension is different
|
589
589
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
590
|
-
|
590
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
591
|
+
|
592
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
593
|
+
# seen during the fit.
|
594
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
595
|
+
sample_pd_df.columns = snowpark_column_names
|
596
|
+
|
597
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
591
598
|
output_df_columns = list(output_df_pd.columns)
|
592
599
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
593
600
|
if self.sample_weight_col:
|
@@ -574,7 +574,14 @@ class RBFSampler(BaseTransformer):
|
|
574
574
|
) -> List[str]:
|
575
575
|
# in case the inferred output column names dimension is different
|
576
576
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
577
|
-
|
577
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
578
|
+
|
579
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
580
|
+
# seen during the fit.
|
581
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
582
|
+
sample_pd_df.columns = snowpark_column_names
|
583
|
+
|
584
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
578
585
|
output_df_columns = list(output_df_pd.columns)
|
579
586
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
580
587
|
if self.sample_weight_col:
|
@@ -572,7 +572,14 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
572
572
|
) -> List[str]:
|
573
573
|
# in case the inferred output column names dimension is different
|
574
574
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
575
|
-
|
575
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
|
576
|
+
|
577
|
+
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
578
|
+
# seen during the fit.
|
579
|
+
snowpark_column_names = dataset.select(self.input_cols).columns
|
580
|
+
sample_pd_df.columns = snowpark_column_names
|
581
|
+
|
582
|
+
output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
|
576
583
|
output_df_columns = list(output_df_pd.columns)
|
577
584
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
578
585
|
if self.sample_weight_col:
|