snowflake-ml-python 1.5.1__py3-none-any.whl → 1.5.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- snowflake/cortex/_complete.py +26 -5
- snowflake/cortex/_sentiment.py +7 -4
- snowflake/cortex/_sse_client.py +81 -0
- snowflake/cortex/_util.py +105 -8
- snowflake/ml/_internal/lineage/lineage_utils.py +34 -25
- snowflake/ml/_internal/utils/temp_file_utils.py +5 -2
- snowflake/ml/dataset/dataset.py +15 -12
- snowflake/ml/dataset/dataset_factory.py +3 -4
- snowflake/ml/feature_store/access_manager.py +34 -30
- snowflake/ml/feature_store/feature_store.py +3 -3
- snowflake/ml/feature_store/feature_view.py +12 -11
- snowflake/ml/fileset/snowfs.py +2 -31
- snowflake/ml/model/_client/ops/model_ops.py +43 -0
- snowflake/ml/model/_client/sql/model_version.py +55 -3
- snowflake/ml/model/_model_composer/model_composer.py +7 -3
- snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +3 -1
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +1 -3
- snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -27
- snowflake/ml/model/_signatures/builtins_handler.py +2 -1
- snowflake/ml/model/_signatures/core.py +13 -1
- snowflake/ml/model/_signatures/pandas_handler.py +2 -0
- snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
- snowflake/ml/model/model_signature.py +2 -0
- snowflake/ml/model/type_hints.py +1 -0
- snowflake/ml/modeling/_internal/estimator_utils.py +58 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +196 -242
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +161 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +38 -18
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +82 -134
- snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +21 -17
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +9 -2
- snowflake/ml/modeling/cluster/affinity_propagation.py +9 -2
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +9 -2
- snowflake/ml/modeling/cluster/birch.py +9 -2
- snowflake/ml/modeling/cluster/bisecting_k_means.py +9 -2
- snowflake/ml/modeling/cluster/dbscan.py +9 -2
- snowflake/ml/modeling/cluster/feature_agglomeration.py +9 -2
- snowflake/ml/modeling/cluster/k_means.py +9 -2
- snowflake/ml/modeling/cluster/mean_shift.py +9 -2
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +9 -2
- snowflake/ml/modeling/cluster/optics.py +9 -2
- snowflake/ml/modeling/cluster/spectral_biclustering.py +9 -2
- snowflake/ml/modeling/cluster/spectral_clustering.py +9 -2
- snowflake/ml/modeling/cluster/spectral_coclustering.py +9 -2
- snowflake/ml/modeling/compose/column_transformer.py +9 -2
- snowflake/ml/modeling/compose/transformed_target_regressor.py +9 -2
- snowflake/ml/modeling/covariance/elliptic_envelope.py +9 -2
- snowflake/ml/modeling/covariance/empirical_covariance.py +9 -2
- snowflake/ml/modeling/covariance/graphical_lasso.py +9 -2
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +9 -2
- snowflake/ml/modeling/covariance/ledoit_wolf.py +9 -2
- snowflake/ml/modeling/covariance/min_cov_det.py +9 -2
- snowflake/ml/modeling/covariance/oas.py +9 -2
- snowflake/ml/modeling/covariance/shrunk_covariance.py +9 -2
- snowflake/ml/modeling/decomposition/dictionary_learning.py +9 -2
- snowflake/ml/modeling/decomposition/factor_analysis.py +9 -2
- snowflake/ml/modeling/decomposition/fast_ica.py +9 -2
- snowflake/ml/modeling/decomposition/incremental_pca.py +9 -2
- snowflake/ml/modeling/decomposition/kernel_pca.py +9 -2
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +9 -2
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +9 -2
- snowflake/ml/modeling/decomposition/pca.py +9 -2
- snowflake/ml/modeling/decomposition/sparse_pca.py +9 -2
- snowflake/ml/modeling/decomposition/truncated_svd.py +9 -2
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +9 -2
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +9 -2
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +9 -2
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +9 -2
- snowflake/ml/modeling/ensemble/bagging_classifier.py +9 -2
- snowflake/ml/modeling/ensemble/bagging_regressor.py +9 -2
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +9 -2
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +9 -2
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +9 -2
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +9 -2
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +9 -2
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +9 -2
- snowflake/ml/modeling/ensemble/isolation_forest.py +9 -2
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +9 -2
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +9 -2
- snowflake/ml/modeling/ensemble/stacking_regressor.py +9 -2
- snowflake/ml/modeling/ensemble/voting_classifier.py +9 -2
- snowflake/ml/modeling/ensemble/voting_regressor.py +9 -2
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +9 -2
- snowflake/ml/modeling/feature_selection/select_fdr.py +9 -2
- snowflake/ml/modeling/feature_selection/select_fpr.py +9 -2
- snowflake/ml/modeling/feature_selection/select_fwe.py +9 -2
- snowflake/ml/modeling/feature_selection/select_k_best.py +9 -2
- snowflake/ml/modeling/feature_selection/select_percentile.py +9 -2
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +9 -2
- snowflake/ml/modeling/feature_selection/variance_threshold.py +9 -2
- snowflake/ml/modeling/framework/base.py +3 -8
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +9 -2
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +9 -2
- snowflake/ml/modeling/impute/iterative_imputer.py +9 -2
- snowflake/ml/modeling/impute/knn_imputer.py +9 -2
- snowflake/ml/modeling/impute/missing_indicator.py +9 -2
- snowflake/ml/modeling/impute/simple_imputer.py +28 -5
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +9 -2
- snowflake/ml/modeling/kernel_approximation/nystroem.py +9 -2
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +9 -2
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +9 -2
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +9 -2
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +9 -2
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +9 -2
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/ard_regression.py +9 -2
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +9 -2
- snowflake/ml/modeling/linear_model/elastic_net.py +9 -2
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +9 -2
- snowflake/ml/modeling/linear_model/gamma_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/huber_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/lars.py +9 -2
- snowflake/ml/modeling/linear_model/lars_cv.py +9 -2
- snowflake/ml/modeling/linear_model/lasso.py +9 -2
- snowflake/ml/modeling/linear_model/lasso_cv.py +9 -2
- snowflake/ml/modeling/linear_model/lasso_lars.py +9 -2
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +9 -2
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +9 -2
- snowflake/ml/modeling/linear_model/linear_regression.py +9 -2
- snowflake/ml/modeling/linear_model/logistic_regression.py +9 -2
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +9 -2
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +9 -2
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +9 -2
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +9 -2
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +9 -2
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +9 -2
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +9 -2
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/perceptron.py +9 -2
- snowflake/ml/modeling/linear_model/poisson_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/ransac_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/ridge.py +9 -2
- snowflake/ml/modeling/linear_model/ridge_classifier.py +9 -2
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +9 -2
- snowflake/ml/modeling/linear_model/ridge_cv.py +9 -2
- snowflake/ml/modeling/linear_model/sgd_classifier.py +9 -2
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +9 -2
- snowflake/ml/modeling/linear_model/sgd_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +9 -2
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +9 -2
- snowflake/ml/modeling/manifold/isomap.py +9 -2
- snowflake/ml/modeling/manifold/mds.py +9 -2
- snowflake/ml/modeling/manifold/spectral_embedding.py +9 -2
- snowflake/ml/modeling/manifold/tsne.py +9 -2
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +9 -2
- snowflake/ml/modeling/mixture/gaussian_mixture.py +9 -2
- snowflake/ml/modeling/model_selection/grid_search_cv.py +1 -5
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +1 -5
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +9 -2
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +9 -2
- snowflake/ml/modeling/multiclass/output_code_classifier.py +9 -2
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +9 -2
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +9 -2
- snowflake/ml/modeling/naive_bayes/complement_nb.py +9 -2
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +9 -2
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +9 -2
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +9 -2
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +9 -2
- snowflake/ml/modeling/neighbors/kernel_density.py +9 -2
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +9 -2
- snowflake/ml/modeling/neighbors/nearest_centroid.py +9 -2
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +9 -2
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +9 -2
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +9 -2
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +9 -2
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +9 -2
- snowflake/ml/modeling/neural_network/mlp_classifier.py +9 -2
- snowflake/ml/modeling/neural_network/mlp_regressor.py +9 -2
- snowflake/ml/modeling/parameters/enable_anonymous_sproc.py +5 -0
- snowflake/ml/modeling/pipeline/pipeline.py +5 -0
- snowflake/ml/modeling/preprocessing/binarizer.py +7 -3
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +7 -2
- snowflake/ml/modeling/preprocessing/label_encoder.py +8 -7
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +7 -3
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +7 -4
- snowflake/ml/modeling/preprocessing/normalizer.py +7 -3
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +10 -2
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +8 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +9 -2
- snowflake/ml/modeling/preprocessing/robust_scaler.py +7 -4
- snowflake/ml/modeling/preprocessing/standard_scaler.py +7 -3
- snowflake/ml/modeling/semi_supervised/label_propagation.py +9 -2
- snowflake/ml/modeling/semi_supervised/label_spreading.py +9 -2
- snowflake/ml/modeling/svm/linear_svc.py +9 -2
- snowflake/ml/modeling/svm/linear_svr.py +9 -2
- snowflake/ml/modeling/svm/nu_svc.py +9 -2
- snowflake/ml/modeling/svm/nu_svr.py +9 -2
- snowflake/ml/modeling/svm/svc.py +9 -2
- snowflake/ml/modeling/svm/svr.py +9 -2
- snowflake/ml/modeling/tree/decision_tree_classifier.py +9 -2
- snowflake/ml/modeling/tree/decision_tree_regressor.py +9 -2
- snowflake/ml/modeling/tree/extra_tree_classifier.py +9 -2
- snowflake/ml/modeling/tree/extra_tree_regressor.py +9 -2
- snowflake/ml/modeling/xgboost/xgb_classifier.py +9 -2
- snowflake/ml/modeling/xgboost/xgb_regressor.py +9 -2
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +9 -2
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +9 -2
- snowflake/ml/registry/_manager/model_manager.py +59 -1
- snowflake/ml/registry/registry.py +10 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/METADATA +32 -4
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/RECORD +207 -204
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/top_level.txt +0 -0
```diff
--- a/snowflake/ml/modeling/ensemble/voting_regressor.py
+++ b/snowflake/ml/modeling/ensemble/voting_regressor.py
@@ -248,7 +248,7 @@ class VotingRegressor(BaseTransformer):
                 inspect.currentframe(), VotingRegressor.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
```
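This hunk recurs across every autogenerated modeling wrapper in this release: the telemetry statement parameters now carry `custom_tags={"autogen": True}` when the estimator is autogenerated and `None` otherwise (the removed line is shown truncated as in the registry viewer). A minimal sketch of that tagging pattern, with illustrative class and method names — only the `custom_tags` expression itself comes from the diff:

```python
from typing import Any, Dict, Optional


class WrappedEstimator:
    """Stand-in for an autogenerated modeling wrapper (illustrative only)."""

    def __init__(self, autogenerated: bool) -> None:
        self._autogenerated = autogenerated

    def statement_params(self) -> Dict[str, Any]:
        # Tag telemetry only for autogenerated wrappers; hand-written
        # estimators send no custom tags at all (None rather than {}).
        custom_tags: Optional[Dict[str, bool]] = {"autogen": True} if self._autogenerated else None
        return {"api_calls": ["Session.call"], "custom_tags": custom_tags}


print(WrappedEstimator(autogenerated=True).statement_params()["custom_tags"])   # {'autogen': True}
print(WrappedEstimator(autogenerated=False).statement_params()["custom_tags"])  # None
```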
```diff
--- a/snowflake/ml/modeling/ensemble/voting_regressor.py
+++ b/snowflake/ml/modeling/ensemble/voting_regressor.py
@@ -585,7 +585,14 @@ class VotingRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
```
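The second recurring hunk changes output-column inference: the one-row pandas sample is no longer passed straight to the estimator; its column labels are first reset to the identifiers Snowpark reports for the same selection, so the names match what was seen during fit. A standalone sketch of just the relabeling step — the column labels below are made up for illustration:

```python
import pandas as pd

# Pretend this is the one-row frame from dataset.select(input_cols).limit(1).to_pandas();
# to_pandas() labels need not match the Snowflake identifiers used at fit time.
sample_pd_df = pd.DataFrame({"MY_FEATURE": [1.0], "OTHER": [2.0]})

# Pretend these are the identifiers from dataset.select(input_cols).columns;
# the shared select already fixes the column order.
snowpark_column_names = ['"my_feature"', '"Other"']

sample_pd_df.columns = snowpark_column_names  # relabel with fit-time identifiers
print(list(sample_pd_df.columns))  # ['"my_feature"', '"Other"']
```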
The same two hunks appear in each feature_selection transformer:

```diff
--- a/snowflake/ml/modeling/feature_selection/generic_univariate_select.py
+++ b/snowflake/ml/modeling/feature_selection/generic_univariate_select.py
@@ -238,7 +238,7 @@ class GenericUnivariateSelect(BaseTransformer):
                 inspect.currentframe(), GenericUnivariateSelect.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -573,7 +573,14 @@ class GenericUnivariateSelect(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/feature_selection/select_fdr.py
+++ b/snowflake/ml/modeling/feature_selection/select_fdr.py
@@ -234,7 +234,7 @@ class SelectFdr(BaseTransformer):
                 inspect.currentframe(), SelectFdr.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -569,7 +569,14 @@ class SelectFdr(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/feature_selection/select_fpr.py
+++ b/snowflake/ml/modeling/feature_selection/select_fpr.py
@@ -234,7 +234,7 @@ class SelectFpr(BaseTransformer):
                 inspect.currentframe(), SelectFpr.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -569,7 +569,14 @@ class SelectFpr(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/feature_selection/select_fwe.py
+++ b/snowflake/ml/modeling/feature_selection/select_fwe.py
@@ -234,7 +234,7 @@ class SelectFwe(BaseTransformer):
                 inspect.currentframe(), SelectFwe.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -569,7 +569,14 @@ class SelectFwe(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/feature_selection/select_k_best.py
+++ b/snowflake/ml/modeling/feature_selection/select_k_best.py
@@ -235,7 +235,7 @@ class SelectKBest(BaseTransformer):
                 inspect.currentframe(), SelectKBest.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -570,7 +570,14 @@ class SelectKBest(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/feature_selection/select_percentile.py
+++ b/snowflake/ml/modeling/feature_selection/select_percentile.py
@@ -234,7 +234,7 @@ class SelectPercentile(BaseTransformer):
                 inspect.currentframe(), SelectPercentile.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -569,7 +569,14 @@ class SelectPercentile(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/feature_selection/sequential_feature_selector.py
+++ b/snowflake/ml/modeling/feature_selection/sequential_feature_selector.py
@@ -292,7 +292,7 @@ class SequentialFeatureSelector(BaseTransformer):
                 inspect.currentframe(), SequentialFeatureSelector.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -627,7 +627,14 @@ class SequentialFeatureSelector(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/feature_selection/variance_threshold.py
+++ b/snowflake/ml/modeling/feature_selection/variance_threshold.py
@@ -225,7 +225,7 @@ class VarianceThreshold(BaseTransformer):
                 inspect.currentframe(), VarianceThreshold.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -560,7 +560,14 @@ class VarianceThreshold(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
```
```diff
--- a/snowflake/ml/modeling/framework/base.py
+++ b/snowflake/ml/modeling/framework/base.py
@@ -16,7 +16,7 @@ from snowflake.ml._internal.exceptions import (
     exceptions,
     modeling_error_messages,
 )
-from snowflake.ml._internal.lineage import
+from snowflake.ml._internal.lineage import lineage_utils
 from snowflake.ml._internal.utils import identifier, parallelize
 from snowflake.ml.modeling.framework import _utils
 from snowflake.snowpark import functions as F
@@ -386,7 +386,6 @@ class BaseEstimator(Base):
         self.file_names = file_names
         self.custom_states = custom_states
         self.sample_weight_col = sample_weight_col
-        self._data_sources: Optional[List[data_source.DataSource]] = None

         self.start_time = datetime.now().strftime(_utils.DATETIME_FORMAT)[:-3]

@@ -421,18 +420,14 @@ class BaseEstimator(Base):
         """
         return []

-    def _get_data_sources(self) -> Optional[List[data_source.DataSource]]:
-        return self._data_sources
-
     @telemetry.send_api_usage_telemetry(
         project=PROJECT,
         subproject=SUBPROJECT,
     )
     def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "BaseEstimator":
         """Runs universal logics for all fit implementations."""
-
-
-        assert all(isinstance(ds, data_source.DataSource) for ds in self._data_sources)
+        data_sources = lineage_utils.get_data_sources(dataset)
+        lineage_utils.set_data_sources(self, data_sources)
         return self._fit(dataset)

     @abstractmethod
```
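These `base.py` hunks move lineage tracking out of the estimator: instead of a `_data_sources` attribute plus a private getter, `fit` now reads the data sources off the input dataset via `lineage_utils` and stamps them onto the estimator. A rough sketch of how such attribute-based plumbing could work — the real helpers live in `snowflake/ml/_internal/lineage/lineage_utils.py`, and the attribute name and string payload below are assumptions for illustration:

```python
from typing import Any, List, Optional

_DATA_SOURCES_ATTR = "_data_sources"  # hypothetical attribute name


def get_data_sources(obj: Any) -> Optional[List[str]]:
    """Read lineage info previously stamped on a dataframe-like object."""
    return getattr(obj, _DATA_SOURCES_ATTR, None)


def set_data_sources(obj: Any, data_sources: Optional[List[str]]) -> None:
    """Stamp lineage info onto an object (e.g. an estimator) if present."""
    if data_sources:
        setattr(obj, _DATA_SOURCES_ATTR, data_sources)


class Estimator:
    def fit(self, dataset: Any) -> "Estimator":
        # Mirrors the new BaseEstimator.fit: propagate lineage, then train.
        set_data_sources(self, get_data_sources(dataset))
        return self


class FakeDataFrame:
    pass


df = FakeDataFrame()
set_data_sources(df, ["DB.SCHEMA.TRAINING_TABLE"])
print(get_data_sources(Estimator().fit(df)))  # ['DB.SCHEMA.TRAINING_TABLE']
```

The upshot is that anything fitted on a lineage-carrying dataset keeps a record of where its training data came from, without each estimator subclass managing that state itself.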
The gaussian_process and impute estimators receive the same pair of hunks:

```diff
--- a/snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py
+++ b/snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py
@@ -320,7 +320,7 @@ class GaussianProcessClassifier(BaseTransformer):
                 inspect.currentframe(), GaussianProcessClassifier.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -653,7 +653,14 @@ class GaussianProcessClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py
+++ b/snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py
@@ -311,7 +311,7 @@ class GaussianProcessRegressor(BaseTransformer):
                 inspect.currentframe(), GaussianProcessRegressor.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -644,7 +644,14 @@ class GaussianProcessRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/impute/iterative_imputer.py
+++ b/snowflake/ml/modeling/impute/iterative_imputer.py
@@ -353,7 +353,7 @@ class IterativeImputer(BaseTransformer):
                 inspect.currentframe(), IterativeImputer.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -688,7 +688,14 @@ class IterativeImputer(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/impute/knn_imputer.py
+++ b/snowflake/ml/modeling/impute/knn_imputer.py
@@ -279,7 +279,7 @@ class KNNImputer(BaseTransformer):
                 inspect.currentframe(), KNNImputer.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -614,7 +614,14 @@ class KNNImputer(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/impute/missing_indicator.py
+++ b/snowflake/ml/modeling/impute/missing_indicator.py
@@ -253,7 +253,7 @@ class MissingIndicator(BaseTransformer):
                 inspect.currentframe(), MissingIndicator.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -588,7 +588,14 @@ class MissingIndicator(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
```
```diff
--- a/snowflake/ml/modeling/impute/simple_imputer.py
+++ b/snowflake/ml/modeling/impute/simple_imputer.py
@@ -102,10 +102,14 @@ class SimpleImputer(base.BaseTransformer):
             For string or object data types, `fill_value` must be a string. If `None`, `fill_value` will be 0 when
             imputing numerical data and `missing_value` for strings and object data types.
         input_cols: Optional[Union[str, List[str]]]
-
+            The name(s) of one or more columns in the input DataFrame containing feature(s) to be imputed. Input
+            columns must be specified before fit with this argument or after initialization with the
+            `set_input_cols` method. This argument is optional for API consistency.
         output_cols: Optional[Union[str, List[str]]]
-
-
+            The name(s) to assign output columns in the output DataFrame. The number of
+            output columns specified must equal the number of input columns. Output columns must be specified before
+            transform with this argument or after initialization with the `set_output_cols` method. This argument is
+            optional for API consistency.
         passthrough_cols: A string or a list of strings indicating column names to be excluded from any
             operations (such as train, transform, or inference). These specified column(s)
             will remain untouched throughout the process. This option is helpful in scenarios
@@ -158,6 +162,7 @@ class SimpleImputer(base.BaseTransformer):

         self.fill_value = fill_value
         self.missing_values = missing_values
+        self.statistics_: Dict[str, Any] = {}
         # TODO(hayu): [SNOW-752265] Support SimpleImputer keep_empty_features.
         # Add back when `keep_empty_features` is supported.
         # self.keep_empty_features = keep_empty_features
@@ -229,8 +234,27 @@ class SimpleImputer(base.BaseTransformer):

         return input_col_datatypes

+    def _fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "SimpleImputer":
+        if isinstance(dataset, snowpark.DataFrame):
+            return self._fit_snowpark(dataset)
+        else:
+            return self._fit_sklearn(dataset)
+
+    def _fit_sklearn(self, dataset: pd.DataFrame) -> "SimpleImputer":
+        dataset = self._use_input_cols_only(dataset)
+        sklearn_simple_imputer = self._create_sklearn_object()
+        sklearn_simple_imputer = sklearn_simple_imputer.fit(dataset)
+        self._sklearn_object = sklearn_simple_imputer
+        for input_col, fill_value in zip(self.input_cols, sklearn_simple_imputer.statistics_.tolist()):
+            self.statistics_[input_col] = fill_value
+        self._sklearn_fit_dtype = sklearn_simple_imputer._fit_dtype
+        self.n_features_in_ = len(self.input_cols)
+        self.feature_names_in_ = self.input_cols
+        self._is_fitted = True
+        return self
+
     @telemetry.send_api_usage_telemetry(project=base.PROJECT, subproject=_SUBPROJECT)
-    def
+    def _fit_snowpark(self, dataset: snowpark.DataFrame) -> "SimpleImputer":
         """
         Compute values to impute for the dataset according to the strategy.

@@ -245,7 +269,6 @@ class SimpleImputer(base.BaseTransformer):
         # In order to fit, the input columns should have the same type.
         input_col_datatypes = self._get_dataset_input_col_datatypes(dataset)

-        self.statistics_: Dict[str, Any] = {}
         statement_params = telemetry.get_statement_params(base.PROJECT, _SUBPROJECT, self.__class__.__name__)

         if self.strategy == "constant":
```
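Taken together, the `SimpleImputer` hunks initialize `statistics_` in `__init__` and split `_fit` into a Snowpark path (`_fit_snowpark`, the former SQL-based fit) and a new local `_fit_sklearn` path for pandas input. A condensed sketch of the pandas branch, assuming scikit-learn and omitting the Snowpark branch; the class and helper names here are illustrative, not the package's:

```python
from typing import Any, Dict, List

import pandas as pd
from sklearn.impute import SimpleImputer as SkSimpleImputer


class PandasOnlySimpleImputer:
    def __init__(self, input_cols: List[str], strategy: str = "mean") -> None:
        self.input_cols = input_cols
        self.strategy = strategy
        self.statistics_: Dict[str, Any] = {}  # initialized up front, as in the diff

    def fit(self, dataset: pd.DataFrame) -> "PandasOnlySimpleImputer":
        # Fit sklearn locally on just the input columns.
        imputer = SkSimpleImputer(strategy=self.strategy).fit(dataset[self.input_cols])
        # Map sklearn's positional statistics_ array back to column names.
        for col, value in zip(self.input_cols, imputer.statistics_.tolist()):
            self.statistics_[col] = value
        self._sklearn_object = imputer
        return self


df = pd.DataFrame({"a": [1.0, None, 3.0], "b": [4.0, 5.0, None]})
print(PandasOnlySimpleImputer(["a", "b"]).fit(df).statistics_)  # {'a': 2.0, 'b': 4.5}
```

Keying `statistics_` by column name rather than position keeps the fitted fill values usable regardless of the column order of the frame later passed to transform.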
The kernel_approximation and kernel_ridge estimators round out the same pair of hunks:

```diff
--- a/snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py
+++ b/snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py
@@ -228,7 +228,7 @@ class AdditiveChi2Sampler(BaseTransformer):
                 inspect.currentframe(), AdditiveChi2Sampler.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -563,7 +563,14 @@ class AdditiveChi2Sampler(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/kernel_approximation/nystroem.py
+++ b/snowflake/ml/modeling/kernel_approximation/nystroem.py
@@ -276,7 +276,7 @@ class Nystroem(BaseTransformer):
                 inspect.currentframe(), Nystroem.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -611,7 +611,14 @@ class Nystroem(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py
+++ b/snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py
@@ -252,7 +252,7 @@ class PolynomialCountSketch(BaseTransformer):
                 inspect.currentframe(), PolynomialCountSketch.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -587,7 +587,14 @@ class PolynomialCountSketch(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/kernel_approximation/rbf_sampler.py
+++ b/snowflake/ml/modeling/kernel_approximation/rbf_sampler.py
@@ -239,7 +239,7 @@ class RBFSampler(BaseTransformer):
                 inspect.currentframe(), RBFSampler.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -574,7 +574,14 @@ class RBFSampler(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py
+++ b/snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py
@@ -237,7 +237,7 @@ class SkewedChi2Sampler(BaseTransformer):
                 inspect.currentframe(), SkewedChi2Sampler.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -572,7 +572,14 @@ class SkewedChi2Sampler(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
--- a/snowflake/ml/modeling/kernel_ridge/kernel_ridge.py
+++ b/snowflake/ml/modeling/kernel_ridge/kernel_ridge.py
@@ -273,7 +273,7 @@ class KernelRidge(BaseTransformer):
                 inspect.currentframe(), KernelRidge.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -606,7 +606,14 @@ class KernelRidge(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
```
|