snowflake-ml-python 1.7.2__py3-none-any.whl → 1.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/__init__.py +16 -8
- snowflake/cortex/_classify_text.py +12 -1
- snowflake/cortex/_complete.py +101 -13
- snowflake/cortex/_embed_text_1024.py +9 -2
- snowflake/cortex/_embed_text_768.py +9 -2
- snowflake/cortex/_extract_answer.py +9 -2
- snowflake/cortex/_sentiment.py +9 -2
- snowflake/cortex/_summarize.py +9 -2
- snowflake/cortex/_translate.py +9 -2
- snowflake/ml/_internal/env_utils.py +7 -52
- snowflake/ml/_internal/platform_capabilities.py +87 -0
- snowflake/ml/_internal/utils/identifier.py +4 -2
- snowflake/ml/data/__init__.py +3 -0
- snowflake/ml/data/_internal/arrow_ingestor.py +4 -4
- snowflake/ml/data/data_connector.py +53 -11
- snowflake/ml/data/data_ingestor.py +2 -1
- snowflake/ml/data/torch_utils.py +18 -5
- snowflake/ml/dataset/dataset.py +0 -1
- snowflake/ml/feature_store/examples/example_helper.py +2 -1
- snowflake/ml/fileset/fileset.py +24 -18
- snowflake/ml/jobs/__init__.py +21 -0
- snowflake/ml/jobs/_utils/constants.py +51 -0
- snowflake/ml/jobs/_utils/payload_utils.py +352 -0
- snowflake/ml/jobs/_utils/spec_utils.py +298 -0
- snowflake/ml/jobs/_utils/types.py +39 -0
- snowflake/ml/jobs/decorators.py +91 -0
- snowflake/ml/jobs/job.py +113 -0
- snowflake/ml/jobs/manager.py +298 -0
- snowflake/ml/model/_client/model/model_version_impl.py +5 -3
- snowflake/ml/model/_client/ops/model_ops.py +13 -8
- snowflake/ml/model/_client/ops/service_ops.py +1 -11
- snowflake/ml/model/_client/sql/model_version.py +11 -0
- snowflake/ml/model/_client/sql/service.py +13 -6
- snowflake/ml/model/_model_composer/model_composer.py +8 -3
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +20 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +1 -0
- snowflake/ml/model/_model_composer/model_method/constants.py +1 -0
- snowflake/ml/model/_model_composer/model_method/function_generator.py +2 -0
- snowflake/ml/model/_model_composer/model_method/infer_function.py_template +1 -1
- snowflake/ml/model/_model_composer/model_method/infer_partitioned.py_template +1 -1
- snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +1 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +9 -1
- snowflake/ml/model/_model_composer/model_user_file/model_user_file.py +27 -0
- snowflake/ml/model/_packager/model_handlers/_utils.py +39 -5
- snowflake/ml/model/_packager/model_handlers/catboost.py +3 -3
- snowflake/ml/model/_packager/model_handlers/custom.py +1 -2
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +6 -1
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +5 -3
- snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +55 -20
- snowflake/ml/model/_packager/model_handlers/sklearn.py +9 -10
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +66 -28
- snowflake/ml/model/_packager/model_handlers/tensorflow.py +70 -17
- snowflake/ml/model/_packager/model_handlers/xgboost.py +3 -3
- snowflake/ml/model/_packager/model_meta/model_meta.py +3 -0
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +6 -1
- snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -2
- snowflake/ml/model/_packager/model_task/model_task_utils.py +3 -2
- snowflake/ml/model/_signatures/base_handler.py +1 -2
- snowflake/ml/model/_signatures/builtins_handler.py +2 -2
- snowflake/ml/model/_signatures/numpy_handler.py +6 -7
- snowflake/ml/model/_signatures/pandas_handler.py +3 -3
- snowflake/ml/model/_signatures/pytorch_handler.py +2 -5
- snowflake/ml/model/_signatures/snowpark_handler.py +11 -5
- snowflake/ml/model/_signatures/tensorflow_handler.py +2 -7
- snowflake/ml/model/model_signature.py +17 -4
- snowflake/ml/model/type_hints.py +1 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +0 -8
- snowflake/ml/modeling/_internal/model_transformer_builder.py +0 -13
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +6 -3
- snowflake/ml/modeling/cluster/affinity_propagation.py +6 -3
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +6 -3
- snowflake/ml/modeling/cluster/birch.py +6 -3
- snowflake/ml/modeling/cluster/bisecting_k_means.py +6 -3
- snowflake/ml/modeling/cluster/dbscan.py +6 -3
- snowflake/ml/modeling/cluster/feature_agglomeration.py +6 -3
- snowflake/ml/modeling/cluster/k_means.py +6 -3
- snowflake/ml/modeling/cluster/mean_shift.py +6 -3
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +6 -3
- snowflake/ml/modeling/cluster/optics.py +6 -3
- snowflake/ml/modeling/cluster/spectral_biclustering.py +6 -3
- snowflake/ml/modeling/cluster/spectral_clustering.py +6 -3
- snowflake/ml/modeling/cluster/spectral_coclustering.py +6 -3
- snowflake/ml/modeling/compose/column_transformer.py +6 -3
- snowflake/ml/modeling/compose/transformed_target_regressor.py +6 -3
- snowflake/ml/modeling/covariance/elliptic_envelope.py +6 -3
- snowflake/ml/modeling/covariance/empirical_covariance.py +6 -3
- snowflake/ml/modeling/covariance/graphical_lasso.py +6 -3
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +6 -3
- snowflake/ml/modeling/covariance/ledoit_wolf.py +6 -3
- snowflake/ml/modeling/covariance/min_cov_det.py +6 -3
- snowflake/ml/modeling/covariance/oas.py +6 -3
- snowflake/ml/modeling/covariance/shrunk_covariance.py +6 -3
- snowflake/ml/modeling/decomposition/dictionary_learning.py +6 -3
- snowflake/ml/modeling/decomposition/factor_analysis.py +6 -3
- snowflake/ml/modeling/decomposition/fast_ica.py +6 -3
- snowflake/ml/modeling/decomposition/incremental_pca.py +6 -3
- snowflake/ml/modeling/decomposition/kernel_pca.py +6 -3
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +6 -3
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +6 -3
- snowflake/ml/modeling/decomposition/pca.py +6 -3
- snowflake/ml/modeling/decomposition/sparse_pca.py +6 -3
- snowflake/ml/modeling/decomposition/truncated_svd.py +6 -3
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -3
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +6 -3
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/bagging_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/bagging_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/isolation_forest.py +6 -3
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/stacking_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/voting_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/voting_regressor.py +6 -3
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +6 -3
- snowflake/ml/modeling/feature_selection/select_fdr.py +6 -3
- snowflake/ml/modeling/feature_selection/select_fpr.py +6 -3
- snowflake/ml/modeling/feature_selection/select_fwe.py +6 -3
- snowflake/ml/modeling/feature_selection/select_k_best.py +6 -3
- snowflake/ml/modeling/feature_selection/select_percentile.py +6 -3
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +6 -3
- snowflake/ml/modeling/feature_selection/variance_threshold.py +6 -3
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +6 -3
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +6 -3
- snowflake/ml/modeling/impute/iterative_imputer.py +6 -3
- snowflake/ml/modeling/impute/knn_imputer.py +6 -3
- snowflake/ml/modeling/impute/missing_indicator.py +6 -3
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +6 -3
- snowflake/ml/modeling/kernel_approximation/nystroem.py +6 -3
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +6 -3
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +6 -3
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +6 -3
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +6 -3
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +6 -3
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/ard_regression.py +6 -3
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +6 -3
- snowflake/ml/modeling/linear_model/elastic_net.py +6 -3
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +6 -3
- snowflake/ml/modeling/linear_model/gamma_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/huber_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/lars.py +6 -3
- snowflake/ml/modeling/linear_model/lars_cv.py +6 -3
- snowflake/ml/modeling/linear_model/lasso.py +6 -3
- snowflake/ml/modeling/linear_model/lasso_cv.py +6 -3
- snowflake/ml/modeling/linear_model/lasso_lars.py +6 -3
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +6 -3
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +6 -3
- snowflake/ml/modeling/linear_model/linear_regression.py +6 -3
- snowflake/ml/modeling/linear_model/logistic_regression.py +6 -3
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +6 -3
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +6 -3
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +6 -3
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +6 -3
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +6 -3
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +6 -3
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +6 -3
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/perceptron.py +6 -3
- snowflake/ml/modeling/linear_model/poisson_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/ransac_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/ridge.py +6 -3
- snowflake/ml/modeling/linear_model/ridge_classifier.py +6 -3
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +6 -3
- snowflake/ml/modeling/linear_model/ridge_cv.py +6 -3
- snowflake/ml/modeling/linear_model/sgd_classifier.py +6 -3
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +6 -3
- snowflake/ml/modeling/linear_model/sgd_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +6 -3
- snowflake/ml/modeling/manifold/isomap.py +6 -3
- snowflake/ml/modeling/manifold/mds.py +6 -3
- snowflake/ml/modeling/manifold/spectral_embedding.py +6 -3
- snowflake/ml/modeling/manifold/tsne.py +6 -3
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +6 -3
- snowflake/ml/modeling/mixture/gaussian_mixture.py +6 -3
- snowflake/ml/modeling/model_selection/grid_search_cv.py +17 -2
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +17 -2
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +6 -3
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +6 -3
- snowflake/ml/modeling/multiclass/output_code_classifier.py +6 -3
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +6 -3
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +6 -3
- snowflake/ml/modeling/naive_bayes/complement_nb.py +6 -3
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +6 -3
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +6 -3
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +6 -3
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +6 -3
- snowflake/ml/modeling/neighbors/kernel_density.py +6 -3
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +6 -3
- snowflake/ml/modeling/neighbors/nearest_centroid.py +6 -3
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +6 -3
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +6 -3
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -3
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +6 -3
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +6 -3
- snowflake/ml/modeling/neural_network/mlp_classifier.py +6 -3
- snowflake/ml/modeling/neural_network/mlp_regressor.py +6 -3
- snowflake/ml/modeling/pipeline/pipeline.py +16 -178
- snowflake/ml/modeling/preprocessing/polynomial_features.py +6 -3
- snowflake/ml/modeling/semi_supervised/label_propagation.py +6 -3
- snowflake/ml/modeling/semi_supervised/label_spreading.py +6 -3
- snowflake/ml/modeling/svm/linear_svc.py +6 -3
- snowflake/ml/modeling/svm/linear_svr.py +6 -3
- snowflake/ml/modeling/svm/nu_svc.py +6 -3
- snowflake/ml/modeling/svm/nu_svr.py +6 -3
- snowflake/ml/modeling/svm/svc.py +6 -3
- snowflake/ml/modeling/svm/svr.py +6 -3
- snowflake/ml/modeling/tree/decision_tree_classifier.py +6 -3
- snowflake/ml/modeling/tree/decision_tree_regressor.py +6 -3
- snowflake/ml/modeling/tree/extra_tree_classifier.py +6 -3
- snowflake/ml/modeling/tree/extra_tree_regressor.py +6 -3
- snowflake/ml/modeling/xgboost/xgb_classifier.py +167 -91
- snowflake/ml/modeling/xgboost/xgb_regressor.py +166 -88
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +166 -88
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +166 -88
- snowflake/ml/monitoring/_client/model_monitor_sql_client.py +4 -4
- snowflake/ml/registry/_manager/model_manager.py +70 -33
- snowflake/ml/registry/registry.py +41 -22
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/METADATA +63 -19
- {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/RECORD +231 -226
- {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/WHEEL +1 -1
- snowflake/ml/_internal/utils/retryable_http.py +0 -39
- snowflake/ml/fileset/parquet_parser.py +0 -170
- snowflake/ml/fileset/tf_dataset.py +0 -88
- snowflake/ml/fileset/torch_datapipe.py +0 -57
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +0 -151
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_trainer.py +0 -66
- {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/top_level.txt +0 -0
@@ -38,6 +38,7 @@ from snowflake.ml.model.model_signature import (
|
|
38
38
|
FeatureSpec,
|
39
39
|
ModelSignature,
|
40
40
|
_infer_signature,
|
41
|
+
_truncate_data,
|
41
42
|
_rename_signature_with_snowflake_identifiers,
|
42
43
|
)
|
43
44
|
|
@@ -58,6 +59,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.feature_selection".repla
|
|
58
59
|
|
59
60
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
60
61
|
|
62
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
63
|
+
|
61
64
|
class SelectFwe(BaseTransformer):
|
62
65
|
r"""Filter: Select the p-values corresponding to Family-wise error rate
|
63
66
|
For more details on this class, see [sklearn.feature_selection.SelectFwe]
|
@@ -410,7 +413,7 @@ class SelectFwe(BaseTransformer):
|
|
410
413
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
411
414
|
expected_dtype = "array"
|
412
415
|
else:
|
413
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
416
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
414
417
|
# We can only infer the output types from the input types if the following two statements are true:
|
415
418
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
416
419
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1067,7 +1070,7 @@ class SelectFwe(BaseTransformer):
|
|
1067
1070
|
|
1068
1071
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1069
1072
|
|
1070
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1073
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1071
1074
|
outputs: List[BaseFeatureSpec] = []
|
1072
1075
|
if hasattr(self, "predict"):
|
1073
1076
|
# keep mypy happy
|
@@ -1075,7 +1078,7 @@ class SelectFwe(BaseTransformer):
|
|
1075
1078
|
# For classifier, the type of predict is the same as the type of label
|
1076
1079
|
if self._sklearn_object._estimator_type == "classifier":
|
1077
1080
|
# label columns is the desired type for output
|
1078
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1081
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1079
1082
|
# rename the output columns
|
1080
1083
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1081
1084
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -38,6 +38,7 @@ from snowflake.ml.model.model_signature import (
|
|
38
38
|
FeatureSpec,
|
39
39
|
ModelSignature,
|
40
40
|
_infer_signature,
|
41
|
+
_truncate_data,
|
41
42
|
_rename_signature_with_snowflake_identifiers,
|
42
43
|
)
|
43
44
|
|
@@ -58,6 +59,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.feature_selection".repla
|
|
58
59
|
|
59
60
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
60
61
|
|
62
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
63
|
+
|
61
64
|
class SelectKBest(BaseTransformer):
|
62
65
|
r"""Select features according to the k highest scores
|
63
66
|
For more details on this class, see [sklearn.feature_selection.SelectKBest]
|
@@ -411,7 +414,7 @@ class SelectKBest(BaseTransformer):
|
|
411
414
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
412
415
|
expected_dtype = "array"
|
413
416
|
else:
|
414
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
417
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
415
418
|
# We can only infer the output types from the input types if the following two statements are true:
|
416
419
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
417
420
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1068,7 +1071,7 @@ class SelectKBest(BaseTransformer):
|
|
1068
1071
|
|
1069
1072
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1070
1073
|
|
1071
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1074
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1072
1075
|
outputs: List[BaseFeatureSpec] = []
|
1073
1076
|
if hasattr(self, "predict"):
|
1074
1077
|
# keep mypy happy
|
@@ -1076,7 +1079,7 @@ class SelectKBest(BaseTransformer):
|
|
1076
1079
|
# For classifier, the type of predict is the same as the type of label
|
1077
1080
|
if self._sklearn_object._estimator_type == "classifier":
|
1078
1081
|
# label columns is the desired type for output
|
1079
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1082
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1080
1083
|
# rename the output columns
|
1081
1084
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1082
1085
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -38,6 +38,7 @@ from snowflake.ml.model.model_signature import (
|
|
38
38
|
FeatureSpec,
|
39
39
|
ModelSignature,
|
40
40
|
_infer_signature,
|
41
|
+
_truncate_data,
|
41
42
|
_rename_signature_with_snowflake_identifiers,
|
42
43
|
)
|
43
44
|
|
@@ -58,6 +59,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.feature_selection".repla
|
|
58
59
|
|
59
60
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
60
61
|
|
62
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
63
|
+
|
61
64
|
class SelectPercentile(BaseTransformer):
|
62
65
|
r"""Select features according to a percentile of the highest scores
|
63
66
|
For more details on this class, see [sklearn.feature_selection.SelectPercentile]
|
@@ -410,7 +413,7 @@ class SelectPercentile(BaseTransformer):
|
|
410
413
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
411
414
|
expected_dtype = "array"
|
412
415
|
else:
|
413
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
416
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
414
417
|
# We can only infer the output types from the input types if the following two statements are true:
|
415
418
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
416
419
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1067,7 +1070,7 @@ class SelectPercentile(BaseTransformer):
|
|
1067
1070
|
|
1068
1071
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1069
1072
|
|
1070
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1073
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1071
1074
|
outputs: List[BaseFeatureSpec] = []
|
1072
1075
|
if hasattr(self, "predict"):
|
1073
1076
|
# keep mypy happy
|
@@ -1075,7 +1078,7 @@ class SelectPercentile(BaseTransformer):
|
|
1075
1078
|
# For classifier, the type of predict is the same as the type of label
|
1076
1079
|
if self._sklearn_object._estimator_type == "classifier":
|
1077
1080
|
# label columns is the desired type for output
|
1078
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1081
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1079
1082
|
# rename the output columns
|
1080
1083
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1081
1084
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.feature_selection".repla
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class SequentialFeatureSelector(BaseTransformer):
|
61
64
|
r"""Transformer that performs Sequential Feature Selection
|
62
65
|
For more details on this class, see [sklearn.feature_selection.SequentialFeatureSelector]
|
@@ -472,7 +475,7 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
472
475
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
473
476
|
expected_dtype = "array"
|
474
477
|
else:
|
475
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
478
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
476
479
|
# We can only infer the output types from the input types if the following two statements are true:
|
477
480
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
478
481
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1129,7 +1132,7 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
1129
1132
|
|
1130
1133
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1131
1134
|
|
1132
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1135
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1133
1136
|
outputs: List[BaseFeatureSpec] = []
|
1134
1137
|
if hasattr(self, "predict"):
|
1135
1138
|
# keep mypy happy
|
@@ -1137,7 +1140,7 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
1137
1140
|
# For classifier, the type of predict is the same as the type of label
|
1138
1141
|
if self._sklearn_object._estimator_type == "classifier":
|
1139
1142
|
# label columns is the desired type for output
|
1140
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1143
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1141
1144
|
# rename the output columns
|
1142
1145
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1143
1146
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.feature_selection".repla
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class VarianceThreshold(BaseTransformer):
|
61
64
|
r"""Feature selector that removes all low-variance features
|
62
65
|
For more details on this class, see [sklearn.feature_selection.VarianceThreshold]
|
@@ -403,7 +406,7 @@ class VarianceThreshold(BaseTransformer):
|
|
403
406
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
404
407
|
expected_dtype = "array"
|
405
408
|
else:
|
406
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
409
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
407
410
|
# We can only infer the output types from the input types if the following two statements are true:
|
408
411
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
409
412
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1060,7 +1063,7 @@ class VarianceThreshold(BaseTransformer):
|
|
1060
1063
|
|
1061
1064
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1062
1065
|
|
1063
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1066
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1064
1067
|
outputs: List[BaseFeatureSpec] = []
|
1065
1068
|
if hasattr(self, "predict"):
|
1066
1069
|
# keep mypy happy
|
@@ -1068,7 +1071,7 @@ class VarianceThreshold(BaseTransformer):
|
|
1068
1071
|
# For classifier, the type of predict is the same as the type of label
|
1069
1072
|
if self._sklearn_object._estimator_type == "classifier":
|
1070
1073
|
# label columns is the desired type for output
|
1071
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1074
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1072
1075
|
# rename the output columns
|
1073
1076
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1074
1077
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.gaussian_process".replac
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class GaussianProcessClassifier(BaseTransformer):
|
61
64
|
r"""Gaussian process classification (GPC) based on Laplace approximation
|
62
65
|
For more details on this class, see [sklearn.gaussian_process.GaussianProcessClassifier]
|
@@ -496,7 +499,7 @@ class GaussianProcessClassifier(BaseTransformer):
|
|
496
499
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
497
500
|
expected_dtype = "array"
|
498
501
|
else:
|
499
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
502
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
500
503
|
# We can only infer the output types from the input types if the following two statements are true:
|
501
504
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
502
505
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1157,7 +1160,7 @@ class GaussianProcessClassifier(BaseTransformer):
|
|
1157
1160
|
|
1158
1161
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1159
1162
|
|
1160
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1163
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1161
1164
|
outputs: List[BaseFeatureSpec] = []
|
1162
1165
|
if hasattr(self, "predict"):
|
1163
1166
|
# keep mypy happy
|
@@ -1165,7 +1168,7 @@ class GaussianProcessClassifier(BaseTransformer):
|
|
1165
1168
|
# For classifier, the type of predict is the same as the type of label
|
1166
1169
|
if self._sklearn_object._estimator_type == "classifier":
|
1167
1170
|
# label columns is the desired type for output
|
1168
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1171
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1169
1172
|
# rename the output columns
|
1170
1173
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1171
1174
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.gaussian_process".replac
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class GaussianProcessRegressor(BaseTransformer):
|
61
64
|
r"""Gaussian process regression (GPR)
|
62
65
|
For more details on this class, see [sklearn.gaussian_process.GaussianProcessRegressor]
|
@@ -487,7 +490,7 @@ class GaussianProcessRegressor(BaseTransformer):
|
|
487
490
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
488
491
|
expected_dtype = "array"
|
489
492
|
else:
|
490
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
493
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
491
494
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
492
495
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
493
496
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1144,7 +1147,7 @@ class GaussianProcessRegressor(BaseTransformer):
|
|
1144
1147
|
|
1145
1148
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1146
1149
|
|
1147
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1150
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1148
1151
|
outputs: List[BaseFeatureSpec] = []
|
1149
1152
|
if hasattr(self, "predict"):
|
1150
1153
|
# keep mypy happy
|
@@ -1152,7 +1155,7 @@ class GaussianProcessRegressor(BaseTransformer):
|
|
1152
1155
|
# For classifier, the type of predict is the same as the type of label
|
1153
1156
|
if self._sklearn_object._estimator_type == "classifier":
|
1154
1157
|
# label columns is the desired type for output
|
1155
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1158
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1156
1159
|
# rename the output columns
|
1157
1160
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1158
1161
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -38,6 +38,7 @@ from snowflake.ml.model.model_signature import (
|
|
38
38
|
FeatureSpec,
|
39
39
|
ModelSignature,
|
40
40
|
_infer_signature,
|
41
|
+
_truncate_data,
|
41
42
|
_rename_signature_with_snowflake_identifiers,
|
42
43
|
)
|
43
44
|
|
@@ -58,6 +59,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.impute".replace("sklearn
|
|
58
59
|
|
59
60
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
60
61
|
|
62
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
63
|
+
|
61
64
|
class IterativeImputer(BaseTransformer):
|
62
65
|
r"""Multivariate imputer that estimates each feature from all the others
|
63
66
|
For more details on this class, see [sklearn.impute.IterativeImputer]
|
@@ -531,7 +534,7 @@ class IterativeImputer(BaseTransformer):
|
|
531
534
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
532
535
|
expected_dtype = "array"
|
533
536
|
else:
|
534
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
537
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
535
538
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
536
539
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
537
540
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1188,7 +1191,7 @@ class IterativeImputer(BaseTransformer):
|
|
1188
1191
|
|
1189
1192
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1190
1193
|
|
1191
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1194
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1192
1195
|
outputs: List[BaseFeatureSpec] = []
|
1193
1196
|
if hasattr(self, "predict"):
|
1194
1197
|
# keep mypy happy
|
@@ -1196,7 +1199,7 @@ class IterativeImputer(BaseTransformer):
|
|
1196
1199
|
# For classifier, the type of predict is the same as the type of label
|
1197
1200
|
if self._sklearn_object._estimator_type == "classifier":
|
1198
1201
|
# label columns is the desired type for output
|
1199
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1202
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1200
1203
|
# rename the output columns
|
1201
1204
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1202
1205
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.impute".replace("sklearn
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class KNNImputer(BaseTransformer):
|
61
64
|
r"""Imputation for completing missing values using k-Nearest Neighbors
|
62
65
|
For more details on this class, see [sklearn.impute.KNNImputer]
|
@@ -457,7 +460,7 @@ class KNNImputer(BaseTransformer):
|
|
457
460
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
458
461
|
expected_dtype = "array"
|
459
462
|
else:
|
460
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
463
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
461
464
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
462
465
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
463
466
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1114,7 +1117,7 @@ class KNNImputer(BaseTransformer):
|
|
1114
1117
|
|
1115
1118
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1116
1119
|
|
1117
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1120
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1118
1121
|
outputs: List[BaseFeatureSpec] = []
|
1119
1122
|
if hasattr(self, "predict"):
|
1120
1123
|
# keep mypy happy
|
@@ -1122,7 +1125,7 @@ class KNNImputer(BaseTransformer):
|
|
1122
1125
|
# For classifier, the type of predict is the same as the type of label
|
1123
1126
|
if self._sklearn_object._estimator_type == "classifier":
|
1124
1127
|
# label columns is the desired type for output
|
1125
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1128
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1126
1129
|
# rename the output columns
|
1127
1130
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1128
1131
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.impute".replace("sklearn
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class MissingIndicator(BaseTransformer):
|
61
64
|
r"""Binary indicators for missing values
|
62
65
|
For more details on this class, see [sklearn.impute.MissingIndicator]
|
@@ -431,7 +434,7 @@ class MissingIndicator(BaseTransformer):
|
|
431
434
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
432
435
|
expected_dtype = "array"
|
433
436
|
else:
|
434
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
437
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
435
438
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
436
439
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
437
440
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1088,7 +1091,7 @@ class MissingIndicator(BaseTransformer):
|
|
1088
1091
|
|
1089
1092
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1090
1093
|
|
1091
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1094
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1092
1095
|
outputs: List[BaseFeatureSpec] = []
|
1093
1096
|
if hasattr(self, "predict"):
|
1094
1097
|
# keep mypy happy
|
@@ -1096,7 +1099,7 @@ class MissingIndicator(BaseTransformer):
|
|
1096
1099
|
# For classifier, the type of predict is the same as the type of label
|
1097
1100
|
if self._sklearn_object._estimator_type == "classifier":
|
1098
1101
|
# label columns is the desired type for output
|
1099
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1102
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1100
1103
|
# rename the output columns
|
1101
1104
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1102
1105
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class AdditiveChi2Sampler(BaseTransformer):
|
61
64
|
r"""Approximate feature map for additive chi2 kernel
|
62
65
|
For more details on this class, see [sklearn.kernel_approximation.AdditiveChi2Sampler]
|
@@ -406,7 +409,7 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
406
409
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
407
410
|
expected_dtype = "array"
|
408
411
|
else:
|
409
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
412
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
410
413
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
411
414
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
412
415
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1063,7 +1066,7 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
1063
1066
|
|
1064
1067
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1065
1068
|
|
1066
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1069
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1067
1070
|
outputs: List[BaseFeatureSpec] = []
|
1068
1071
|
if hasattr(self, "predict"):
|
1069
1072
|
# keep mypy happy
|
@@ -1071,7 +1074,7 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
1071
1074
|
# For classifier, the type of predict is the same as the type of label
|
1072
1075
|
if self._sklearn_object._estimator_type == "classifier":
|
1073
1076
|
# label columns is the desired type for output
|
1074
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1077
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1075
1078
|
# rename the output columns
|
1076
1079
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1077
1080
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class Nystroem(BaseTransformer):
|
61
64
|
r"""Approximate a kernel map using a subset of the training data
|
62
65
|
For more details on this class, see [sklearn.kernel_approximation.Nystroem]
|
@@ -454,7 +457,7 @@ class Nystroem(BaseTransformer):
|
|
454
457
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
455
458
|
expected_dtype = "array"
|
456
459
|
else:
|
457
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
460
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
458
461
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
459
462
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
460
463
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1111,7 +1114,7 @@ class Nystroem(BaseTransformer):
|
|
1111
1114
|
|
1112
1115
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1113
1116
|
|
1114
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1117
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1115
1118
|
outputs: List[BaseFeatureSpec] = []
|
1116
1119
|
if hasattr(self, "predict"):
|
1117
1120
|
# keep mypy happy
|
@@ -1119,7 +1122,7 @@ class Nystroem(BaseTransformer):
|
|
1119
1122
|
# For classifier, the type of predict is the same as the type of label
|
1120
1123
|
if self._sklearn_object._estimator_type == "classifier":
|
1121
1124
|
# label columns is the desired type for output
|
1122
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1125
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1123
1126
|
# rename the output columns
|
1124
1127
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1125
1128
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class PolynomialCountSketch(BaseTransformer):
|
61
64
|
r"""Polynomial kernel approximation via Tensor Sketch
|
62
65
|
For more details on this class, see [sklearn.kernel_approximation.PolynomialCountSketch]
|
@@ -430,7 +433,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
430
433
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
431
434
|
expected_dtype = "array"
|
432
435
|
else:
|
433
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
436
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
434
437
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
435
438
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
436
439
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1087,7 +1090,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
1087
1090
|
|
1088
1091
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1089
1092
|
|
1090
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1093
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1091
1094
|
outputs: List[BaseFeatureSpec] = []
|
1092
1095
|
if hasattr(self, "predict"):
|
1093
1096
|
# keep mypy happy
|
@@ -1095,7 +1098,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
1095
1098
|
# For classifier, the type of predict is the same as the type of label
|
1096
1099
|
if self._sklearn_object._estimator_type == "classifier":
|
1097
1100
|
# label columns is the desired type for output
|
1098
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1101
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1099
1102
|
# rename the output columns
|
1100
1103
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1101
1104
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class RBFSampler(BaseTransformer):
|
61
64
|
r"""Approximate a RBF kernel feature map using random Fourier features
|
62
65
|
For more details on this class, see [sklearn.kernel_approximation.RBFSampler]
|
@@ -417,7 +420,7 @@ class RBFSampler(BaseTransformer):
|
|
417
420
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
418
421
|
expected_dtype = "array"
|
419
422
|
else:
|
420
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
423
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
421
424
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
422
425
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
423
426
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1074,7 +1077,7 @@ class RBFSampler(BaseTransformer):
|
|
1074
1077
|
|
1075
1078
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1076
1079
|
|
1077
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1080
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1078
1081
|
outputs: List[BaseFeatureSpec] = []
|
1079
1082
|
if hasattr(self, "predict"):
|
1080
1083
|
# keep mypy happy
|
@@ -1082,7 +1085,7 @@ class RBFSampler(BaseTransformer):
|
|
1082
1085
|
# For classifier, the type of predict is the same as the type of label
|
1083
1086
|
if self._sklearn_object._estimator_type == "classifier":
|
1084
1087
|
# label columns is the desired type for output
|
1085
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1088
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1086
1089
|
# rename the output columns
|
1087
1090
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1088
1091
|
self._model_signature_dict["predict"] = ModelSignature(
|