snowflake-ml-python 1.7.2__py3-none-any.whl → 1.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/__init__.py +16 -8
- snowflake/cortex/_classify_text.py +12 -1
- snowflake/cortex/_complete.py +101 -13
- snowflake/cortex/_embed_text_1024.py +9 -2
- snowflake/cortex/_embed_text_768.py +9 -2
- snowflake/cortex/_extract_answer.py +9 -2
- snowflake/cortex/_sentiment.py +9 -2
- snowflake/cortex/_summarize.py +9 -2
- snowflake/cortex/_translate.py +9 -2
- snowflake/ml/_internal/env_utils.py +7 -52
- snowflake/ml/_internal/platform_capabilities.py +87 -0
- snowflake/ml/_internal/utils/identifier.py +4 -2
- snowflake/ml/data/__init__.py +3 -0
- snowflake/ml/data/_internal/arrow_ingestor.py +4 -4
- snowflake/ml/data/data_connector.py +53 -11
- snowflake/ml/data/data_ingestor.py +2 -1
- snowflake/ml/data/torch_utils.py +18 -5
- snowflake/ml/dataset/dataset.py +0 -1
- snowflake/ml/feature_store/examples/example_helper.py +2 -1
- snowflake/ml/fileset/fileset.py +24 -18
- snowflake/ml/jobs/__init__.py +21 -0
- snowflake/ml/jobs/_utils/constants.py +51 -0
- snowflake/ml/jobs/_utils/payload_utils.py +352 -0
- snowflake/ml/jobs/_utils/spec_utils.py +298 -0
- snowflake/ml/jobs/_utils/types.py +39 -0
- snowflake/ml/jobs/decorators.py +91 -0
- snowflake/ml/jobs/job.py +113 -0
- snowflake/ml/jobs/manager.py +298 -0
- snowflake/ml/model/_client/model/model_version_impl.py +5 -3
- snowflake/ml/model/_client/ops/model_ops.py +13 -8
- snowflake/ml/model/_client/ops/service_ops.py +1 -11
- snowflake/ml/model/_client/sql/model_version.py +11 -0
- snowflake/ml/model/_client/sql/service.py +13 -6
- snowflake/ml/model/_model_composer/model_composer.py +8 -3
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +20 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +1 -0
- snowflake/ml/model/_model_composer/model_method/constants.py +1 -0
- snowflake/ml/model/_model_composer/model_method/function_generator.py +2 -0
- snowflake/ml/model/_model_composer/model_method/infer_function.py_template +1 -1
- snowflake/ml/model/_model_composer/model_method/infer_partitioned.py_template +1 -1
- snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +1 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +9 -1
- snowflake/ml/model/_model_composer/model_user_file/model_user_file.py +27 -0
- snowflake/ml/model/_packager/model_handlers/_utils.py +39 -5
- snowflake/ml/model/_packager/model_handlers/catboost.py +3 -3
- snowflake/ml/model/_packager/model_handlers/custom.py +1 -2
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +6 -1
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +5 -3
- snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +55 -20
- snowflake/ml/model/_packager/model_handlers/sklearn.py +9 -10
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +66 -28
- snowflake/ml/model/_packager/model_handlers/tensorflow.py +70 -17
- snowflake/ml/model/_packager/model_handlers/xgboost.py +3 -3
- snowflake/ml/model/_packager/model_meta/model_meta.py +3 -0
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +6 -1
- snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -2
- snowflake/ml/model/_packager/model_task/model_task_utils.py +3 -2
- snowflake/ml/model/_signatures/base_handler.py +1 -2
- snowflake/ml/model/_signatures/builtins_handler.py +2 -2
- snowflake/ml/model/_signatures/numpy_handler.py +6 -7
- snowflake/ml/model/_signatures/pandas_handler.py +3 -3
- snowflake/ml/model/_signatures/pytorch_handler.py +2 -5
- snowflake/ml/model/_signatures/snowpark_handler.py +11 -5
- snowflake/ml/model/_signatures/tensorflow_handler.py +2 -7
- snowflake/ml/model/model_signature.py +17 -4
- snowflake/ml/model/type_hints.py +1 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +0 -8
- snowflake/ml/modeling/_internal/model_transformer_builder.py +0 -13
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +6 -3
- snowflake/ml/modeling/cluster/affinity_propagation.py +6 -3
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +6 -3
- snowflake/ml/modeling/cluster/birch.py +6 -3
- snowflake/ml/modeling/cluster/bisecting_k_means.py +6 -3
- snowflake/ml/modeling/cluster/dbscan.py +6 -3
- snowflake/ml/modeling/cluster/feature_agglomeration.py +6 -3
- snowflake/ml/modeling/cluster/k_means.py +6 -3
- snowflake/ml/modeling/cluster/mean_shift.py +6 -3
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +6 -3
- snowflake/ml/modeling/cluster/optics.py +6 -3
- snowflake/ml/modeling/cluster/spectral_biclustering.py +6 -3
- snowflake/ml/modeling/cluster/spectral_clustering.py +6 -3
- snowflake/ml/modeling/cluster/spectral_coclustering.py +6 -3
- snowflake/ml/modeling/compose/column_transformer.py +6 -3
- snowflake/ml/modeling/compose/transformed_target_regressor.py +6 -3
- snowflake/ml/modeling/covariance/elliptic_envelope.py +6 -3
- snowflake/ml/modeling/covariance/empirical_covariance.py +6 -3
- snowflake/ml/modeling/covariance/graphical_lasso.py +6 -3
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +6 -3
- snowflake/ml/modeling/covariance/ledoit_wolf.py +6 -3
- snowflake/ml/modeling/covariance/min_cov_det.py +6 -3
- snowflake/ml/modeling/covariance/oas.py +6 -3
- snowflake/ml/modeling/covariance/shrunk_covariance.py +6 -3
- snowflake/ml/modeling/decomposition/dictionary_learning.py +6 -3
- snowflake/ml/modeling/decomposition/factor_analysis.py +6 -3
- snowflake/ml/modeling/decomposition/fast_ica.py +6 -3
- snowflake/ml/modeling/decomposition/incremental_pca.py +6 -3
- snowflake/ml/modeling/decomposition/kernel_pca.py +6 -3
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +6 -3
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +6 -3
- snowflake/ml/modeling/decomposition/pca.py +6 -3
- snowflake/ml/modeling/decomposition/sparse_pca.py +6 -3
- snowflake/ml/modeling/decomposition/truncated_svd.py +6 -3
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -3
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +6 -3
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/bagging_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/bagging_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/isolation_forest.py +6 -3
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/stacking_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/voting_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/voting_regressor.py +6 -3
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +6 -3
- snowflake/ml/modeling/feature_selection/select_fdr.py +6 -3
- snowflake/ml/modeling/feature_selection/select_fpr.py +6 -3
- snowflake/ml/modeling/feature_selection/select_fwe.py +6 -3
- snowflake/ml/modeling/feature_selection/select_k_best.py +6 -3
- snowflake/ml/modeling/feature_selection/select_percentile.py +6 -3
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +6 -3
- snowflake/ml/modeling/feature_selection/variance_threshold.py +6 -3
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +6 -3
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +6 -3
- snowflake/ml/modeling/impute/iterative_imputer.py +6 -3
- snowflake/ml/modeling/impute/knn_imputer.py +6 -3
- snowflake/ml/modeling/impute/missing_indicator.py +6 -3
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +6 -3
- snowflake/ml/modeling/kernel_approximation/nystroem.py +6 -3
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +6 -3
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +6 -3
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +6 -3
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +6 -3
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +6 -3
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/ard_regression.py +6 -3
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +6 -3
- snowflake/ml/modeling/linear_model/elastic_net.py +6 -3
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +6 -3
- snowflake/ml/modeling/linear_model/gamma_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/huber_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/lars.py +6 -3
- snowflake/ml/modeling/linear_model/lars_cv.py +6 -3
- snowflake/ml/modeling/linear_model/lasso.py +6 -3
- snowflake/ml/modeling/linear_model/lasso_cv.py +6 -3
- snowflake/ml/modeling/linear_model/lasso_lars.py +6 -3
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +6 -3
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +6 -3
- snowflake/ml/modeling/linear_model/linear_regression.py +6 -3
- snowflake/ml/modeling/linear_model/logistic_regression.py +6 -3
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +6 -3
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +6 -3
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +6 -3
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +6 -3
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +6 -3
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +6 -3
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +6 -3
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/perceptron.py +6 -3
- snowflake/ml/modeling/linear_model/poisson_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/ransac_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/ridge.py +6 -3
- snowflake/ml/modeling/linear_model/ridge_classifier.py +6 -3
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +6 -3
- snowflake/ml/modeling/linear_model/ridge_cv.py +6 -3
- snowflake/ml/modeling/linear_model/sgd_classifier.py +6 -3
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +6 -3
- snowflake/ml/modeling/linear_model/sgd_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +6 -3
- snowflake/ml/modeling/manifold/isomap.py +6 -3
- snowflake/ml/modeling/manifold/mds.py +6 -3
- snowflake/ml/modeling/manifold/spectral_embedding.py +6 -3
- snowflake/ml/modeling/manifold/tsne.py +6 -3
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +6 -3
- snowflake/ml/modeling/mixture/gaussian_mixture.py +6 -3
- snowflake/ml/modeling/model_selection/grid_search_cv.py +17 -2
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +17 -2
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +6 -3
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +6 -3
- snowflake/ml/modeling/multiclass/output_code_classifier.py +6 -3
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +6 -3
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +6 -3
- snowflake/ml/modeling/naive_bayes/complement_nb.py +6 -3
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +6 -3
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +6 -3
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +6 -3
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +6 -3
- snowflake/ml/modeling/neighbors/kernel_density.py +6 -3
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +6 -3
- snowflake/ml/modeling/neighbors/nearest_centroid.py +6 -3
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +6 -3
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +6 -3
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -3
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +6 -3
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +6 -3
- snowflake/ml/modeling/neural_network/mlp_classifier.py +6 -3
- snowflake/ml/modeling/neural_network/mlp_regressor.py +6 -3
- snowflake/ml/modeling/pipeline/pipeline.py +16 -178
- snowflake/ml/modeling/preprocessing/polynomial_features.py +6 -3
- snowflake/ml/modeling/semi_supervised/label_propagation.py +6 -3
- snowflake/ml/modeling/semi_supervised/label_spreading.py +6 -3
- snowflake/ml/modeling/svm/linear_svc.py +6 -3
- snowflake/ml/modeling/svm/linear_svr.py +6 -3
- snowflake/ml/modeling/svm/nu_svc.py +6 -3
- snowflake/ml/modeling/svm/nu_svr.py +6 -3
- snowflake/ml/modeling/svm/svc.py +6 -3
- snowflake/ml/modeling/svm/svr.py +6 -3
- snowflake/ml/modeling/tree/decision_tree_classifier.py +6 -3
- snowflake/ml/modeling/tree/decision_tree_regressor.py +6 -3
- snowflake/ml/modeling/tree/extra_tree_classifier.py +6 -3
- snowflake/ml/modeling/tree/extra_tree_regressor.py +6 -3
- snowflake/ml/modeling/xgboost/xgb_classifier.py +167 -91
- snowflake/ml/modeling/xgboost/xgb_regressor.py +166 -88
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +166 -88
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +166 -88
- snowflake/ml/monitoring/_client/model_monitor_sql_client.py +4 -4
- snowflake/ml/registry/_manager/model_manager.py +70 -33
- snowflake/ml/registry/registry.py +41 -22
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/METADATA +63 -19
- {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/RECORD +231 -226
- {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/WHEEL +1 -1
- snowflake/ml/_internal/utils/retryable_http.py +0 -39
- snowflake/ml/fileset/parquet_parser.py +0 -170
- snowflake/ml/fileset/tf_dataset.py +0 -88
- snowflake/ml/fileset/torch_datapipe.py +0 -57
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +0 -151
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_trainer.py +0 -66
- {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/top_level.txt +0 -0
@@ -1,16 +1,11 @@
|
|
1
|
-
import os
|
2
1
|
from typing import Optional, Union
|
3
2
|
|
4
3
|
import pandas as pd
|
5
4
|
|
6
5
|
from snowflake import snowpark
|
7
|
-
from snowflake.ml.modeling._internal.constants import IN_ML_RUNTIME_ENV_VAR
|
8
6
|
from snowflake.ml.modeling._internal.local_implementations.pandas_handlers import (
|
9
7
|
PandasTransformHandlers,
|
10
8
|
)
|
11
|
-
from snowflake.ml.modeling._internal.ml_runtime_implementations.ml_runtime_handlers import (
|
12
|
-
MLRuntimeTransformHandlers,
|
13
|
-
)
|
14
9
|
from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import (
|
15
10
|
SnowparkTransformHandlers,
|
16
11
|
)
|
@@ -63,14 +58,6 @@ class ModelTransformerBuilder:
|
|
63
58
|
)
|
64
59
|
|
65
60
|
elif isinstance(dataset, snowpark.DataFrame):
|
66
|
-
if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
|
67
|
-
return MLRuntimeTransformHandlers(
|
68
|
-
dataset=dataset,
|
69
|
-
estimator=estimator,
|
70
|
-
class_name=class_name,
|
71
|
-
subproject=subproject,
|
72
|
-
autogenerated=autogenerated,
|
73
|
-
)
|
74
61
|
return SnowparkTransformHandlers(
|
75
62
|
dataset=dataset,
|
76
63
|
estimator=estimator,
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.calibration".replace("sk
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class CalibratedClassifierCV(BaseTransformer):
|
61
64
|
r"""Probability calibration with isotonic regression or logistic regression
|
62
65
|
For more details on this class, see [sklearn.calibration.CalibratedClassifierCV]
|
@@ -465,7 +468,7 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
465
468
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
466
469
|
expected_dtype = "array"
|
467
470
|
else:
|
468
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
471
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
469
472
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
470
473
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
471
474
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1126,7 +1129,7 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
1126
1129
|
|
1127
1130
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1128
1131
|
|
1129
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1132
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1130
1133
|
outputs: List[BaseFeatureSpec] = []
|
1131
1134
|
if hasattr(self, "predict"):
|
1132
1135
|
# keep mypy happy
|
@@ -1134,7 +1137,7 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
1134
1137
|
# For classifier, the type of predict is the same as the type of label
|
1135
1138
|
if self._sklearn_object._estimator_type == "classifier":
|
1136
1139
|
# label columns is the desired type for output
|
1137
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1140
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1138
1141
|
# rename the output columns
|
1139
1142
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1140
1143
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class AffinityPropagation(BaseTransformer):
|
61
64
|
r"""Perform Affinity Propagation Clustering of data
|
62
65
|
For more details on this class, see [sklearn.cluster.AffinityPropagation]
|
@@ -449,7 +452,7 @@ class AffinityPropagation(BaseTransformer):
|
|
449
452
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
450
453
|
expected_dtype = "array"
|
451
454
|
else:
|
452
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
455
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
453
456
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
454
457
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
455
458
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1106,7 +1109,7 @@ class AffinityPropagation(BaseTransformer):
|
|
1106
1109
|
|
1107
1110
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1108
1111
|
|
1109
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1112
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1110
1113
|
outputs: List[BaseFeatureSpec] = []
|
1111
1114
|
if hasattr(self, "predict"):
|
1112
1115
|
# keep mypy happy
|
@@ -1114,7 +1117,7 @@ class AffinityPropagation(BaseTransformer):
|
|
1114
1117
|
# For classifier, the type of predict is the same as the type of label
|
1115
1118
|
if self._sklearn_object._estimator_type == "classifier":
|
1116
1119
|
# label columns is the desired type for output
|
1117
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1120
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1118
1121
|
# rename the output columns
|
1119
1122
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1120
1123
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class AgglomerativeClustering(BaseTransformer):
|
61
64
|
r"""Agglomerative Clustering
|
62
65
|
For more details on this class, see [sklearn.cluster.AgglomerativeClustering]
|
@@ -478,7 +481,7 @@ class AgglomerativeClustering(BaseTransformer):
|
|
478
481
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
479
482
|
expected_dtype = "array"
|
480
483
|
else:
|
481
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
484
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
482
485
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
483
486
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
484
487
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1135,7 +1138,7 @@ class AgglomerativeClustering(BaseTransformer):
|
|
1135
1138
|
|
1136
1139
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1137
1140
|
|
1138
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1141
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1139
1142
|
outputs: List[BaseFeatureSpec] = []
|
1140
1143
|
if hasattr(self, "predict"):
|
1141
1144
|
# keep mypy happy
|
@@ -1143,7 +1146,7 @@ class AgglomerativeClustering(BaseTransformer):
|
|
1143
1146
|
# For classifier, the type of predict is the same as the type of label
|
1144
1147
|
if self._sklearn_object._estimator_type == "classifier":
|
1145
1148
|
# label columns is the desired type for output
|
1146
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1149
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1147
1150
|
# rename the output columns
|
1148
1151
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1149
1152
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class Birch(BaseTransformer):
|
61
64
|
r"""Implements the BIRCH clustering algorithm
|
62
65
|
For more details on this class, see [sklearn.cluster.Birch]
|
@@ -442,7 +445,7 @@ class Birch(BaseTransformer):
|
|
442
445
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
443
446
|
expected_dtype = "array"
|
444
447
|
else:
|
445
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
448
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
446
449
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
447
450
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
448
451
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1101,7 +1104,7 @@ class Birch(BaseTransformer):
|
|
1101
1104
|
|
1102
1105
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1103
1106
|
|
1104
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1107
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1105
1108
|
outputs: List[BaseFeatureSpec] = []
|
1106
1109
|
if hasattr(self, "predict"):
|
1107
1110
|
# keep mypy happy
|
@@ -1109,7 +1112,7 @@ class Birch(BaseTransformer):
|
|
1109
1112
|
# For classifier, the type of predict is the same as the type of label
|
1110
1113
|
if self._sklearn_object._estimator_type == "classifier":
|
1111
1114
|
# label columns is the desired type for output
|
1112
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1115
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1113
1116
|
# rename the output columns
|
1114
1117
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1115
1118
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class BisectingKMeans(BaseTransformer):
|
61
64
|
r"""Bisecting K-Means clustering
|
62
65
|
For more details on this class, see [sklearn.cluster.BisectingKMeans]
|
@@ -491,7 +494,7 @@ class BisectingKMeans(BaseTransformer):
|
|
491
494
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
492
495
|
expected_dtype = "array"
|
493
496
|
else:
|
494
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
497
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
495
498
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
496
499
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
497
500
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1152,7 +1155,7 @@ class BisectingKMeans(BaseTransformer):
|
|
1152
1155
|
|
1153
1156
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1154
1157
|
|
1155
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1158
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1156
1159
|
outputs: List[BaseFeatureSpec] = []
|
1157
1160
|
if hasattr(self, "predict"):
|
1158
1161
|
# keep mypy happy
|
@@ -1160,7 +1163,7 @@ class BisectingKMeans(BaseTransformer):
|
|
1160
1163
|
# For classifier, the type of predict is the same as the type of label
|
1161
1164
|
if self._sklearn_object._estimator_type == "classifier":
|
1162
1165
|
# label columns is the desired type for output
|
1163
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1166
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1164
1167
|
# rename the output columns
|
1165
1168
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1166
1169
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class DBSCAN(BaseTransformer):
|
61
64
|
r"""Perform DBSCAN clustering from vector array or distance matrix
|
62
65
|
For more details on this class, see [sklearn.cluster.DBSCAN]
|
@@ -458,7 +461,7 @@ class DBSCAN(BaseTransformer):
|
|
458
461
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
459
462
|
expected_dtype = "array"
|
460
463
|
else:
|
461
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
464
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
462
465
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
463
466
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
464
467
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1115,7 +1118,7 @@ class DBSCAN(BaseTransformer):
|
|
1115
1118
|
|
1116
1119
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1117
1120
|
|
1118
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1121
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1119
1122
|
outputs: List[BaseFeatureSpec] = []
|
1120
1123
|
if hasattr(self, "predict"):
|
1121
1124
|
# keep mypy happy
|
@@ -1123,7 +1126,7 @@ class DBSCAN(BaseTransformer):
|
|
1123
1126
|
# For classifier, the type of predict is the same as the type of label
|
1124
1127
|
if self._sklearn_object._estimator_type == "classifier":
|
1125
1128
|
# label columns is the desired type for output
|
1126
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1129
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1127
1130
|
# rename the output columns
|
1128
1131
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1129
1132
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class FeatureAgglomeration(BaseTransformer):
|
61
64
|
r"""Agglomerate features
|
62
65
|
For more details on this class, see [sklearn.cluster.FeatureAgglomeration]
|
@@ -477,7 +480,7 @@ class FeatureAgglomeration(BaseTransformer):
|
|
477
480
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
478
481
|
expected_dtype = "array"
|
479
482
|
else:
|
480
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
483
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
481
484
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
482
485
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
483
486
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1136,7 +1139,7 @@ class FeatureAgglomeration(BaseTransformer):
|
|
1136
1139
|
|
1137
1140
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1138
1141
|
|
1139
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1142
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1140
1143
|
outputs: List[BaseFeatureSpec] = []
|
1141
1144
|
if hasattr(self, "predict"):
|
1142
1145
|
# keep mypy happy
|
@@ -1144,7 +1147,7 @@ class FeatureAgglomeration(BaseTransformer):
|
|
1144
1147
|
# For classifier, the type of predict is the same as the type of label
|
1145
1148
|
if self._sklearn_object._estimator_type == "classifier":
|
1146
1149
|
# label columns is the desired type for output
|
1147
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1150
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1148
1151
|
# rename the output columns
|
1149
1152
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1150
1153
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class KMeans(BaseTransformer):
|
61
64
|
r"""K-Means clustering
|
62
65
|
For more details on this class, see [sklearn.cluster.KMeans]
|
@@ -481,7 +484,7 @@ class KMeans(BaseTransformer):
|
|
481
484
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
482
485
|
expected_dtype = "array"
|
483
486
|
else:
|
484
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
487
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
485
488
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
486
489
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
487
490
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1142,7 +1145,7 @@ class KMeans(BaseTransformer):
|
|
1142
1145
|
|
1143
1146
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1144
1147
|
|
1145
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1148
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1146
1149
|
outputs: List[BaseFeatureSpec] = []
|
1147
1150
|
if hasattr(self, "predict"):
|
1148
1151
|
# keep mypy happy
|
@@ -1150,7 +1153,7 @@ class KMeans(BaseTransformer):
|
|
1150
1153
|
# For classifier, the type of predict is the same as the type of label
|
1151
1154
|
if self._sklearn_object._estimator_type == "classifier":
|
1152
1155
|
# label columns is the desired type for output
|
1153
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1156
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1154
1157
|
# rename the output columns
|
1155
1158
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1156
1159
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class MeanShift(BaseTransformer):
|
61
64
|
r"""Mean shift clustering using a flat kernel
|
62
65
|
For more details on this class, see [sklearn.cluster.MeanShift]
|
@@ -460,7 +463,7 @@ class MeanShift(BaseTransformer):
|
|
460
463
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
461
464
|
expected_dtype = "array"
|
462
465
|
else:
|
463
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
466
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
464
467
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
465
468
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
466
469
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1117,7 +1120,7 @@ class MeanShift(BaseTransformer):
|
|
1117
1120
|
|
1118
1121
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1119
1122
|
|
1120
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1123
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1121
1124
|
outputs: List[BaseFeatureSpec] = []
|
1122
1125
|
if hasattr(self, "predict"):
|
1123
1126
|
# keep mypy happy
|
@@ -1125,7 +1128,7 @@ class MeanShift(BaseTransformer):
|
|
1125
1128
|
# For classifier, the type of predict is the same as the type of label
|
1126
1129
|
if self._sklearn_object._estimator_type == "classifier":
|
1127
1130
|
# label columns is the desired type for output
|
1128
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1131
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1129
1132
|
# rename the output columns
|
1130
1133
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1131
1134
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class MiniBatchKMeans(BaseTransformer):
|
61
64
|
r"""Mini-Batch K-Means clustering
|
62
65
|
For more details on this class, see [sklearn.cluster.MiniBatchKMeans]
|
@@ -512,7 +515,7 @@ class MiniBatchKMeans(BaseTransformer):
|
|
512
515
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
513
516
|
expected_dtype = "array"
|
514
517
|
else:
|
515
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
518
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
516
519
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
517
520
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
518
521
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1173,7 +1176,7 @@ class MiniBatchKMeans(BaseTransformer):
|
|
1173
1176
|
|
1174
1177
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1175
1178
|
|
1176
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1179
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1177
1180
|
outputs: List[BaseFeatureSpec] = []
|
1178
1181
|
if hasattr(self, "predict"):
|
1179
1182
|
# keep mypy happy
|
@@ -1181,7 +1184,7 @@ class MiniBatchKMeans(BaseTransformer):
|
|
1181
1184
|
# For classifier, the type of predict is the same as the type of label
|
1182
1185
|
if self._sklearn_object._estimator_type == "classifier":
|
1183
1186
|
# label columns is the desired type for output
|
1184
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1187
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1185
1188
|
# rename the output columns
|
1186
1189
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1187
1190
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class OPTICS(BaseTransformer):
|
61
64
|
r"""Estimate clustering structure from vector array
|
62
65
|
For more details on this class, see [sklearn.cluster.OPTICS]
|
@@ -528,7 +531,7 @@ class OPTICS(BaseTransformer):
|
|
528
531
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
529
532
|
expected_dtype = "array"
|
530
533
|
else:
|
531
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
534
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
532
535
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
533
536
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
534
537
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1185,7 +1188,7 @@ class OPTICS(BaseTransformer):
|
|
1185
1188
|
|
1186
1189
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1187
1190
|
|
1188
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1191
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1189
1192
|
outputs: List[BaseFeatureSpec] = []
|
1190
1193
|
if hasattr(self, "predict"):
|
1191
1194
|
# keep mypy happy
|
@@ -1193,7 +1196,7 @@ class OPTICS(BaseTransformer):
|
|
1193
1196
|
# For classifier, the type of predict is the same as the type of label
|
1194
1197
|
if self._sklearn_object._estimator_type == "classifier":
|
1195
1198
|
# label columns is the desired type for output
|
1196
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1199
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1197
1200
|
# rename the output columns
|
1198
1201
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1199
1202
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class SpectralBiclustering(BaseTransformer):
|
61
64
|
r"""Spectral biclustering (Kluger, 2003)
|
62
65
|
For more details on this class, see [sklearn.cluster.SpectralBiclustering]
|
@@ -466,7 +469,7 @@ class SpectralBiclustering(BaseTransformer):
|
|
466
469
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
467
470
|
expected_dtype = "array"
|
468
471
|
else:
|
469
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
472
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
470
473
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
471
474
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
472
475
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1121,7 +1124,7 @@ class SpectralBiclustering(BaseTransformer):
|
|
1121
1124
|
|
1122
1125
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1123
1126
|
|
1124
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1127
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1125
1128
|
outputs: List[BaseFeatureSpec] = []
|
1126
1129
|
if hasattr(self, "predict"):
|
1127
1130
|
# keep mypy happy
|
@@ -1129,7 +1132,7 @@ class SpectralBiclustering(BaseTransformer):
|
|
1129
1132
|
# For classifier, the type of predict is the same as the type of label
|
1130
1133
|
if self._sklearn_object._estimator_type == "classifier":
|
1131
1134
|
# label columns is the desired type for output
|
1132
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1135
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1133
1136
|
# rename the output columns
|
1134
1137
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1135
1138
|
self._model_signature_dict["predict"] = ModelSignature(
|
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
|
|
37
37
|
FeatureSpec,
|
38
38
|
ModelSignature,
|
39
39
|
_infer_signature,
|
40
|
+
_truncate_data,
|
40
41
|
_rename_signature_with_snowflake_identifiers,
|
41
42
|
)
|
42
43
|
|
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
|
|
57
58
|
|
58
59
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
59
60
|
|
61
|
+
INFER_SIGNATURE_MAX_ROWS = 100
|
62
|
+
|
60
63
|
class SpectralClustering(BaseTransformer):
|
61
64
|
r"""Apply clustering to a projection of the normalized Laplacian
|
62
65
|
For more details on this class, see [sklearn.cluster.SpectralClustering]
|
@@ -525,7 +528,7 @@ class SpectralClustering(BaseTransformer):
|
|
525
528
|
elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
|
526
529
|
expected_dtype = "array"
|
527
530
|
else:
|
528
|
-
output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
|
531
|
+
output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
|
529
532
|
# We can only infer the output types from the input types if the following two statemetns are true:
|
530
533
|
# 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
|
531
534
|
# 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
|
@@ -1182,7 +1185,7 @@ class SpectralClustering(BaseTransformer):
|
|
1182
1185
|
|
1183
1186
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1184
1187
|
|
1185
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1188
|
+
inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
|
1186
1189
|
outputs: List[BaseFeatureSpec] = []
|
1187
1190
|
if hasattr(self, "predict"):
|
1188
1191
|
# keep mypy happy
|
@@ -1190,7 +1193,7 @@ class SpectralClustering(BaseTransformer):
|
|
1190
1193
|
# For classifier, the type of predict is the same as the type of label
|
1191
1194
|
if self._sklearn_object._estimator_type == "classifier":
|
1192
1195
|
# label columns is the desired type for output
|
1193
|
-
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1196
|
+
outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
|
1194
1197
|
# rename the output columns
|
1195
1198
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1196
1199
|
self._model_signature_dict["predict"] = ModelSignature(
|