snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +77 -32
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/dataset/__init__.py +10 -0
- snowflake/ml/dataset/dataset.py +454 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +202 -0
- snowflake/ml/feature_store/feature_store.py +531 -332
- snowflake/ml/feature_store/feature_view.py +40 -23
- snowflake/ml/fileset/embedded_stage_fs.py +146 -0
- snowflake/ml/fileset/sfcfs.py +56 -54
- snowflake/ml/fileset/snowfs.py +159 -0
- snowflake/ml/fileset/stage_fs.py +49 -17
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +27 -0
- snowflake/ml/model/_client/model/model_version_impl.py +137 -50
- snowflake/ml/model/_client/ops/model_ops.py +159 -40
- snowflake/ml/model/_client/sql/model.py +25 -2
- snowflake/ml/model/_client/sql/model_version.py +131 -2
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -5
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
- snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
- snowflake/ml/modeling/cluster/birch.py +248 -175
- snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
- snowflake/ml/modeling/cluster/dbscan.py +246 -175
- snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
- snowflake/ml/modeling/cluster/k_means.py +248 -175
- snowflake/ml/modeling/cluster/mean_shift.py +246 -175
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
- snowflake/ml/modeling/cluster/optics.py +246 -175
- snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
- snowflake/ml/modeling/compose/column_transformer.py +248 -175
- snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
- snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
- snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
- snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
- snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
- snowflake/ml/modeling/covariance/oas.py +246 -175
- snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
- snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
- snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
- snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
- snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/pca.py +248 -175
- snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
- snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
- snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
- snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
- snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
- snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +72 -37
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
- snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
- snowflake/ml/modeling/impute/knn_imputer.py +248 -175
- snowflake/ml/modeling/impute/missing_indicator.py +248 -175
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/lars.py +246 -175
- snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
- snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/perceptron.py +246 -175
- snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ridge.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
- snowflake/ml/modeling/manifold/isomap.py +248 -175
- snowflake/ml/modeling/manifold/mds.py +248 -175
- snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
- snowflake/ml/modeling/manifold/tsne.py +248 -175
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
- snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
- snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
- snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
- snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
- snowflake/ml/modeling/pipeline/pipeline.py +517 -35
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
- snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
- snowflake/ml/modeling/svm/linear_svc.py +246 -175
- snowflake/ml/modeling/svm/linear_svr.py +246 -175
- snowflake/ml/modeling/svm/nu_svc.py +246 -175
- snowflake/ml/modeling/svm/nu_svr.py +246 -175
- snowflake/ml/modeling/svm/svc.py +246 -175
- snowflake/ml/modeling/svm/svr.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
- snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -61,12 +60,6 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
|
|
61
60
|
|
62
61
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
63
62
|
|
64
|
-
def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
|
65
|
-
def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
|
66
|
-
return False and callable(getattr(self._sklearn_object, "fit_transform", None))
|
67
|
-
return check
|
68
|
-
|
69
|
-
|
70
63
|
class SpectralClustering(BaseTransformer):
|
71
64
|
r"""Apply clustering to a projection of the normalized Laplacian
|
72
65
|
For more details on this class, see [sklearn.cluster.SpectralClustering]
|
@@ -319,12 +312,7 @@ class SpectralClustering(BaseTransformer):
|
|
319
312
|
)
|
320
313
|
return selected_cols
|
321
314
|
|
322
|
-
|
323
|
-
project=_PROJECT,
|
324
|
-
subproject=_SUBPROJECT,
|
325
|
-
custom_tags=dict([("autogen", True)]),
|
326
|
-
)
|
327
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "SpectralClustering":
|
315
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "SpectralClustering":
|
328
316
|
"""Perform spectral clustering from features, or affinity matrix
|
329
317
|
For more details on this function, see [sklearn.cluster.SpectralClustering.fit]
|
330
318
|
(https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering.fit)
|
@@ -351,12 +339,14 @@ class SpectralClustering(BaseTransformer):
|
|
351
339
|
|
352
340
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
353
341
|
|
354
|
-
|
342
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
355
343
|
if SNOWML_SPROC_ENV in os.environ:
|
356
344
|
statement_params = telemetry.get_function_usage_statement_params(
|
357
345
|
project=_PROJECT,
|
358
346
|
subproject=_SUBPROJECT,
|
359
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
347
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
348
|
+
inspect.currentframe(), SpectralClustering.__class__.__name__
|
349
|
+
),
|
360
350
|
api_calls=[Session.call],
|
361
351
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
362
352
|
)
|
@@ -377,27 +367,24 @@ class SpectralClustering(BaseTransformer):
|
|
377
367
|
)
|
378
368
|
self._sklearn_object = model_trainer.train()
|
379
369
|
self._is_fitted = True
|
380
|
-
self.
|
370
|
+
self._generate_model_signatures(dataset)
|
381
371
|
return self
|
382
372
|
|
383
373
|
def _batch_inference_validate_snowpark(
|
384
374
|
self,
|
385
375
|
dataset: DataFrame,
|
386
376
|
inference_method: str,
|
387
|
-
) ->
|
388
|
-
"""Util method to run validate that batch inference can be run on a snowpark dataframe
|
389
|
-
return the available package that exists in the snowflake anaconda channel
|
377
|
+
) -> None:
|
378
|
+
"""Util method to run validate that batch inference can be run on a snowpark dataframe.
|
390
379
|
|
391
380
|
Args:
|
392
381
|
dataset: snowpark dataframe
|
393
382
|
inference_method: the inference method such as predict, score...
|
394
|
-
|
383
|
+
|
395
384
|
Raises:
|
396
385
|
SnowflakeMLException: If the estimator is not fitted, raise error
|
397
386
|
SnowflakeMLException: If the session is None, raise error
|
398
387
|
|
399
|
-
Returns:
|
400
|
-
A list of available package that exists in the snowflake anaconda channel
|
401
388
|
"""
|
402
389
|
if not self._is_fitted:
|
403
390
|
raise exceptions.SnowflakeMLException(
|
@@ -415,9 +402,7 @@ class SpectralClustering(BaseTransformer):
|
|
415
402
|
"Session must not specified for snowpark dataset."
|
416
403
|
),
|
417
404
|
)
|
418
|
-
|
419
|
-
return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
|
420
|
-
pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
|
405
|
+
|
421
406
|
|
422
407
|
@available_if(original_estimator_has_callable("predict")) # type: ignore[misc]
|
423
408
|
@telemetry.send_api_usage_telemetry(
|
@@ -451,7 +436,9 @@ class SpectralClustering(BaseTransformer):
|
|
451
436
|
# when it is classifier, infer the datatype from label columns
|
452
437
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
453
438
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
454
|
-
label_cols_signatures = [
|
439
|
+
label_cols_signatures = [
|
440
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
441
|
+
]
|
455
442
|
if len(label_cols_signatures) == 0:
|
456
443
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
457
444
|
raise exceptions.SnowflakeMLException(
|
@@ -459,25 +446,23 @@ class SpectralClustering(BaseTransformer):
|
|
459
446
|
original_exception=ValueError(error_str),
|
460
447
|
)
|
461
448
|
|
462
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
463
|
-
label_cols_signatures[0].as_snowpark_type()
|
464
|
-
)
|
449
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
465
450
|
|
466
|
-
self.
|
467
|
-
|
451
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
452
|
+
self._deps = self._get_dependencies()
|
453
|
+
assert isinstance(
|
454
|
+
dataset._session, Session
|
455
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
468
456
|
|
469
457
|
transform_kwargs = dict(
|
470
|
-
session
|
471
|
-
dependencies
|
472
|
-
drop_input_cols
|
473
|
-
expected_output_cols_type
|
458
|
+
session=dataset._session,
|
459
|
+
dependencies=self._deps,
|
460
|
+
drop_input_cols=self._drop_input_cols,
|
461
|
+
expected_output_cols_type=expected_type_inferred,
|
474
462
|
)
|
475
463
|
|
476
464
|
elif isinstance(dataset, pd.DataFrame):
|
477
|
-
transform_kwargs = dict(
|
478
|
-
snowpark_input_cols = self._snowpark_cols,
|
479
|
-
drop_input_cols = self._drop_input_cols
|
480
|
-
)
|
465
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
481
466
|
|
482
467
|
transform_handlers = ModelTransformerBuilder.build(
|
483
468
|
dataset=dataset,
|
@@ -517,7 +502,7 @@ class SpectralClustering(BaseTransformer):
|
|
517
502
|
Transformed dataset.
|
518
503
|
"""
|
519
504
|
super()._check_dataset_type(dataset)
|
520
|
-
inference_method="transform"
|
505
|
+
inference_method = "transform"
|
521
506
|
|
522
507
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
523
508
|
# are specific to the type of dataset used.
|
@@ -547,24 +532,19 @@ class SpectralClustering(BaseTransformer):
|
|
547
532
|
if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
|
548
533
|
expected_dtype = convert_sp_to_sf_type(output_types[0])
|
549
534
|
|
550
|
-
self.
|
551
|
-
|
552
|
-
inference_method=inference_method,
|
553
|
-
)
|
535
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
536
|
+
self._deps = self._get_dependencies()
|
554
537
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
555
538
|
|
556
539
|
transform_kwargs = dict(
|
557
|
-
session
|
558
|
-
dependencies
|
559
|
-
drop_input_cols
|
560
|
-
expected_output_cols_type
|
540
|
+
session=dataset._session,
|
541
|
+
dependencies=self._deps,
|
542
|
+
drop_input_cols=self._drop_input_cols,
|
543
|
+
expected_output_cols_type=expected_dtype,
|
561
544
|
)
|
562
545
|
|
563
546
|
elif isinstance(dataset, pd.DataFrame):
|
564
|
-
transform_kwargs = dict(
|
565
|
-
snowpark_input_cols = self._snowpark_cols,
|
566
|
-
drop_input_cols = self._drop_input_cols
|
567
|
-
)
|
547
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
568
548
|
|
569
549
|
transform_handlers = ModelTransformerBuilder.build(
|
570
550
|
dataset=dataset,
|
@@ -583,7 +563,11 @@ class SpectralClustering(BaseTransformer):
|
|
583
563
|
return output_df
|
584
564
|
|
585
565
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
586
|
-
def fit_predict(
|
566
|
+
def fit_predict(
|
567
|
+
self,
|
568
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
569
|
+
output_cols_prefix: str = "fit_predict_",
|
570
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
587
571
|
""" Perform spectral clustering on `X` and return cluster labels
|
588
572
|
For more details on this function, see [sklearn.cluster.SpectralClustering.fit_predict]
|
589
573
|
(https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering.fit_predict)
|
@@ -610,22 +594,104 @@ class SpectralClustering(BaseTransformer):
|
|
610
594
|
)
|
611
595
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
612
596
|
drop_input_cols=self._drop_input_cols,
|
613
|
-
expected_output_cols_list=
|
597
|
+
expected_output_cols_list=(
|
598
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
599
|
+
),
|
614
600
|
)
|
615
601
|
self._sklearn_object = fitted_estimator
|
616
602
|
self._is_fitted = True
|
617
603
|
return output_result
|
618
604
|
|
605
|
+
|
606
|
+
@available_if(original_estimator_has_callable("fit_transform")) # type: ignore[misc]
|
607
|
+
def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_transform_",) -> Union[DataFrame, pd.DataFrame]:
|
608
|
+
""" Method not supported for this class.
|
609
|
+
|
610
|
+
|
611
|
+
Raises:
|
612
|
+
TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame.
|
619
613
|
|
620
|
-
|
621
|
-
|
622
|
-
|
614
|
+
Args:
|
615
|
+
dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
|
616
|
+
Snowpark or Pandas DataFrame.
|
617
|
+
output_cols_prefix: Prefix for the response columns
|
623
618
|
Returns:
|
624
619
|
Transformed dataset.
|
625
620
|
"""
|
626
|
-
self.
|
627
|
-
|
628
|
-
|
621
|
+
self._infer_input_output_cols(dataset)
|
622
|
+
super()._check_dataset_type(dataset)
|
623
|
+
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
624
|
+
estimator=self._sklearn_object,
|
625
|
+
dataset=dataset,
|
626
|
+
input_cols=self.input_cols,
|
627
|
+
label_cols=self.label_cols,
|
628
|
+
sample_weight_col=self.sample_weight_col,
|
629
|
+
autogenerated=self._autogenerated,
|
630
|
+
subproject=_SUBPROJECT,
|
631
|
+
)
|
632
|
+
output_result, fitted_estimator = model_trainer.train_fit_transform(
|
633
|
+
drop_input_cols=self._drop_input_cols,
|
634
|
+
expected_output_cols_list=self.output_cols,
|
635
|
+
)
|
636
|
+
self._sklearn_object = fitted_estimator
|
637
|
+
self._is_fitted = True
|
638
|
+
return output_result
|
639
|
+
|
640
|
+
|
641
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
642
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
643
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
644
|
+
"""
|
645
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
646
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
647
|
+
if output_cols:
|
648
|
+
output_cols = [
|
649
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
650
|
+
for c in output_cols
|
651
|
+
]
|
652
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
653
|
+
output_cols = [output_cols_prefix]
|
654
|
+
elif self._sklearn_object is not None:
|
655
|
+
classes = self._sklearn_object.classes_
|
656
|
+
if isinstance(classes, numpy.ndarray):
|
657
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
658
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
659
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
660
|
+
output_cols = []
|
661
|
+
for i, cl in enumerate(classes):
|
662
|
+
# For binary classification, there is only one output column for each class
|
663
|
+
# ndarray as the two classes are complementary.
|
664
|
+
if len(cl) == 2:
|
665
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
666
|
+
else:
|
667
|
+
output_cols.extend([
|
668
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
669
|
+
])
|
670
|
+
else:
|
671
|
+
output_cols = []
|
672
|
+
|
673
|
+
# Make sure column names are valid snowflake identifiers.
|
674
|
+
assert output_cols is not None # Make MyPy happy
|
675
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
676
|
+
|
677
|
+
return rv
|
678
|
+
|
679
|
+
def _align_expected_output_names(
|
680
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
681
|
+
) -> List[str]:
|
682
|
+
# in case the inferred output column names dimension is different
|
683
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
684
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
685
|
+
output_df_columns = list(output_df_pd.columns)
|
686
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
687
|
+
if self.sample_weight_col:
|
688
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
689
|
+
# if the dimension of inferred output column names is correct; use it
|
690
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
691
|
+
return expected_output_cols_list
|
692
|
+
# otherwise, use the sklearn estimator's output
|
693
|
+
else:
|
694
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
629
695
|
|
630
696
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
631
697
|
@telemetry.send_api_usage_telemetry(
|
@@ -657,24 +723,26 @@ class SpectralClustering(BaseTransformer):
|
|
657
723
|
# are specific to the type of dataset used.
|
658
724
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
659
725
|
|
726
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
727
|
+
|
660
728
|
if isinstance(dataset, DataFrame):
|
661
|
-
self.
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
729
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
730
|
+
self._deps = self._get_dependencies()
|
731
|
+
assert isinstance(
|
732
|
+
dataset._session, Session
|
733
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
666
734
|
transform_kwargs = dict(
|
667
735
|
session=dataset._session,
|
668
736
|
dependencies=self._deps,
|
669
|
-
drop_input_cols
|
737
|
+
drop_input_cols=self._drop_input_cols,
|
670
738
|
expected_output_cols_type="float",
|
671
739
|
)
|
740
|
+
expected_output_cols = self._align_expected_output_names(
|
741
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
742
|
+
)
|
672
743
|
|
673
744
|
elif isinstance(dataset, pd.DataFrame):
|
674
|
-
transform_kwargs = dict(
|
675
|
-
snowpark_input_cols = self._snowpark_cols,
|
676
|
-
drop_input_cols = self._drop_input_cols
|
677
|
-
)
|
745
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
678
746
|
|
679
747
|
transform_handlers = ModelTransformerBuilder.build(
|
680
748
|
dataset=dataset,
|
@@ -686,7 +754,7 @@ class SpectralClustering(BaseTransformer):
|
|
686
754
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
687
755
|
inference_method=inference_method,
|
688
756
|
input_cols=self.input_cols,
|
689
|
-
expected_output_cols=
|
757
|
+
expected_output_cols=expected_output_cols,
|
690
758
|
**transform_kwargs
|
691
759
|
)
|
692
760
|
return output_df
|
@@ -716,29 +784,30 @@ class SpectralClustering(BaseTransformer):
|
|
716
784
|
Output dataset with log probability of the sample for each class in the model.
|
717
785
|
"""
|
718
786
|
super()._check_dataset_type(dataset)
|
719
|
-
inference_method="predict_log_proba"
|
787
|
+
inference_method = "predict_log_proba"
|
788
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
720
789
|
|
721
790
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
722
791
|
# are specific to the type of dataset used.
|
723
792
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
724
793
|
|
725
794
|
if isinstance(dataset, DataFrame):
|
726
|
-
self.
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
795
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
796
|
+
self._deps = self._get_dependencies()
|
797
|
+
assert isinstance(
|
798
|
+
dataset._session, Session
|
799
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
731
800
|
transform_kwargs = dict(
|
732
801
|
session=dataset._session,
|
733
802
|
dependencies=self._deps,
|
734
|
-
drop_input_cols
|
803
|
+
drop_input_cols=self._drop_input_cols,
|
735
804
|
expected_output_cols_type="float",
|
736
805
|
)
|
806
|
+
expected_output_cols = self._align_expected_output_names(
|
807
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
808
|
+
)
|
737
809
|
elif isinstance(dataset, pd.DataFrame):
|
738
|
-
transform_kwargs = dict(
|
739
|
-
snowpark_input_cols = self._snowpark_cols,
|
740
|
-
drop_input_cols = self._drop_input_cols
|
741
|
-
)
|
810
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
742
811
|
|
743
812
|
transform_handlers = ModelTransformerBuilder.build(
|
744
813
|
dataset=dataset,
|
@@ -751,7 +820,7 @@ class SpectralClustering(BaseTransformer):
|
|
751
820
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
752
821
|
inference_method=inference_method,
|
753
822
|
input_cols=self.input_cols,
|
754
|
-
expected_output_cols=
|
823
|
+
expected_output_cols=expected_output_cols,
|
755
824
|
**transform_kwargs
|
756
825
|
)
|
757
826
|
return output_df
|
@@ -777,30 +846,32 @@ class SpectralClustering(BaseTransformer):
|
|
777
846
|
Output dataset with results of the decision function for the samples in input dataset.
|
778
847
|
"""
|
779
848
|
super()._check_dataset_type(dataset)
|
780
|
-
inference_method="decision_function"
|
849
|
+
inference_method = "decision_function"
|
781
850
|
|
782
851
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
783
852
|
# are specific to the type of dataset used.
|
784
853
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
785
854
|
|
855
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
856
|
+
|
786
857
|
if isinstance(dataset, DataFrame):
|
787
|
-
self.
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
|
858
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
859
|
+
self._deps = self._get_dependencies()
|
860
|
+
assert isinstance(
|
861
|
+
dataset._session, Session
|
862
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
792
863
|
transform_kwargs = dict(
|
793
864
|
session=dataset._session,
|
794
865
|
dependencies=self._deps,
|
795
|
-
drop_input_cols
|
866
|
+
drop_input_cols=self._drop_input_cols,
|
796
867
|
expected_output_cols_type="float",
|
797
868
|
)
|
869
|
+
expected_output_cols = self._align_expected_output_names(
|
870
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
871
|
+
)
|
798
872
|
|
799
873
|
elif isinstance(dataset, pd.DataFrame):
|
800
|
-
transform_kwargs = dict(
|
801
|
-
snowpark_input_cols = self._snowpark_cols,
|
802
|
-
drop_input_cols = self._drop_input_cols
|
803
|
-
)
|
874
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
804
875
|
|
805
876
|
transform_handlers = ModelTransformerBuilder.build(
|
806
877
|
dataset=dataset,
|
@@ -813,7 +884,7 @@ class SpectralClustering(BaseTransformer):
|
|
813
884
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
814
885
|
inference_method=inference_method,
|
815
886
|
input_cols=self.input_cols,
|
816
|
-
expected_output_cols=
|
887
|
+
expected_output_cols=expected_output_cols,
|
817
888
|
**transform_kwargs
|
818
889
|
)
|
819
890
|
return output_df
|
@@ -842,17 +913,17 @@ class SpectralClustering(BaseTransformer):
|
|
842
913
|
Output dataset with probability of the sample for each class in the model.
|
843
914
|
"""
|
844
915
|
super()._check_dataset_type(dataset)
|
845
|
-
inference_method="score_samples"
|
916
|
+
inference_method = "score_samples"
|
846
917
|
|
847
918
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
848
919
|
# are specific to the type of dataset used.
|
849
920
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
850
921
|
|
922
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
923
|
+
|
851
924
|
if isinstance(dataset, DataFrame):
|
852
|
-
self.
|
853
|
-
|
854
|
-
inference_method=inference_method,
|
855
|
-
)
|
925
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
926
|
+
self._deps = self._get_dependencies()
|
856
927
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
857
928
|
transform_kwargs = dict(
|
858
929
|
session=dataset._session,
|
@@ -860,6 +931,9 @@ class SpectralClustering(BaseTransformer):
|
|
860
931
|
drop_input_cols = self._drop_input_cols,
|
861
932
|
expected_output_cols_type="float",
|
862
933
|
)
|
934
|
+
expected_output_cols = self._align_expected_output_names(
|
935
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
936
|
+
)
|
863
937
|
|
864
938
|
elif isinstance(dataset, pd.DataFrame):
|
865
939
|
transform_kwargs = dict(
|
@@ -878,7 +952,7 @@ class SpectralClustering(BaseTransformer):
|
|
878
952
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
879
953
|
inference_method=inference_method,
|
880
954
|
input_cols=self.input_cols,
|
881
|
-
expected_output_cols=
|
955
|
+
expected_output_cols=expected_output_cols,
|
882
956
|
**transform_kwargs
|
883
957
|
)
|
884
958
|
return output_df
|
@@ -911,17 +985,15 @@ class SpectralClustering(BaseTransformer):
|
|
911
985
|
transform_kwargs: ScoreKwargsTypedDict = dict()
|
912
986
|
|
913
987
|
if isinstance(dataset, DataFrame):
|
914
|
-
self.
|
915
|
-
|
916
|
-
inference_method="score",
|
917
|
-
)
|
988
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method="score")
|
989
|
+
self._deps = self._get_dependencies()
|
918
990
|
selected_cols = self._get_active_columns()
|
919
991
|
if len(selected_cols) > 0:
|
920
992
|
dataset = dataset.select(selected_cols)
|
921
993
|
assert isinstance(dataset._session, Session) # keep mypy happy
|
922
994
|
transform_kwargs = dict(
|
923
995
|
session=dataset._session,
|
924
|
-
dependencies=
|
996
|
+
dependencies=self._deps,
|
925
997
|
score_sproc_imports=['sklearn'],
|
926
998
|
)
|
927
999
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -986,11 +1058,8 @@ class SpectralClustering(BaseTransformer):
|
|
986
1058
|
|
987
1059
|
if isinstance(dataset, DataFrame):
|
988
1060
|
|
989
|
-
self.
|
990
|
-
|
991
|
-
inference_method=inference_method,
|
992
|
-
|
993
|
-
)
|
1061
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
1062
|
+
self._deps = self._get_dependencies()
|
994
1063
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
995
1064
|
transform_kwargs = dict(
|
996
1065
|
session = dataset._session,
|
@@ -1023,50 +1092,84 @@ class SpectralClustering(BaseTransformer):
|
|
1023
1092
|
)
|
1024
1093
|
return output_df
|
1025
1094
|
|
1095
|
+
|
1096
|
+
|
1097
|
+
def to_sklearn(self) -> Any:
|
1098
|
+
"""Get sklearn.cluster.SpectralClustering object.
|
1099
|
+
"""
|
1100
|
+
if self._sklearn_object is None:
|
1101
|
+
self._sklearn_object = self._create_sklearn_object()
|
1102
|
+
return self._sklearn_object
|
1103
|
+
|
1104
|
+
def to_xgboost(self) -> Any:
|
1105
|
+
raise exceptions.SnowflakeMLException(
|
1106
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1107
|
+
original_exception=AttributeError(
|
1108
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1109
|
+
"to_xgboost()",
|
1110
|
+
"to_sklearn()"
|
1111
|
+
)
|
1112
|
+
),
|
1113
|
+
)
|
1026
1114
|
|
1027
|
-
def
|
1115
|
+
def to_lightgbm(self) -> Any:
|
1116
|
+
raise exceptions.SnowflakeMLException(
|
1117
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1118
|
+
original_exception=AttributeError(
|
1119
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1120
|
+
"to_lightgbm()",
|
1121
|
+
"to_sklearn()"
|
1122
|
+
)
|
1123
|
+
),
|
1124
|
+
)
|
1125
|
+
|
1126
|
+
def _get_dependencies(self) -> List[str]:
|
1127
|
+
return self._deps
|
1128
|
+
|
1129
|
+
|
1130
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
1028
1131
|
self._model_signature_dict = dict()
|
1029
1132
|
|
1030
1133
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1031
1134
|
|
1032
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1135
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1033
1136
|
outputs: List[BaseFeatureSpec] = []
|
1034
1137
|
if hasattr(self, "predict"):
|
1035
1138
|
# keep mypy happy
|
1036
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1139
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1037
1140
|
# For classifier, the type of predict is the same as the type of label
|
1038
|
-
if self._sklearn_object._estimator_type ==
|
1039
|
-
|
1141
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1142
|
+
# label columns is the desired type for output
|
1040
1143
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1041
1144
|
# rename the output columns
|
1042
1145
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1043
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1044
|
-
|
1045
|
-
|
1146
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1147
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1148
|
+
)
|
1046
1149
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1047
1150
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1048
|
-
# Clusterer returns int64 cluster labels.
|
1151
|
+
# Clusterer returns int64 cluster labels.
|
1049
1152
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1050
1153
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1051
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1154
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1155
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1156
|
+
)
|
1157
|
+
|
1055
1158
|
# For regressor, the type of predict is float64
|
1056
|
-
elif self._sklearn_object._estimator_type ==
|
1159
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1057
1160
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1058
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1059
|
-
|
1060
|
-
|
1061
|
-
|
1161
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1162
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1163
|
+
)
|
1164
|
+
|
1062
1165
|
for prob_func in PROB_FUNCTIONS:
|
1063
1166
|
if hasattr(self, prob_func):
|
1064
1167
|
output_cols_prefix: str = f"{prob_func}_"
|
1065
1168
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1066
1169
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1067
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1068
|
-
|
1069
|
-
|
1170
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1171
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1172
|
+
)
|
1070
1173
|
|
1071
1174
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1072
1175
|
items = list(self._model_signature_dict.items())
|
@@ -1079,10 +1182,10 @@ class SpectralClustering(BaseTransformer):
|
|
1079
1182
|
"""Returns model signature of current class.
|
1080
1183
|
|
1081
1184
|
Raises:
|
1082
|
-
|
1185
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1083
1186
|
|
1084
1187
|
Returns:
|
1085
|
-
Dict
|
1188
|
+
Dict with each method and its input output signature
|
1086
1189
|
"""
|
1087
1190
|
if self._model_signature_dict is None:
|
1088
1191
|
raise exceptions.SnowflakeMLException(
|
@@ -1090,35 +1193,3 @@ class SpectralClustering(BaseTransformer):
|
|
1090
1193
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1091
1194
|
)
|
1092
1195
|
return self._model_signature_dict
|
1093
|
-
|
1094
|
-
def to_sklearn(self) -> Any:
|
1095
|
-
"""Get sklearn.cluster.SpectralClustering object.
|
1096
|
-
"""
|
1097
|
-
if self._sklearn_object is None:
|
1098
|
-
self._sklearn_object = self._create_sklearn_object()
|
1099
|
-
return self._sklearn_object
|
1100
|
-
|
1101
|
-
def to_xgboost(self) -> Any:
|
1102
|
-
raise exceptions.SnowflakeMLException(
|
1103
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1104
|
-
original_exception=AttributeError(
|
1105
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1106
|
-
"to_xgboost()",
|
1107
|
-
"to_sklearn()"
|
1108
|
-
)
|
1109
|
-
),
|
1110
|
-
)
|
1111
|
-
|
1112
|
-
def to_lightgbm(self) -> Any:
|
1113
|
-
raise exceptions.SnowflakeMLException(
|
1114
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1115
|
-
original_exception=AttributeError(
|
1116
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1117
|
-
"to_lightgbm()",
|
1118
|
-
"to_sklearn()"
|
1119
|
-
)
|
1120
|
-
),
|
1121
|
-
)
|
1122
|
-
|
1123
|
-
def _get_dependencies(self) -> List[str]:
|
1124
|
-
return self._deps
|