snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +77 -32
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/dataset/__init__.py +10 -0
- snowflake/ml/dataset/dataset.py +454 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +202 -0
- snowflake/ml/feature_store/feature_store.py +531 -332
- snowflake/ml/feature_store/feature_view.py +40 -23
- snowflake/ml/fileset/embedded_stage_fs.py +146 -0
- snowflake/ml/fileset/sfcfs.py +56 -54
- snowflake/ml/fileset/snowfs.py +159 -0
- snowflake/ml/fileset/stage_fs.py +49 -17
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +27 -0
- snowflake/ml/model/_client/model/model_version_impl.py +137 -50
- snowflake/ml/model/_client/ops/model_ops.py +159 -40
- snowflake/ml/model/_client/sql/model.py +25 -2
- snowflake/ml/model/_client/sql/model_version.py +131 -2
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -5
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
- snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
- snowflake/ml/modeling/cluster/birch.py +248 -175
- snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
- snowflake/ml/modeling/cluster/dbscan.py +246 -175
- snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
- snowflake/ml/modeling/cluster/k_means.py +248 -175
- snowflake/ml/modeling/cluster/mean_shift.py +246 -175
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
- snowflake/ml/modeling/cluster/optics.py +246 -175
- snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
- snowflake/ml/modeling/compose/column_transformer.py +248 -175
- snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
- snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
- snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
- snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
- snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
- snowflake/ml/modeling/covariance/oas.py +246 -175
- snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
- snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
- snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
- snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
- snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/pca.py +248 -175
- snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
- snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
- snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
- snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
- snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
- snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +72 -37
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
- snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
- snowflake/ml/modeling/impute/knn_imputer.py +248 -175
- snowflake/ml/modeling/impute/missing_indicator.py +248 -175
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/lars.py +246 -175
- snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
- snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/perceptron.py +246 -175
- snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ridge.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
- snowflake/ml/modeling/manifold/isomap.py +248 -175
- snowflake/ml/modeling/manifold/mds.py +248 -175
- snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
- snowflake/ml/modeling/manifold/tsne.py +248 -175
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
- snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
- snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
- snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
- snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
- snowflake/ml/modeling/pipeline/pipeline.py +517 -35
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
- snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
- snowflake/ml/modeling/svm/linear_svc.py +246 -175
- snowflake/ml/modeling/svm/linear_svr.py +246 -175
- snowflake/ml/modeling/svm/nu_svc.py +246 -175
- snowflake/ml/modeling/svm/nu_svr.py +246 -175
- snowflake/ml/modeling/svm/svc.py +246 -175
- snowflake/ml/modeling/svm/svr.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
- snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
The hunks below are from snowflake/ml/modeling/cluster/feature_agglomeration.py (+248 -175, the single file diff rendered on this page).

```diff
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
     BatchInferenceKwargsTypedDict,
     ScoreKwargsTypedDict
 )
+from snowflake.ml.model._signatures import utils as model_signature_utils
+from snowflake.ml.model.model_signature import (
+    BaseFeatureSpec,
+    DataType,
+    FeatureSpec,
+    ModelSignature,
+    _infer_signature,
+    _rename_signature_with_snowflake_identifiers,
+)
 
 from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
 
```
```diff
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     validate_sklearn_args,
 )
 
-from snowflake.ml.model.model_signature import (
-    DataType,
-    FeatureSpec,
-    ModelSignature,
-    _infer_signature,
-    _rename_signature_with_snowflake_identifiers,
-    BaseFeatureSpec,
-)
-from snowflake.ml.model._signatures import utils as model_signature_utils
-
 _PROJECT = "ModelDevelopment"
 # Derive subproject from module name by removing "sklearn"
 # and converting module name from underscore to CamelCase
```
```diff
@@ -61,12 +60,6 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
-def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
-    def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
-        return False and callable(getattr(self._sklearn_object, "fit_transform", None))
-    return check
-
-
 class FeatureAgglomeration(BaseTransformer):
     r"""Agglomerate features
     For more details on this class, see [sklearn.cluster.FeatureAgglomeration]
```
```diff
@@ -282,12 +275,7 @@ class FeatureAgglomeration(BaseTransformer):
         )
         return selected_cols
 
-    @telemetry.send_api_usage_telemetry(
-        project=_PROJECT,
-        subproject=_SUBPROJECT,
-        custom_tags=dict([("autogen", True)]),
-    )
-    def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "FeatureAgglomeration":
+    def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "FeatureAgglomeration":
         """Fit the hierarchical clustering on the data
         For more details on this function, see [sklearn.cluster.FeatureAgglomeration.fit]
         (https://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html#sklearn.cluster.FeatureAgglomeration.fit)
```
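The `fit` → `_fit` rename above drops the per-class telemetry decorator, which suggests the public entry point now lives once in the shared base class (`snowflake/ml/modeling/framework/base.py` also changed in this release). A minimal sketch of that delegation pattern, with hypothetical helper names — `_send_fit_telemetry` is illustrative and not taken from this diff:

```python
import pandas as pd


class BaseTransformerSketch:
    # Hypothetical sketch of the pattern implied by the fit -> _fit rename:
    # cross-cutting concerns (telemetry, validation) run once here instead of
    # being repeated in every generated subclass.
    def fit(self, dataset: pd.DataFrame) -> "BaseTransformerSketch":
        self._send_fit_telemetry()   # illustrative stand-in for the telemetry decorator
        return self._fit(dataset)    # subclass-specific training logic

    def _send_fit_telemetry(self) -> None:
        pass  # placeholder; real code records project/subproject usage

    def _fit(self, dataset: pd.DataFrame) -> "BaseTransformerSketch":
        raise NotImplementedError
```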
```diff
@@ -314,12 +302,14 @@ class FeatureAgglomeration(BaseTransformer):
 
         self._snowpark_cols = dataset.select(self.input_cols).columns
 
-
+        # If we are already in a stored procedure, no need to kick off another one.
         if SNOWML_SPROC_ENV in os.environ:
             statement_params = telemetry.get_function_usage_statement_params(
                 project=_PROJECT,
                 subproject=_SUBPROJECT,
-                function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), FeatureAgglomeration.__class__.__name__),
+                function_name=telemetry.get_statement_params_full_func_name(
+                    inspect.currentframe(), FeatureAgglomeration.__class__.__name__
+                ),
                 api_calls=[Session.call],
                 custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
             )
```
```diff
@@ -340,27 +330,24 @@ class FeatureAgglomeration(BaseTransformer):
         )
         self._sklearn_object = model_trainer.train()
         self._is_fitted = True
-        self.
+        self._generate_model_signatures(dataset)
         return self
 
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
-    ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe
-        return the available package that exists in the snowflake anaconda channel
+    ) -> None:
+        """Util method to run validate that batch inference can be run on a snowpark dataframe.
 
         Args:
             dataset: snowpark dataframe
             inference_method: the inference method such as predict, score...
-
+
         Raises:
             SnowflakeMLException: If the estimator is not fitted, raise error
             SnowflakeMLException: If the session is None, raise error
 
-            Returns:
-                A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
```
```diff
@@ -378,9 +365,7 @@ class FeatureAgglomeration(BaseTransformer):
                     "Session must not specified for snowpark dataset."
                 ),
             )
-
-        return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
-            pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
+
 
     @available_if(original_estimator_has_callable("predict"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
```
```diff
@@ -414,7 +399,9 @@ class FeatureAgglomeration(BaseTransformer):
             # when it is classifier, infer the datatype from label columns
             if expected_type_inferred == "" and 'predict' in self.model_signatures:
                 # Batch inference takes a single expected output column type. Use the first columns type for now.
-                label_cols_signatures = [row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols]
+                label_cols_signatures = [
+                    row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
+                ]
                 if len(label_cols_signatures) == 0:
                     error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
                     raise exceptions.SnowflakeMLException(
```
```diff
@@ -422,25 +409,23 @@ class FeatureAgglomeration(BaseTransformer):
                         original_exception=ValueError(error_str),
                     )
 
-                expected_type_inferred = convert_sp_to_sf_type(
-                    label_cols_signatures[0].as_snowpark_type()
-                )
+                expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
 
-            self.
-
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
+            assert isinstance(
+                dataset._session, Session
+            )  # mypy does not recognize the check in _batch_inference_validate_snowpark()
 
             transform_kwargs = dict(
-                session = dataset._session,
-                dependencies = self._deps,
-                drop_input_cols = self._drop_input_cols,
-                expected_output_cols_type = expected_type_inferred,
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_type_inferred,
             )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
```
```diff
@@ -482,7 +467,7 @@ class FeatureAgglomeration(BaseTransformer):
             Transformed dataset.
         """
         super()._check_dataset_type(dataset)
-        inference_method="transform"
+        inference_method = "transform"
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
```
```diff
@@ -512,24 +497,19 @@ class FeatureAgglomeration(BaseTransformer):
             if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
                 expected_dtype = convert_sp_to_sf_type(output_types[0])
 
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method=inference_method,
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
             assert isinstance(dataset._session, Session)  # mypy does not recognize the check in _batch_inference_validate_snowpark()
 
             transform_kwargs = dict(
-                session = dataset._session,
-                dependencies = self._deps,
-                drop_input_cols = self._drop_input_cols,
-                expected_output_cols_type = expected_dtype,
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_dtype,
             )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
```
```diff
@@ -548,7 +528,11 @@ class FeatureAgglomeration(BaseTransformer):
         return output_df
 
     @available_if(original_estimator_has_callable("fit_predict"))  # type: ignore[misc]
-    def fit_predict(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_predict_",) -> Union[DataFrame, pd.DataFrame]:
+    def fit_predict(
+        self,
+        dataset: Union[DataFrame, pd.DataFrame],
+        output_cols_prefix: str = "fit_predict_",
+    ) -> Union[DataFrame, pd.DataFrame]:
         """ Fit and return the result of each sample's clustering assignment
         For more details on this function, see [sklearn.cluster.FeatureAgglomeration.fit_predict]
         (https://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html#sklearn.cluster.FeatureAgglomeration.fit_predict)
```
```diff
@@ -575,22 +559,106 @@ class FeatureAgglomeration(BaseTransformer):
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
             drop_input_cols=self._drop_input_cols,
-            expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
+            expected_output_cols_list=(
+                self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
+            ),
         )
         self._sklearn_object = fitted_estimator
         self._is_fitted = True
         return output_result
 
+
+    @available_if(original_estimator_has_callable("fit_transform"))  # type: ignore[misc]
+    def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_transform_",) -> Union[DataFrame, pd.DataFrame]:
+        """ Fit to data, then transform it
+        For more details on this function, see [sklearn.cluster.FeatureAgglomeration.fit_transform]
+        (https://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html#sklearn.cluster.FeatureAgglomeration.fit_transform)
+
+
+        Raises:
+            TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame.
 
-
-
-
+        Args:
+            dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
+                Snowpark or Pandas DataFrame.
+            output_cols_prefix: Prefix for the response columns
         Returns:
             Transformed dataset.
         """
-        self.
-
-
+        self._infer_input_output_cols(dataset)
+        super()._check_dataset_type(dataset)
+        model_trainer = ModelTrainerBuilder.build_fit_transform(
+            estimator=self._sklearn_object,
+            dataset=dataset,
+            input_cols=self.input_cols,
+            label_cols=self.label_cols,
+            sample_weight_col=self.sample_weight_col,
+            autogenerated=self._autogenerated,
+            subproject=_SUBPROJECT,
+        )
+        output_result, fitted_estimator = model_trainer.train_fit_transform(
+            drop_input_cols=self._drop_input_cols,
+            expected_output_cols_list=self.output_cols,
+        )
+        self._sklearn_object = fitted_estimator
+        self._is_fitted = True
+        return output_result
+
+
+    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
+        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
+        """
+        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
+        # The following condition is introduced for kneighbors methods, and not used in other methods
+        if output_cols:
+            output_cols = [
+                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
+                for c in output_cols
+            ]
+        elif getattr(self._sklearn_object, "classes_", None) is None:
+            output_cols = [output_cols_prefix]
+        elif self._sklearn_object is not None:
+            classes = self._sklearn_object.classes_
+            if isinstance(classes, numpy.ndarray):
+                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
+            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
+                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
+                output_cols = []
+                for i, cl in enumerate(classes):
+                    # For binary classification, there is only one output column for each class
+                    # ndarray as the two classes are complementary.
+                    if len(cl) == 2:
+                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
+                    else:
+                        output_cols.extend([
+                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
+                        ])
+        else:
+            output_cols = []
+
+        # Make sure column names are valid snowflake identifiers.
+        assert output_cols is not None  # Make MyPy happy
+        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
+
+        return rv
+
+    def _align_expected_output_names(
+        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
+    ) -> List[str]:
+        # in case the inferred output column names dimension is different
+        # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
+        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        output_df_columns = list(output_df_pd.columns)
+        output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
+        if self.sample_weight_col:
+            output_df_columns_set -= set(self.sample_weight_col)
+        # if the dimension of inferred output column names is correct; use it
+        if len(expected_output_cols_list) == len(output_df_columns_set):
+            return expected_output_cols_list
+        # otherwise, use the sklearn estimator's output
+        else:
+            return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
 
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
```
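For orientation, a minimal local usage sketch of the `fit_transform` method added above. It assumes the standard wrapper constructor parameters (`n_clusters`, `input_cols`, `output_cols`); the data and column names are illustrative:

```python
import pandas as pd

from snowflake.ml.modeling.cluster import FeatureAgglomeration

# Three numeric input columns; with n_clusters=2 FeatureAgglomeration merges
# them into two agglomerated features.
df = pd.DataFrame({
    "F0": [1.0, 2.0, 3.0],
    "F1": [1.1, 2.1, 3.1],
    "F2": [9.0, 8.0, 7.0],
})

agg = FeatureAgglomeration(
    n_clusters=2,
    input_cols=["F0", "F1", "F2"],
    output_cols=["OUT_0", "OUT_1"],
)

# New in 1.5.0: fit and transform in a single call. With output_cols unset,
# names would instead be derived from the output_cols_prefix argument.
result = agg.fit_transform(df)
print(result)
```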
```diff
@@ -622,24 +690,26 @@ class FeatureAgglomeration(BaseTransformer):
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method=inference_method,
-            )
-            assert isinstance(dataset._session, Session)  # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
+            assert isinstance(
+                dataset._session, Session
+            )  # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
```
```diff
@@ -651,7 +721,7 @@ class FeatureAgglomeration(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
```
```diff
@@ -681,29 +751,30 @@ class FeatureAgglomeration(BaseTransformer):
             Output dataset with log probability of the sample for each class in the model.
         """
         super()._check_dataset_type(dataset)
-        inference_method="predict_log_proba"
+        inference_method = "predict_log_proba"
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method=inference_method,
-            )
-            assert isinstance(dataset._session, Session)  # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
+            assert isinstance(
+                dataset._session, Session
+            )  # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
```
```diff
@@ -716,7 +787,7 @@ class FeatureAgglomeration(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
```
```diff
@@ -742,30 +813,32 @@ class FeatureAgglomeration(BaseTransformer):
             Output dataset with results of the decision function for the samples in input dataset.
         """
         super()._check_dataset_type(dataset)
-        inference_method="decision_function"
+        inference_method = "decision_function"
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method=inference_method,
-            )
-            assert isinstance(dataset._session, Session)  # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
+            assert isinstance(
+                dataset._session, Session
+            )  # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
```
```diff
@@ -778,7 +851,7 @@ class FeatureAgglomeration(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
```
```diff
@@ -807,17 +880,17 @@ class FeatureAgglomeration(BaseTransformer):
             Output dataset with probability of the sample for each class in the model.
         """
         super()._check_dataset_type(dataset)
-        inference_method="score_samples"
+        inference_method = "score_samples"
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method=inference_method,
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
             assert isinstance(dataset._session, Session)  # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
```
```diff
@@ -825,6 +898,9 @@ class FeatureAgglomeration(BaseTransformer):
                 drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
             transform_kwargs = dict(
```
```diff
@@ -843,7 +919,7 @@ class FeatureAgglomeration(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
```
```diff
@@ -876,17 +952,15 @@ class FeatureAgglomeration(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method="score",
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method="score")
+            self._deps = self._get_dependencies()
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session)  # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=
+                dependencies=self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
```
```diff
@@ -951,11 +1025,8 @@ class FeatureAgglomeration(BaseTransformer):
 
         if isinstance(dataset, DataFrame):
 
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method=inference_method,
-
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
             assert isinstance(dataset._session, Session)  # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session = dataset._session,
```
```diff
@@ -988,50 +1059,84 @@ class FeatureAgglomeration(BaseTransformer):
             )
         return output_df
 
+
+
+    def to_sklearn(self) -> Any:
+        """Get sklearn.cluster.FeatureAgglomeration object.
+        """
+        if self._sklearn_object is None:
+            self._sklearn_object = self._create_sklearn_object()
+        return self._sklearn_object
+
+    def to_xgboost(self) -> Any:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.METHOD_NOT_ALLOWED,
+            original_exception=AttributeError(
+                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
+                    "to_xgboost()",
+                    "to_sklearn()"
+                )
+            ),
+        )
 
-    def
+    def to_lightgbm(self) -> Any:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.METHOD_NOT_ALLOWED,
+            original_exception=AttributeError(
+                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
+                    "to_lightgbm()",
+                    "to_sklearn()"
+                )
+            ),
+        )
+
+    def _get_dependencies(self) -> List[str]:
+        return self._deps
+
+
+    def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         self._model_signature_dict = dict()
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input"))
+        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
-            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type") 
+            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
             # For classifier, the type of predict is the same as the type of label
-            if self._sklearn_object._estimator_type ==
-
+            if self._sklearn_object._estimator_type == "classifier":
+                # label columns is the desired type for output
                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
             # For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
             # For outlier models, returns -1 for outliers and 1 for inliers.
-            # Clusterer returns int64 cluster labels. 
+            # Clusterer returns int64 cluster labels.
             elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
                 outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
             # For regressor, the type of predict is float64
-            elif self._sklearn_object._estimator_type ==
+            elif self._sklearn_object._estimator_type == "regressor":
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(
-
-
+                self._model_signature_dict[prob_func] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
 
         # Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
         items = list(self._model_signature_dict.items())
```
```diff
@@ -1044,10 +1149,10 @@ class FeatureAgglomeration(BaseTransformer):
         """Returns model signature of current class.
 
         Raises:
-
+            SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
 
         Returns:
-            Dict
+            Dict with each method and its input output signature
         """
         if self._model_signature_dict is None:
             raise exceptions.SnowflakeMLException(
```
```diff
@@ -1055,35 +1160,3 @@ class FeatureAgglomeration(BaseTransformer):
                 original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
             )
         return self._model_signature_dict
-
-    def to_sklearn(self) -> Any:
-        """Get sklearn.cluster.FeatureAgglomeration object.
-        """
-        if self._sklearn_object is None:
-            self._sklearn_object = self._create_sklearn_object()
-        return self._sklearn_object
-
-    def to_xgboost(self) -> Any:
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.METHOD_NOT_ALLOWED,
-            original_exception=AttributeError(
-                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
-                    "to_xgboost()",
-                    "to_sklearn()"
-                )
-            ),
-        )
-
-    def to_lightgbm(self) -> Any:
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.METHOD_NOT_ALLOWED,
-            original_exception=AttributeError(
-                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
-                    "to_lightgbm()",
-                    "to_sklearn()"
-                )
-            ),
-        )
-
-    def _get_dependencies(self) -> List[str]:
-        return self._deps
```
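Finally, a sketch of the `model_signatures` contract shown above: the signature dictionary is populated during fit, so touching the property on an unfitted estimator raises. Data and column names are illustrative:

```python
import pandas as pd

from snowflake.ml.modeling.cluster import FeatureAgglomeration

agg = FeatureAgglomeration(n_clusters=2, input_cols=["F0", "F1", "F2"])

try:
    agg.model_signatures  # not fitted yet
except Exception as exc:
    print(exc)  # "Estimator not fitted before accessing property model_signatures!"

df = pd.DataFrame({
    "F0": [1.0, 2.0, 4.0],
    "F1": [2.0, 3.0, 1.0],
    "F2": [5.0, 1.0, 0.5],
})
agg.fit(df)
# Signatures recorded for whichever inference methods this estimator supports.
print(list(agg.model_signatures.keys()))
```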