snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +77 -32
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/dataset/__init__.py +10 -0
- snowflake/ml/dataset/dataset.py +454 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +202 -0
- snowflake/ml/feature_store/feature_store.py +531 -332
- snowflake/ml/feature_store/feature_view.py +40 -23
- snowflake/ml/fileset/embedded_stage_fs.py +146 -0
- snowflake/ml/fileset/sfcfs.py +56 -54
- snowflake/ml/fileset/snowfs.py +159 -0
- snowflake/ml/fileset/stage_fs.py +49 -17
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +27 -0
- snowflake/ml/model/_client/model/model_version_impl.py +137 -50
- snowflake/ml/model/_client/ops/model_ops.py +159 -40
- snowflake/ml/model/_client/sql/model.py +25 -2
- snowflake/ml/model/_client/sql/model_version.py +131 -2
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -5
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
- snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
- snowflake/ml/modeling/cluster/birch.py +248 -175
- snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
- snowflake/ml/modeling/cluster/dbscan.py +246 -175
- snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
- snowflake/ml/modeling/cluster/k_means.py +248 -175
- snowflake/ml/modeling/cluster/mean_shift.py +246 -175
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
- snowflake/ml/modeling/cluster/optics.py +246 -175
- snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
- snowflake/ml/modeling/compose/column_transformer.py +248 -175
- snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
- snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
- snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
- snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
- snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
- snowflake/ml/modeling/covariance/oas.py +246 -175
- snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
- snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
- snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
- snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
- snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/pca.py +248 -175
- snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
- snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
- snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
- snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
- snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
- snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +72 -37
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
- snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
- snowflake/ml/modeling/impute/knn_imputer.py +248 -175
- snowflake/ml/modeling/impute/missing_indicator.py +248 -175
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/lars.py +246 -175
- snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
- snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/perceptron.py +246 -175
- snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ridge.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
- snowflake/ml/modeling/manifold/isomap.py +248 -175
- snowflake/ml/modeling/manifold/mds.py +248 -175
- snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
- snowflake/ml/modeling/manifold/tsne.py +248 -175
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
- snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
- snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
- snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
- snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
- snowflake/ml/modeling/pipeline/pipeline.py +517 -35
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
- snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
- snowflake/ml/modeling/svm/linear_svc.py +246 -175
- snowflake/ml/modeling/svm/linear_svr.py +246 -175
- snowflake/ml/modeling/svm/nu_svc.py +246 -175
- snowflake/ml/modeling/svm/nu_svr.py +246 -175
- snowflake/ml/modeling/svm/svc.py +246 -175
- snowflake/ml/modeling/svm/svr.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
- snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/cluster/bisecting_k_means.py

@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
     BatchInferenceKwargsTypedDict,
     ScoreKwargsTypedDict
 )
+from snowflake.ml.model._signatures import utils as model_signature_utils
+from snowflake.ml.model.model_signature import (
+    BaseFeatureSpec,
+    DataType,
+    FeatureSpec,
+    ModelSignature,
+    _infer_signature,
+    _rename_signature_with_snowflake_identifiers,
+)

 from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder

@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     validate_sklearn_args,
 )

-from snowflake.ml.model.model_signature import (
-    DataType,
-    FeatureSpec,
-    ModelSignature,
-    _infer_signature,
-    _rename_signature_with_snowflake_identifiers,
-    BaseFeatureSpec,
-)
-from snowflake.ml.model._signatures import utils as model_signature_utils
-
 _PROJECT = "ModelDevelopment"
 # Derive subproject from module name by removing "sklearn"
 # and converting module name from underscore to CamelCase
@@ -61,12 +60,6 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear

 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

-def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
-    def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
-        return False and callable(getattr(self._sklearn_object, "fit_transform", None))
-    return check
-
-
 class BisectingKMeans(BaseTransformer):
     r"""Bisecting K-Means clustering
     For more details on this class, see [sklearn.cluster.BisectingKMeans]
@@ -282,12 +275,7 @@ class BisectingKMeans(BaseTransformer):
         )
         return selected_cols

-
-        project=_PROJECT,
-        subproject=_SUBPROJECT,
-        custom_tags=dict([("autogen", True)]),
-    )
-    def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "BisectingKMeans":
+    def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "BisectingKMeans":
         """Compute bisecting k-means clustering
         For more details on this function, see [sklearn.cluster.BisectingKMeans.fit]
         (https://scikit-learn.org/stable/modules/generated/sklearn.cluster.BisectingKMeans.html#sklearn.cluster.BisectingKMeans.fit)
@@ -314,12 +302,14 @@ class BisectingKMeans(BaseTransformer):

         self._snowpark_cols = dataset.select(self.input_cols).columns

-
+        # If we are already in a stored procedure, no need to kick off another one.
         if SNOWML_SPROC_ENV in os.environ:
             statement_params = telemetry.get_function_usage_statement_params(
                 project=_PROJECT,
                 subproject=_SUBPROJECT,
-                function_name=telemetry.get_statement_params_full_func_name(
+                function_name=telemetry.get_statement_params_full_func_name(
+                    inspect.currentframe(), BisectingKMeans.__class__.__name__
+                ),
                 api_calls=[Session.call],
                 custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
             )
@@ -340,27 +330,24 @@ class BisectingKMeans(BaseTransformer):
         )
         self._sklearn_object = model_trainer.train()
         self._is_fitted = True
-        self.
+        self._generate_model_signatures(dataset)
         return self

     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
-    ) ->
-        """Util method to run validate that batch inference can be run on a snowpark dataframe
-        return the available package that exists in the snowflake anaconda channel
+    ) -> None:
+        """Util method to run validate that batch inference can be run on a snowpark dataframe.

         Args:
             dataset: snowpark dataframe
             inference_method: the inference method such as predict, score...
-
+
         Raises:
             SnowflakeMLException: If the estimator is not fitted, raise error
             SnowflakeMLException: If the session is None, raise error

-        Returns:
-            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -378,9 +365,7 @@ class BisectingKMeans(BaseTransformer):
                     "Session must not specified for snowpark dataset."
                 ),
             )
-
-        return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
-            pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
+

     @available_if(original_estimator_has_callable("predict"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -416,7 +401,9 @@ class BisectingKMeans(BaseTransformer):
             # when it is classifier, infer the datatype from label columns
             if expected_type_inferred == "" and 'predict' in self.model_signatures:
                 # Batch inference takes a single expected output column type. Use the first columns type for now.
-                label_cols_signatures = [
+                label_cols_signatures = [
+                    row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
+                ]
                 if len(label_cols_signatures) == 0:
                     error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
                     raise exceptions.SnowflakeMLException(
@@ -424,25 +411,23 @@ class BisectingKMeans(BaseTransformer):
                         original_exception=ValueError(error_str),
                     )

-                expected_type_inferred = convert_sp_to_sf_type(
-                    label_cols_signatures[0].as_snowpark_type()
-                )
+                expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())

-            self.
-
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
+            assert isinstance(
+                dataset._session, Session
+            )  # mypy does not recognize the check in _batch_inference_validate_snowpark()

             transform_kwargs = dict(
-                session
-                dependencies
-                drop_input_cols
-                expected_output_cols_type
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_type_inferred,
             )

         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)

         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -484,7 +469,7 @@ class BisectingKMeans(BaseTransformer):
             Transformed dataset.
         """
         super()._check_dataset_type(dataset)
-        inference_method="transform"
+        inference_method = "transform"

         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
@@ -514,24 +499,19 @@ class BisectingKMeans(BaseTransformer):
                 if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
                     expected_dtype = convert_sp_to_sf_type(output_types[0])

-            self.
-
-                inference_method=inference_method,
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
             assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()

             transform_kwargs = dict(
-                session
-                dependencies
-                drop_input_cols
-                expected_output_cols_type
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_dtype,
             )

         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)

         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -550,7 +530,11 @@ class BisectingKMeans(BaseTransformer):
         return output_df

     @available_if(original_estimator_has_callable("fit_predict"))  # type: ignore[misc]
-    def fit_predict(
+    def fit_predict(
+        self,
+        dataset: Union[DataFrame, pd.DataFrame],
+        output_cols_prefix: str = "fit_predict_",
+    ) -> Union[DataFrame, pd.DataFrame]:
         """ Compute cluster centers and predict cluster index for each sample
         For more details on this function, see [sklearn.cluster.BisectingKMeans.fit_predict]
         (https://scikit-learn.org/stable/modules/generated/sklearn.cluster.BisectingKMeans.html#sklearn.cluster.BisectingKMeans.fit_predict)
@@ -577,22 +561,106 @@ class BisectingKMeans(BaseTransformer):
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
             drop_input_cols=self._drop_input_cols,
-            expected_output_cols_list=
+            expected_output_cols_list=(
+                self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
+            ),
         )
         self._sklearn_object = fitted_estimator
         self._is_fitted = True
         return output_result

+
+    @available_if(original_estimator_has_callable("fit_transform"))  # type: ignore[misc]
+    def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_transform_",) -> Union[DataFrame, pd.DataFrame]:
+        """ Compute clustering and transform X to cluster-distance space
+        For more details on this function, see [sklearn.cluster.BisectingKMeans.fit_transform]
+        (https://scikit-learn.org/stable/modules/generated/sklearn.cluster.BisectingKMeans.html#sklearn.cluster.BisectingKMeans.fit_transform)
+
+
+        Raises:
+            TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame.

-
-
-
+        Args:
+            dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
+                Snowpark or Pandas DataFrame.
+            output_cols_prefix: Prefix for the response columns
         Returns:
             Transformed dataset.
         """
-        self.
-
-
+        self._infer_input_output_cols(dataset)
+        super()._check_dataset_type(dataset)
+        model_trainer = ModelTrainerBuilder.build_fit_transform(
+            estimator=self._sklearn_object,
+            dataset=dataset,
+            input_cols=self.input_cols,
+            label_cols=self.label_cols,
+            sample_weight_col=self.sample_weight_col,
+            autogenerated=self._autogenerated,
+            subproject=_SUBPROJECT,
+        )
+        output_result, fitted_estimator = model_trainer.train_fit_transform(
+            drop_input_cols=self._drop_input_cols,
+            expected_output_cols_list=self.output_cols,
+        )
+        self._sklearn_object = fitted_estimator
+        self._is_fitted = True
+        return output_result
+
+
+    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
+        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
+        """
+        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
+        # The following condition is introduced for kneighbors methods, and not used in other methods
+        if output_cols:
+            output_cols = [
+                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
+                for c in output_cols
+            ]
+        elif getattr(self._sklearn_object, "classes_", None) is None:
+            output_cols = [output_cols_prefix]
+        elif self._sklearn_object is not None:
+            classes = self._sklearn_object.classes_
+            if isinstance(classes, numpy.ndarray):
+                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
+            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
+                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
+                output_cols = []
+                for i, cl in enumerate(classes):
+                    # For binary classification, there is only one output column for each class
+                    # ndarray as the two classes are complementary.
+                    if len(cl) == 2:
+                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
+                    else:
+                        output_cols.extend([
+                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
+                        ])
+        else:
+            output_cols = []
+
+        # Make sure column names are valid snowflake identifiers.
+        assert output_cols is not None  # Make MyPy happy
+        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
+
+        return rv
+
+    def _align_expected_output_names(
+        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
+    ) -> List[str]:
+        # in case the inferred output column names dimension is different
+        # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
+        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        output_df_columns = list(output_df_pd.columns)
+        output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
+        if self.sample_weight_col:
+            output_df_columns_set -= set(self.sample_weight_col)
+        # if the dimension of inferred output column names is correct; use it
+        if len(expected_output_cols_list) == len(output_df_columns_set):
+            return expected_output_cols_list
+        # otherwise, use the sklearn estimator's output
+        else:
+            return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))

     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -624,24 +692,26 @@ class BisectingKMeans(BaseTransformer):
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()

+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
-            self.
-
-
-
-
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
+            assert isinstance(
+                dataset._session, Session
+            )  # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )

         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)

         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -653,7 +723,7 @@ class BisectingKMeans(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -683,29 +753,30 @@ class BisectingKMeans(BaseTransformer):
             Output dataset with log probability of the sample for each class in the model.
         """
         super()._check_dataset_type(dataset)
-        inference_method="predict_log_proba"
+        inference_method = "predict_log_proba"
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)

         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()

         if isinstance(dataset, DataFrame):
-            self.
-
-
-
-
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
+            assert isinstance(
+                dataset._session, Session
+            )  # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)

         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -718,7 +789,7 @@ class BisectingKMeans(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -744,30 +815,32 @@ class BisectingKMeans(BaseTransformer):
             Output dataset with results of the decision function for the samples in input dataset.
         """
         super()._check_dataset_type(dataset)
-        inference_method="decision_function"
+        inference_method = "decision_function"

         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()

+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
-            self.
-
-
-
-
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
+            assert isinstance(
+                dataset._session, Session
+            )  # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )

         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)

         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -780,7 +853,7 @@ class BisectingKMeans(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -809,17 +882,17 @@ class BisectingKMeans(BaseTransformer):
             Output dataset with probability of the sample for each class in the model.
         """
         super()._check_dataset_type(dataset)
-        inference_method="score_samples"
+        inference_method = "score_samples"

         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()

+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
-            self.
-
-                inference_method=inference_method,
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
             assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
@@ -827,6 +900,9 @@ class BisectingKMeans(BaseTransformer):
                 drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )

         elif isinstance(dataset, pd.DataFrame):
             transform_kwargs = dict(
@@ -845,7 +921,7 @@ class BisectingKMeans(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -880,17 +956,15 @@ class BisectingKMeans(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()

         if isinstance(dataset, DataFrame):
-            self.
-
-                inference_method="score",
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method="score")
+            self._deps = self._get_dependencies()
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session) # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=
+                dependencies=self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -955,11 +1029,8 @@ class BisectingKMeans(BaseTransformer):

         if isinstance(dataset, DataFrame):

-            self.
-
-                inference_method=inference_method,
-
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
             assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session = dataset._session,
@@ -992,50 +1063,84 @@ class BisectingKMeans(BaseTransformer):
             )
         return output_df

+
+
+    def to_sklearn(self) -> Any:
+        """Get sklearn.cluster.BisectingKMeans object.
+        """
+        if self._sklearn_object is None:
+            self._sklearn_object = self._create_sklearn_object()
+        return self._sklearn_object
+
+    def to_xgboost(self) -> Any:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.METHOD_NOT_ALLOWED,
+            original_exception=AttributeError(
+                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
+                    "to_xgboost()",
+                    "to_sklearn()"
+                )
+            ),
+        )

-    def
+    def to_lightgbm(self) -> Any:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.METHOD_NOT_ALLOWED,
+            original_exception=AttributeError(
+                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
+                    "to_lightgbm()",
+                    "to_sklearn()"
+                )
+            ),
+        )
+
+    def _get_dependencies(self) -> List[str]:
+        return self._deps
+
+
+    def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         self._model_signature_dict = dict()

         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

-        inputs = list(_infer_signature(dataset[self.input_cols], "input"))
+        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
-            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
+            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
             # For classifier, the type of predict is the same as the type of label
-            if self._sklearn_object._estimator_type ==
-
+            if self._sklearn_object._estimator_type == "classifier":
+                # label columns is the desired type for output
                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
             # For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
             # For outlier models, returns -1 for outliers and 1 for inliers.
-            # Clusterer returns int64 cluster labels.
+            # Clusterer returns int64 cluster labels.
             elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
                 outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
             # For regressor, the type of predict is float64
-            elif self._sklearn_object._estimator_type ==
+            elif self._sklearn_object._estimator_type == "regressor":
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(
-
-
+                self._model_signature_dict[prob_func] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )

         # Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
         items = list(self._model_signature_dict.items())
@@ -1048,10 +1153,10 @@ class BisectingKMeans(BaseTransformer):
         """Returns model signature of current class.

         Raises:
-
+            SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred

         Returns:
-            Dict
+            Dict with each method and its input output signature
         """
         if self._model_signature_dict is None:
             raise exceptions.SnowflakeMLException(
@@ -1059,35 +1164,3 @@ class BisectingKMeans(BaseTransformer):
                 original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
             )
         return self._model_signature_dict
-
-    def to_sklearn(self) -> Any:
-        """Get sklearn.cluster.BisectingKMeans object.
-        """
-        if self._sklearn_object is None:
-            self._sklearn_object = self._create_sklearn_object()
-        return self._sklearn_object
-
-    def to_xgboost(self) -> Any:
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.METHOD_NOT_ALLOWED,
-            original_exception=AttributeError(
-                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
-                    "to_xgboost()",
-                    "to_sklearn()"
-                )
-            ),
-        )
-
-    def to_lightgbm(self) -> Any:
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.METHOD_NOT_ALLOWED,
-            original_exception=AttributeError(
-                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
-                    "to_lightgbm()",
-                    "to_sklearn()"
-                )
-            ),
-        )
-
-    def _get_dependencies(self) -> List[str]:
-        return self._deps