snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -282,12 +281,7 @@ class BisectingKMeans(BaseTransformer):
|
|
282
281
|
)
|
283
282
|
return selected_cols
|
284
283
|
|
285
|
-
|
286
|
-
project=_PROJECT,
|
287
|
-
subproject=_SUBPROJECT,
|
288
|
-
custom_tags=dict([("autogen", True)]),
|
289
|
-
)
|
290
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "BisectingKMeans":
|
284
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "BisectingKMeans":
|
291
285
|
"""Compute bisecting k-means clustering
|
292
286
|
For more details on this function, see [sklearn.cluster.BisectingKMeans.fit]
|
293
287
|
(https://scikit-learn.org/stable/modules/generated/sklearn.cluster.BisectingKMeans.html#sklearn.cluster.BisectingKMeans.fit)
|
@@ -314,12 +308,14 @@ class BisectingKMeans(BaseTransformer):
|
|
314
308
|
|
315
309
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
316
310
|
|
317
|
-
|
311
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
318
312
|
if SNOWML_SPROC_ENV in os.environ:
|
319
313
|
statement_params = telemetry.get_function_usage_statement_params(
|
320
314
|
project=_PROJECT,
|
321
315
|
subproject=_SUBPROJECT,
|
322
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
316
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
317
|
+
inspect.currentframe(), BisectingKMeans.__class__.__name__
|
318
|
+
),
|
323
319
|
api_calls=[Session.call],
|
324
320
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
325
321
|
)
|
@@ -340,7 +336,7 @@ class BisectingKMeans(BaseTransformer):
|
|
340
336
|
)
|
341
337
|
self._sklearn_object = model_trainer.train()
|
342
338
|
self._is_fitted = True
|
343
|
-
self.
|
339
|
+
self._generate_model_signatures(dataset)
|
344
340
|
return self
|
345
341
|
|
346
342
|
def _batch_inference_validate_snowpark(
|
@@ -416,7 +412,9 @@ class BisectingKMeans(BaseTransformer):
|
|
416
412
|
# when it is classifier, infer the datatype from label columns
|
417
413
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
418
414
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
419
|
-
label_cols_signatures = [
|
415
|
+
label_cols_signatures = [
|
416
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
417
|
+
]
|
420
418
|
if len(label_cols_signatures) == 0:
|
421
419
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
422
420
|
raise exceptions.SnowflakeMLException(
|
@@ -424,25 +422,22 @@ class BisectingKMeans(BaseTransformer):
|
|
424
422
|
original_exception=ValueError(error_str),
|
425
423
|
)
|
426
424
|
|
427
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
428
|
-
label_cols_signatures[0].as_snowpark_type()
|
429
|
-
)
|
425
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
430
426
|
|
431
427
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
432
|
-
assert isinstance(
|
428
|
+
assert isinstance(
|
429
|
+
dataset._session, Session
|
430
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
433
431
|
|
434
432
|
transform_kwargs = dict(
|
435
|
-
session
|
436
|
-
dependencies
|
437
|
-
drop_input_cols
|
438
|
-
expected_output_cols_type
|
433
|
+
session=dataset._session,
|
434
|
+
dependencies=self._deps,
|
435
|
+
drop_input_cols=self._drop_input_cols,
|
436
|
+
expected_output_cols_type=expected_type_inferred,
|
439
437
|
)
|
440
438
|
|
441
439
|
elif isinstance(dataset, pd.DataFrame):
|
442
|
-
transform_kwargs = dict(
|
443
|
-
snowpark_input_cols = self._snowpark_cols,
|
444
|
-
drop_input_cols = self._drop_input_cols
|
445
|
-
)
|
440
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
446
441
|
|
447
442
|
transform_handlers = ModelTransformerBuilder.build(
|
448
443
|
dataset=dataset,
|
@@ -484,7 +479,7 @@ class BisectingKMeans(BaseTransformer):
|
|
484
479
|
Transformed dataset.
|
485
480
|
"""
|
486
481
|
super()._check_dataset_type(dataset)
|
487
|
-
inference_method="transform"
|
482
|
+
inference_method = "transform"
|
488
483
|
|
489
484
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
490
485
|
# are specific to the type of dataset used.
|
@@ -521,17 +516,14 @@ class BisectingKMeans(BaseTransformer):
|
|
521
516
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
522
517
|
|
523
518
|
transform_kwargs = dict(
|
524
|
-
session
|
525
|
-
dependencies
|
526
|
-
drop_input_cols
|
527
|
-
expected_output_cols_type
|
519
|
+
session=dataset._session,
|
520
|
+
dependencies=self._deps,
|
521
|
+
drop_input_cols=self._drop_input_cols,
|
522
|
+
expected_output_cols_type=expected_dtype,
|
528
523
|
)
|
529
524
|
|
530
525
|
elif isinstance(dataset, pd.DataFrame):
|
531
|
-
transform_kwargs = dict(
|
532
|
-
snowpark_input_cols = self._snowpark_cols,
|
533
|
-
drop_input_cols = self._drop_input_cols
|
534
|
-
)
|
526
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
535
527
|
|
536
528
|
transform_handlers = ModelTransformerBuilder.build(
|
537
529
|
dataset=dataset,
|
@@ -550,7 +542,11 @@ class BisectingKMeans(BaseTransformer):
|
|
550
542
|
return output_df
|
551
543
|
|
552
544
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
553
|
-
def fit_predict(
|
545
|
+
def fit_predict(
|
546
|
+
self,
|
547
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
548
|
+
output_cols_prefix: str = "fit_predict_",
|
549
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
554
550
|
""" Compute cluster centers and predict cluster index for each sample
|
555
551
|
For more details on this function, see [sklearn.cluster.BisectingKMeans.fit_predict]
|
556
552
|
(https://scikit-learn.org/stable/modules/generated/sklearn.cluster.BisectingKMeans.html#sklearn.cluster.BisectingKMeans.fit_predict)
|
@@ -577,7 +573,9 @@ class BisectingKMeans(BaseTransformer):
|
|
577
573
|
)
|
578
574
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
579
575
|
drop_input_cols=self._drop_input_cols,
|
580
|
-
expected_output_cols_list=
|
576
|
+
expected_output_cols_list=(
|
577
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
578
|
+
),
|
581
579
|
)
|
582
580
|
self._sklearn_object = fitted_estimator
|
583
581
|
self._is_fitted = True
|
@@ -594,6 +592,62 @@ class BisectingKMeans(BaseTransformer):
|
|
594
592
|
assert self._sklearn_object is not None
|
595
593
|
return self._sklearn_object.embedding_
|
596
594
|
|
595
|
+
|
596
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
597
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
598
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
599
|
+
"""
|
600
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
601
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
602
|
+
if output_cols:
|
603
|
+
output_cols = [
|
604
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
605
|
+
for c in output_cols
|
606
|
+
]
|
607
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
608
|
+
output_cols = [output_cols_prefix]
|
609
|
+
elif self._sklearn_object is not None:
|
610
|
+
classes = self._sklearn_object.classes_
|
611
|
+
if isinstance(classes, numpy.ndarray):
|
612
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
613
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
614
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
615
|
+
output_cols = []
|
616
|
+
for i, cl in enumerate(classes):
|
617
|
+
# For binary classification, there is only one output column for each class
|
618
|
+
# ndarray as the two classes are complementary.
|
619
|
+
if len(cl) == 2:
|
620
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
621
|
+
else:
|
622
|
+
output_cols.extend([
|
623
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
624
|
+
])
|
625
|
+
else:
|
626
|
+
output_cols = []
|
627
|
+
|
628
|
+
# Make sure column names are valid snowflake identifiers.
|
629
|
+
assert output_cols is not None # Make MyPy happy
|
630
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
631
|
+
|
632
|
+
return rv
|
633
|
+
|
634
|
+
def _align_expected_output_names(
|
635
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
636
|
+
) -> List[str]:
|
637
|
+
# in case the inferred output column names dimension is different
|
638
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
639
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
640
|
+
output_df_columns = list(output_df_pd.columns)
|
641
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
642
|
+
if self.sample_weight_col:
|
643
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
644
|
+
# if the dimension of inferred output column names is correct; use it
|
645
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
646
|
+
return expected_output_cols_list
|
647
|
+
# otherwise, use the sklearn estimator's output
|
648
|
+
else:
|
649
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
650
|
+
|
597
651
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
598
652
|
@telemetry.send_api_usage_telemetry(
|
599
653
|
project=_PROJECT,
|
@@ -624,24 +678,28 @@ class BisectingKMeans(BaseTransformer):
|
|
624
678
|
# are specific to the type of dataset used.
|
625
679
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
626
680
|
|
681
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
682
|
+
|
627
683
|
if isinstance(dataset, DataFrame):
|
628
684
|
self._deps = self._batch_inference_validate_snowpark(
|
629
685
|
dataset=dataset,
|
630
686
|
inference_method=inference_method,
|
631
687
|
)
|
632
|
-
assert isinstance(
|
688
|
+
assert isinstance(
|
689
|
+
dataset._session, Session
|
690
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
633
691
|
transform_kwargs = dict(
|
634
692
|
session=dataset._session,
|
635
693
|
dependencies=self._deps,
|
636
|
-
drop_input_cols
|
694
|
+
drop_input_cols=self._drop_input_cols,
|
637
695
|
expected_output_cols_type="float",
|
638
696
|
)
|
697
|
+
expected_output_cols = self._align_expected_output_names(
|
698
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
699
|
+
)
|
639
700
|
|
640
701
|
elif isinstance(dataset, pd.DataFrame):
|
641
|
-
transform_kwargs = dict(
|
642
|
-
snowpark_input_cols = self._snowpark_cols,
|
643
|
-
drop_input_cols = self._drop_input_cols
|
644
|
-
)
|
702
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
645
703
|
|
646
704
|
transform_handlers = ModelTransformerBuilder.build(
|
647
705
|
dataset=dataset,
|
@@ -653,7 +711,7 @@ class BisectingKMeans(BaseTransformer):
|
|
653
711
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
654
712
|
inference_method=inference_method,
|
655
713
|
input_cols=self.input_cols,
|
656
|
-
expected_output_cols=
|
714
|
+
expected_output_cols=expected_output_cols,
|
657
715
|
**transform_kwargs
|
658
716
|
)
|
659
717
|
return output_df
|
@@ -683,7 +741,8 @@ class BisectingKMeans(BaseTransformer):
|
|
683
741
|
Output dataset with log probability of the sample for each class in the model.
|
684
742
|
"""
|
685
743
|
super()._check_dataset_type(dataset)
|
686
|
-
inference_method="predict_log_proba"
|
744
|
+
inference_method = "predict_log_proba"
|
745
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
687
746
|
|
688
747
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
689
748
|
# are specific to the type of dataset used.
|
@@ -694,18 +753,20 @@ class BisectingKMeans(BaseTransformer):
|
|
694
753
|
dataset=dataset,
|
695
754
|
inference_method=inference_method,
|
696
755
|
)
|
697
|
-
assert isinstance(
|
756
|
+
assert isinstance(
|
757
|
+
dataset._session, Session
|
758
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
698
759
|
transform_kwargs = dict(
|
699
760
|
session=dataset._session,
|
700
761
|
dependencies=self._deps,
|
701
|
-
drop_input_cols
|
762
|
+
drop_input_cols=self._drop_input_cols,
|
702
763
|
expected_output_cols_type="float",
|
703
764
|
)
|
765
|
+
expected_output_cols = self._align_expected_output_names(
|
766
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
767
|
+
)
|
704
768
|
elif isinstance(dataset, pd.DataFrame):
|
705
|
-
transform_kwargs = dict(
|
706
|
-
snowpark_input_cols = self._snowpark_cols,
|
707
|
-
drop_input_cols = self._drop_input_cols
|
708
|
-
)
|
769
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
709
770
|
|
710
771
|
transform_handlers = ModelTransformerBuilder.build(
|
711
772
|
dataset=dataset,
|
@@ -718,7 +779,7 @@ class BisectingKMeans(BaseTransformer):
|
|
718
779
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
719
780
|
inference_method=inference_method,
|
720
781
|
input_cols=self.input_cols,
|
721
|
-
expected_output_cols=
|
782
|
+
expected_output_cols=expected_output_cols,
|
722
783
|
**transform_kwargs
|
723
784
|
)
|
724
785
|
return output_df
|
@@ -744,30 +805,34 @@ class BisectingKMeans(BaseTransformer):
|
|
744
805
|
Output dataset with results of the decision function for the samples in input dataset.
|
745
806
|
"""
|
746
807
|
super()._check_dataset_type(dataset)
|
747
|
-
inference_method="decision_function"
|
808
|
+
inference_method = "decision_function"
|
748
809
|
|
749
810
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
750
811
|
# are specific to the type of dataset used.
|
751
812
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
752
813
|
|
814
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
815
|
+
|
753
816
|
if isinstance(dataset, DataFrame):
|
754
817
|
self._deps = self._batch_inference_validate_snowpark(
|
755
818
|
dataset=dataset,
|
756
819
|
inference_method=inference_method,
|
757
820
|
)
|
758
|
-
assert isinstance(
|
821
|
+
assert isinstance(
|
822
|
+
dataset._session, Session
|
823
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
759
824
|
transform_kwargs = dict(
|
760
825
|
session=dataset._session,
|
761
826
|
dependencies=self._deps,
|
762
|
-
drop_input_cols
|
827
|
+
drop_input_cols=self._drop_input_cols,
|
763
828
|
expected_output_cols_type="float",
|
764
829
|
)
|
830
|
+
expected_output_cols = self._align_expected_output_names(
|
831
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
832
|
+
)
|
765
833
|
|
766
834
|
elif isinstance(dataset, pd.DataFrame):
|
767
|
-
transform_kwargs = dict(
|
768
|
-
snowpark_input_cols = self._snowpark_cols,
|
769
|
-
drop_input_cols = self._drop_input_cols
|
770
|
-
)
|
835
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
771
836
|
|
772
837
|
transform_handlers = ModelTransformerBuilder.build(
|
773
838
|
dataset=dataset,
|
@@ -780,7 +845,7 @@ class BisectingKMeans(BaseTransformer):
|
|
780
845
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
781
846
|
inference_method=inference_method,
|
782
847
|
input_cols=self.input_cols,
|
783
|
-
expected_output_cols=
|
848
|
+
expected_output_cols=expected_output_cols,
|
784
849
|
**transform_kwargs
|
785
850
|
)
|
786
851
|
return output_df
|
@@ -809,12 +874,14 @@ class BisectingKMeans(BaseTransformer):
|
|
809
874
|
Output dataset with probability of the sample for each class in the model.
|
810
875
|
"""
|
811
876
|
super()._check_dataset_type(dataset)
|
812
|
-
inference_method="score_samples"
|
877
|
+
inference_method = "score_samples"
|
813
878
|
|
814
879
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
815
880
|
# are specific to the type of dataset used.
|
816
881
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
817
882
|
|
883
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
884
|
+
|
818
885
|
if isinstance(dataset, DataFrame):
|
819
886
|
self._deps = self._batch_inference_validate_snowpark(
|
820
887
|
dataset=dataset,
|
@@ -827,6 +894,9 @@ class BisectingKMeans(BaseTransformer):
|
|
827
894
|
drop_input_cols = self._drop_input_cols,
|
828
895
|
expected_output_cols_type="float",
|
829
896
|
)
|
897
|
+
expected_output_cols = self._align_expected_output_names(
|
898
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
899
|
+
)
|
830
900
|
|
831
901
|
elif isinstance(dataset, pd.DataFrame):
|
832
902
|
transform_kwargs = dict(
|
@@ -845,7 +915,7 @@ class BisectingKMeans(BaseTransformer):
|
|
845
915
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
846
916
|
inference_method=inference_method,
|
847
917
|
input_cols=self.input_cols,
|
848
|
-
expected_output_cols=
|
918
|
+
expected_output_cols=expected_output_cols,
|
849
919
|
**transform_kwargs
|
850
920
|
)
|
851
921
|
return output_df
|
@@ -992,50 +1062,84 @@ class BisectingKMeans(BaseTransformer):
|
|
992
1062
|
)
|
993
1063
|
return output_df
|
994
1064
|
|
1065
|
+
|
1066
|
+
|
1067
|
+
def to_sklearn(self) -> Any:
|
1068
|
+
"""Get sklearn.cluster.BisectingKMeans object.
|
1069
|
+
"""
|
1070
|
+
if self._sklearn_object is None:
|
1071
|
+
self._sklearn_object = self._create_sklearn_object()
|
1072
|
+
return self._sklearn_object
|
1073
|
+
|
1074
|
+
def to_xgboost(self) -> Any:
|
1075
|
+
raise exceptions.SnowflakeMLException(
|
1076
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1077
|
+
original_exception=AttributeError(
|
1078
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1079
|
+
"to_xgboost()",
|
1080
|
+
"to_sklearn()"
|
1081
|
+
)
|
1082
|
+
),
|
1083
|
+
)
|
1084
|
+
|
1085
|
+
def to_lightgbm(self) -> Any:
|
1086
|
+
raise exceptions.SnowflakeMLException(
|
1087
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1088
|
+
original_exception=AttributeError(
|
1089
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1090
|
+
"to_lightgbm()",
|
1091
|
+
"to_sklearn()"
|
1092
|
+
)
|
1093
|
+
),
|
1094
|
+
)
|
995
1095
|
|
996
|
-
def
|
1096
|
+
def _get_dependencies(self) -> List[str]:
|
1097
|
+
return self._deps
|
1098
|
+
|
1099
|
+
|
1100
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
997
1101
|
self._model_signature_dict = dict()
|
998
1102
|
|
999
1103
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1000
1104
|
|
1001
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1105
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1002
1106
|
outputs: List[BaseFeatureSpec] = []
|
1003
1107
|
if hasattr(self, "predict"):
|
1004
1108
|
# keep mypy happy
|
1005
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1109
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1006
1110
|
# For classifier, the type of predict is the same as the type of label
|
1007
|
-
if self._sklearn_object._estimator_type ==
|
1008
|
-
|
1111
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1112
|
+
# label columns is the desired type for output
|
1009
1113
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1010
1114
|
# rename the output columns
|
1011
1115
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1012
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1013
|
-
|
1014
|
-
|
1116
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1117
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1118
|
+
)
|
1015
1119
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1016
1120
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1017
|
-
# Clusterer returns int64 cluster labels.
|
1121
|
+
# Clusterer returns int64 cluster labels.
|
1018
1122
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1019
1123
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1020
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1021
|
-
|
1022
|
-
|
1023
|
-
|
1124
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1125
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1126
|
+
)
|
1127
|
+
|
1024
1128
|
# For regressor, the type of predict is float64
|
1025
|
-
elif self._sklearn_object._estimator_type ==
|
1129
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1026
1130
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1027
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1131
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1132
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1133
|
+
)
|
1134
|
+
|
1031
1135
|
for prob_func in PROB_FUNCTIONS:
|
1032
1136
|
if hasattr(self, prob_func):
|
1033
1137
|
output_cols_prefix: str = f"{prob_func}_"
|
1034
1138
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1035
1139
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1036
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1037
|
-
|
1038
|
-
|
1140
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1141
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1142
|
+
)
|
1039
1143
|
|
1040
1144
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1041
1145
|
items = list(self._model_signature_dict.items())
|
@@ -1048,10 +1152,10 @@ class BisectingKMeans(BaseTransformer):
|
|
1048
1152
|
"""Returns model signature of current class.
|
1049
1153
|
|
1050
1154
|
Raises:
|
1051
|
-
|
1155
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1052
1156
|
|
1053
1157
|
Returns:
|
1054
|
-
Dict
|
1158
|
+
Dict with each method and its input output signature
|
1055
1159
|
"""
|
1056
1160
|
if self._model_signature_dict is None:
|
1057
1161
|
raise exceptions.SnowflakeMLException(
|
@@ -1059,35 +1163,3 @@ class BisectingKMeans(BaseTransformer):
|
|
1059
1163
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1060
1164
|
)
|
1061
1165
|
return self._model_signature_dict
|
1062
|
-
|
1063
|
-
def to_sklearn(self) -> Any:
|
1064
|
-
"""Get sklearn.cluster.BisectingKMeans object.
|
1065
|
-
"""
|
1066
|
-
if self._sklearn_object is None:
|
1067
|
-
self._sklearn_object = self._create_sklearn_object()
|
1068
|
-
return self._sklearn_object
|
1069
|
-
|
1070
|
-
def to_xgboost(self) -> Any:
|
1071
|
-
raise exceptions.SnowflakeMLException(
|
1072
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1073
|
-
original_exception=AttributeError(
|
1074
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1075
|
-
"to_xgboost()",
|
1076
|
-
"to_sklearn()"
|
1077
|
-
)
|
1078
|
-
),
|
1079
|
-
)
|
1080
|
-
|
1081
|
-
def to_lightgbm(self) -> Any:
|
1082
|
-
raise exceptions.SnowflakeMLException(
|
1083
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1084
|
-
original_exception=AttributeError(
|
1085
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1086
|
-
"to_lightgbm()",
|
1087
|
-
"to_sklearn()"
|
1088
|
-
)
|
1089
|
-
),
|
1090
|
-
)
|
1091
|
-
|
1092
|
-
def _get_dependencies(self) -> List[str]:
|
1093
|
-
return self._deps
|