snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -325,12 +324,7 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
325
324
|
)
|
326
325
|
return selected_cols
|
327
326
|
|
328
|
-
|
329
|
-
project=_PROJECT,
|
330
|
-
subproject=_SUBPROJECT,
|
331
|
-
custom_tags=dict([("autogen", True)]),
|
332
|
-
)
|
333
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "BayesianGaussianMixture":
|
327
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "BayesianGaussianMixture":
|
334
328
|
"""Estimate model parameters with the EM algorithm
|
335
329
|
For more details on this function, see [sklearn.mixture.BayesianGaussianMixture.fit]
|
336
330
|
(https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html#sklearn.mixture.BayesianGaussianMixture.fit)
|
@@ -357,12 +351,14 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
357
351
|
|
358
352
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
359
353
|
|
360
|
-
|
354
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
361
355
|
if SNOWML_SPROC_ENV in os.environ:
|
362
356
|
statement_params = telemetry.get_function_usage_statement_params(
|
363
357
|
project=_PROJECT,
|
364
358
|
subproject=_SUBPROJECT,
|
365
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
359
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
360
|
+
inspect.currentframe(), BayesianGaussianMixture.__class__.__name__
|
361
|
+
),
|
366
362
|
api_calls=[Session.call],
|
367
363
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
368
364
|
)
|
@@ -383,7 +379,7 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
383
379
|
)
|
384
380
|
self._sklearn_object = model_trainer.train()
|
385
381
|
self._is_fitted = True
|
386
|
-
self.
|
382
|
+
self._generate_model_signatures(dataset)
|
387
383
|
return self
|
388
384
|
|
389
385
|
def _batch_inference_validate_snowpark(
|
@@ -459,7 +455,9 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
459
455
|
# when it is classifier, infer the datatype from label columns
|
460
456
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
461
457
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
462
|
-
label_cols_signatures = [
|
458
|
+
label_cols_signatures = [
|
459
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
460
|
+
]
|
463
461
|
if len(label_cols_signatures) == 0:
|
464
462
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
465
463
|
raise exceptions.SnowflakeMLException(
|
@@ -467,25 +465,22 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
467
465
|
original_exception=ValueError(error_str),
|
468
466
|
)
|
469
467
|
|
470
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
471
|
-
label_cols_signatures[0].as_snowpark_type()
|
472
|
-
)
|
468
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
473
469
|
|
474
470
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
475
|
-
assert isinstance(
|
471
|
+
assert isinstance(
|
472
|
+
dataset._session, Session
|
473
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
476
474
|
|
477
475
|
transform_kwargs = dict(
|
478
|
-
session
|
479
|
-
dependencies
|
480
|
-
drop_input_cols
|
481
|
-
expected_output_cols_type
|
476
|
+
session=dataset._session,
|
477
|
+
dependencies=self._deps,
|
478
|
+
drop_input_cols=self._drop_input_cols,
|
479
|
+
expected_output_cols_type=expected_type_inferred,
|
482
480
|
)
|
483
481
|
|
484
482
|
elif isinstance(dataset, pd.DataFrame):
|
485
|
-
transform_kwargs = dict(
|
486
|
-
snowpark_input_cols = self._snowpark_cols,
|
487
|
-
drop_input_cols = self._drop_input_cols
|
488
|
-
)
|
483
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
489
484
|
|
490
485
|
transform_handlers = ModelTransformerBuilder.build(
|
491
486
|
dataset=dataset,
|
@@ -525,7 +520,7 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
525
520
|
Transformed dataset.
|
526
521
|
"""
|
527
522
|
super()._check_dataset_type(dataset)
|
528
|
-
inference_method="transform"
|
523
|
+
inference_method = "transform"
|
529
524
|
|
530
525
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
531
526
|
# are specific to the type of dataset used.
|
@@ -562,17 +557,14 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
562
557
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
563
558
|
|
564
559
|
transform_kwargs = dict(
|
565
|
-
session
|
566
|
-
dependencies
|
567
|
-
drop_input_cols
|
568
|
-
expected_output_cols_type
|
560
|
+
session=dataset._session,
|
561
|
+
dependencies=self._deps,
|
562
|
+
drop_input_cols=self._drop_input_cols,
|
563
|
+
expected_output_cols_type=expected_dtype,
|
569
564
|
)
|
570
565
|
|
571
566
|
elif isinstance(dataset, pd.DataFrame):
|
572
|
-
transform_kwargs = dict(
|
573
|
-
snowpark_input_cols = self._snowpark_cols,
|
574
|
-
drop_input_cols = self._drop_input_cols
|
575
|
-
)
|
567
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
576
568
|
|
577
569
|
transform_handlers = ModelTransformerBuilder.build(
|
578
570
|
dataset=dataset,
|
@@ -591,7 +583,11 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
591
583
|
return output_df
|
592
584
|
|
593
585
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
594
|
-
def fit_predict(
|
586
|
+
def fit_predict(
|
587
|
+
self,
|
588
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
589
|
+
output_cols_prefix: str = "fit_predict_",
|
590
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
595
591
|
""" Estimate model parameters using X and predict the labels for X
|
596
592
|
For more details on this function, see [sklearn.mixture.BayesianGaussianMixture.fit_predict]
|
597
593
|
(https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html#sklearn.mixture.BayesianGaussianMixture.fit_predict)
|
@@ -618,7 +614,9 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
618
614
|
)
|
619
615
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
620
616
|
drop_input_cols=self._drop_input_cols,
|
621
|
-
expected_output_cols_list=
|
617
|
+
expected_output_cols_list=(
|
618
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
619
|
+
),
|
622
620
|
)
|
623
621
|
self._sklearn_object = fitted_estimator
|
624
622
|
self._is_fitted = True
|
@@ -635,6 +633,62 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
635
633
|
assert self._sklearn_object is not None
|
636
634
|
return self._sklearn_object.embedding_
|
637
635
|
|
636
|
+
|
637
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
638
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
639
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
640
|
+
"""
|
641
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
642
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
643
|
+
if output_cols:
|
644
|
+
output_cols = [
|
645
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
646
|
+
for c in output_cols
|
647
|
+
]
|
648
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
649
|
+
output_cols = [output_cols_prefix]
|
650
|
+
elif self._sklearn_object is not None:
|
651
|
+
classes = self._sklearn_object.classes_
|
652
|
+
if isinstance(classes, numpy.ndarray):
|
653
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
654
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
655
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
656
|
+
output_cols = []
|
657
|
+
for i, cl in enumerate(classes):
|
658
|
+
# For binary classification, there is only one output column for each class
|
659
|
+
# ndarray as the two classes are complementary.
|
660
|
+
if len(cl) == 2:
|
661
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
662
|
+
else:
|
663
|
+
output_cols.extend([
|
664
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
665
|
+
])
|
666
|
+
else:
|
667
|
+
output_cols = []
|
668
|
+
|
669
|
+
# Make sure column names are valid snowflake identifiers.
|
670
|
+
assert output_cols is not None # Make MyPy happy
|
671
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
672
|
+
|
673
|
+
return rv
|
674
|
+
|
675
|
+
def _align_expected_output_names(
|
676
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
677
|
+
) -> List[str]:
|
678
|
+
# in case the inferred output column names dimension is different
|
679
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
680
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
681
|
+
output_df_columns = list(output_df_pd.columns)
|
682
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
683
|
+
if self.sample_weight_col:
|
684
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
685
|
+
# if the dimension of inferred output column names is correct; use it
|
686
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
687
|
+
return expected_output_cols_list
|
688
|
+
# otherwise, use the sklearn estimator's output
|
689
|
+
else:
|
690
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
691
|
+
|
638
692
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
639
693
|
@telemetry.send_api_usage_telemetry(
|
640
694
|
project=_PROJECT,
|
@@ -667,24 +721,28 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
667
721
|
# are specific to the type of dataset used.
|
668
722
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
669
723
|
|
724
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
725
|
+
|
670
726
|
if isinstance(dataset, DataFrame):
|
671
727
|
self._deps = self._batch_inference_validate_snowpark(
|
672
728
|
dataset=dataset,
|
673
729
|
inference_method=inference_method,
|
674
730
|
)
|
675
|
-
assert isinstance(
|
731
|
+
assert isinstance(
|
732
|
+
dataset._session, Session
|
733
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
676
734
|
transform_kwargs = dict(
|
677
735
|
session=dataset._session,
|
678
736
|
dependencies=self._deps,
|
679
|
-
drop_input_cols
|
737
|
+
drop_input_cols=self._drop_input_cols,
|
680
738
|
expected_output_cols_type="float",
|
681
739
|
)
|
740
|
+
expected_output_cols = self._align_expected_output_names(
|
741
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
742
|
+
)
|
682
743
|
|
683
744
|
elif isinstance(dataset, pd.DataFrame):
|
684
|
-
transform_kwargs = dict(
|
685
|
-
snowpark_input_cols = self._snowpark_cols,
|
686
|
-
drop_input_cols = self._drop_input_cols
|
687
|
-
)
|
745
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
688
746
|
|
689
747
|
transform_handlers = ModelTransformerBuilder.build(
|
690
748
|
dataset=dataset,
|
@@ -696,7 +754,7 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
696
754
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
697
755
|
inference_method=inference_method,
|
698
756
|
input_cols=self.input_cols,
|
699
|
-
expected_output_cols=
|
757
|
+
expected_output_cols=expected_output_cols,
|
700
758
|
**transform_kwargs
|
701
759
|
)
|
702
760
|
return output_df
|
@@ -728,7 +786,8 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
728
786
|
Output dataset with log probability of the sample for each class in the model.
|
729
787
|
"""
|
730
788
|
super()._check_dataset_type(dataset)
|
731
|
-
inference_method="predict_log_proba"
|
789
|
+
inference_method = "predict_log_proba"
|
790
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
732
791
|
|
733
792
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
734
793
|
# are specific to the type of dataset used.
|
@@ -739,18 +798,20 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
739
798
|
dataset=dataset,
|
740
799
|
inference_method=inference_method,
|
741
800
|
)
|
742
|
-
assert isinstance(
|
801
|
+
assert isinstance(
|
802
|
+
dataset._session, Session
|
803
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
743
804
|
transform_kwargs = dict(
|
744
805
|
session=dataset._session,
|
745
806
|
dependencies=self._deps,
|
746
|
-
drop_input_cols
|
807
|
+
drop_input_cols=self._drop_input_cols,
|
747
808
|
expected_output_cols_type="float",
|
748
809
|
)
|
810
|
+
expected_output_cols = self._align_expected_output_names(
|
811
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
812
|
+
)
|
749
813
|
elif isinstance(dataset, pd.DataFrame):
|
750
|
-
transform_kwargs = dict(
|
751
|
-
snowpark_input_cols = self._snowpark_cols,
|
752
|
-
drop_input_cols = self._drop_input_cols
|
753
|
-
)
|
814
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
754
815
|
|
755
816
|
transform_handlers = ModelTransformerBuilder.build(
|
756
817
|
dataset=dataset,
|
@@ -763,7 +824,7 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
763
824
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
764
825
|
inference_method=inference_method,
|
765
826
|
input_cols=self.input_cols,
|
766
|
-
expected_output_cols=
|
827
|
+
expected_output_cols=expected_output_cols,
|
767
828
|
**transform_kwargs
|
768
829
|
)
|
769
830
|
return output_df
|
@@ -789,30 +850,34 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
789
850
|
Output dataset with results of the decision function for the samples in input dataset.
|
790
851
|
"""
|
791
852
|
super()._check_dataset_type(dataset)
|
792
|
-
inference_method="decision_function"
|
853
|
+
inference_method = "decision_function"
|
793
854
|
|
794
855
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
795
856
|
# are specific to the type of dataset used.
|
796
857
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
797
858
|
|
859
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
860
|
+
|
798
861
|
if isinstance(dataset, DataFrame):
|
799
862
|
self._deps = self._batch_inference_validate_snowpark(
|
800
863
|
dataset=dataset,
|
801
864
|
inference_method=inference_method,
|
802
865
|
)
|
803
|
-
assert isinstance(
|
866
|
+
assert isinstance(
|
867
|
+
dataset._session, Session
|
868
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
804
869
|
transform_kwargs = dict(
|
805
870
|
session=dataset._session,
|
806
871
|
dependencies=self._deps,
|
807
|
-
drop_input_cols
|
872
|
+
drop_input_cols=self._drop_input_cols,
|
808
873
|
expected_output_cols_type="float",
|
809
874
|
)
|
875
|
+
expected_output_cols = self._align_expected_output_names(
|
876
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
877
|
+
)
|
810
878
|
|
811
879
|
elif isinstance(dataset, pd.DataFrame):
|
812
|
-
transform_kwargs = dict(
|
813
|
-
snowpark_input_cols = self._snowpark_cols,
|
814
|
-
drop_input_cols = self._drop_input_cols
|
815
|
-
)
|
880
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
816
881
|
|
817
882
|
transform_handlers = ModelTransformerBuilder.build(
|
818
883
|
dataset=dataset,
|
@@ -825,7 +890,7 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
825
890
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
826
891
|
inference_method=inference_method,
|
827
892
|
input_cols=self.input_cols,
|
828
|
-
expected_output_cols=
|
893
|
+
expected_output_cols=expected_output_cols,
|
829
894
|
**transform_kwargs
|
830
895
|
)
|
831
896
|
return output_df
|
@@ -856,12 +921,14 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
856
921
|
Output dataset with probability of the sample for each class in the model.
|
857
922
|
"""
|
858
923
|
super()._check_dataset_type(dataset)
|
859
|
-
inference_method="score_samples"
|
924
|
+
inference_method = "score_samples"
|
860
925
|
|
861
926
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
862
927
|
# are specific to the type of dataset used.
|
863
928
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
864
929
|
|
930
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
931
|
+
|
865
932
|
if isinstance(dataset, DataFrame):
|
866
933
|
self._deps = self._batch_inference_validate_snowpark(
|
867
934
|
dataset=dataset,
|
@@ -874,6 +941,9 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
874
941
|
drop_input_cols = self._drop_input_cols,
|
875
942
|
expected_output_cols_type="float",
|
876
943
|
)
|
944
|
+
expected_output_cols = self._align_expected_output_names(
|
945
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
946
|
+
)
|
877
947
|
|
878
948
|
elif isinstance(dataset, pd.DataFrame):
|
879
949
|
transform_kwargs = dict(
|
@@ -892,7 +962,7 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
892
962
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
893
963
|
inference_method=inference_method,
|
894
964
|
input_cols=self.input_cols,
|
895
|
-
expected_output_cols=
|
965
|
+
expected_output_cols=expected_output_cols,
|
896
966
|
**transform_kwargs
|
897
967
|
)
|
898
968
|
return output_df
|
@@ -1039,50 +1109,84 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
1039
1109
|
)
|
1040
1110
|
return output_df
|
1041
1111
|
|
1112
|
+
|
1113
|
+
|
1114
|
+
def to_sklearn(self) -> Any:
|
1115
|
+
"""Get sklearn.mixture.BayesianGaussianMixture object.
|
1116
|
+
"""
|
1117
|
+
if self._sklearn_object is None:
|
1118
|
+
self._sklearn_object = self._create_sklearn_object()
|
1119
|
+
return self._sklearn_object
|
1120
|
+
|
1121
|
+
def to_xgboost(self) -> Any:
|
1122
|
+
raise exceptions.SnowflakeMLException(
|
1123
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1124
|
+
original_exception=AttributeError(
|
1125
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1126
|
+
"to_xgboost()",
|
1127
|
+
"to_sklearn()"
|
1128
|
+
)
|
1129
|
+
),
|
1130
|
+
)
|
1131
|
+
|
1132
|
+
def to_lightgbm(self) -> Any:
|
1133
|
+
raise exceptions.SnowflakeMLException(
|
1134
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1135
|
+
original_exception=AttributeError(
|
1136
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1137
|
+
"to_lightgbm()",
|
1138
|
+
"to_sklearn()"
|
1139
|
+
)
|
1140
|
+
),
|
1141
|
+
)
|
1042
1142
|
|
1043
|
-
def
|
1143
|
+
def _get_dependencies(self) -> List[str]:
|
1144
|
+
return self._deps
|
1145
|
+
|
1146
|
+
|
1147
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
1044
1148
|
self._model_signature_dict = dict()
|
1045
1149
|
|
1046
1150
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1047
1151
|
|
1048
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1152
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1049
1153
|
outputs: List[BaseFeatureSpec] = []
|
1050
1154
|
if hasattr(self, "predict"):
|
1051
1155
|
# keep mypy happy
|
1052
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1156
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1053
1157
|
# For classifier, the type of predict is the same as the type of label
|
1054
|
-
if self._sklearn_object._estimator_type ==
|
1055
|
-
|
1158
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1159
|
+
# label columns is the desired type for output
|
1056
1160
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1057
1161
|
# rename the output columns
|
1058
1162
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1059
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1060
|
-
|
1061
|
-
|
1163
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1164
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1165
|
+
)
|
1062
1166
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1063
1167
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1064
|
-
# Clusterer returns int64 cluster labels.
|
1168
|
+
# Clusterer returns int64 cluster labels.
|
1065
1169
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1066
1170
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1067
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1068
|
-
|
1069
|
-
|
1070
|
-
|
1171
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1172
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1173
|
+
)
|
1174
|
+
|
1071
1175
|
# For regressor, the type of predict is float64
|
1072
|
-
elif self._sklearn_object._estimator_type ==
|
1176
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1073
1177
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1074
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1075
|
-
|
1076
|
-
|
1077
|
-
|
1178
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1179
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1180
|
+
)
|
1181
|
+
|
1078
1182
|
for prob_func in PROB_FUNCTIONS:
|
1079
1183
|
if hasattr(self, prob_func):
|
1080
1184
|
output_cols_prefix: str = f"{prob_func}_"
|
1081
1185
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1082
1186
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1083
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1084
|
-
|
1085
|
-
|
1187
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1188
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1189
|
+
)
|
1086
1190
|
|
1087
1191
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1088
1192
|
items = list(self._model_signature_dict.items())
|
@@ -1095,10 +1199,10 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
1095
1199
|
"""Returns model signature of current class.
|
1096
1200
|
|
1097
1201
|
Raises:
|
1098
|
-
|
1202
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1099
1203
|
|
1100
1204
|
Returns:
|
1101
|
-
Dict
|
1205
|
+
Dict with each method and its input output signature
|
1102
1206
|
"""
|
1103
1207
|
if self._model_signature_dict is None:
|
1104
1208
|
raise exceptions.SnowflakeMLException(
|
@@ -1106,35 +1210,3 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
1106
1210
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1107
1211
|
)
|
1108
1212
|
return self._model_signature_dict
|
1109
|
-
|
1110
|
-
def to_sklearn(self) -> Any:
|
1111
|
-
"""Get sklearn.mixture.BayesianGaussianMixture object.
|
1112
|
-
"""
|
1113
|
-
if self._sklearn_object is None:
|
1114
|
-
self._sklearn_object = self._create_sklearn_object()
|
1115
|
-
return self._sklearn_object
|
1116
|
-
|
1117
|
-
def to_xgboost(self) -> Any:
|
1118
|
-
raise exceptions.SnowflakeMLException(
|
1119
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1120
|
-
original_exception=AttributeError(
|
1121
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1122
|
-
"to_xgboost()",
|
1123
|
-
"to_sklearn()"
|
1124
|
-
)
|
1125
|
-
),
|
1126
|
-
)
|
1127
|
-
|
1128
|
-
def to_lightgbm(self) -> Any:
|
1129
|
-
raise exceptions.SnowflakeMLException(
|
1130
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1131
|
-
original_exception=AttributeError(
|
1132
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1133
|
-
"to_lightgbm()",
|
1134
|
-
"to_sklearn()"
|
1135
|
-
)
|
1136
|
-
),
|
1137
|
-
)
|
1138
|
-
|
1139
|
-
def _get_dependencies(self) -> List[str]:
|
1140
|
-
return self._deps
|