snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -282,12 +281,7 @@ class FeatureAgglomeration(BaseTransformer):
|
|
282
281
|
)
|
283
282
|
return selected_cols
|
284
283
|
|
285
|
-
|
286
|
-
project=_PROJECT,
|
287
|
-
subproject=_SUBPROJECT,
|
288
|
-
custom_tags=dict([("autogen", True)]),
|
289
|
-
)
|
290
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "FeatureAgglomeration":
|
284
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "FeatureAgglomeration":
|
291
285
|
"""Fit the hierarchical clustering on the data
|
292
286
|
For more details on this function, see [sklearn.cluster.FeatureAgglomeration.fit]
|
293
287
|
(https://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html#sklearn.cluster.FeatureAgglomeration.fit)
|
@@ -314,12 +308,14 @@ class FeatureAgglomeration(BaseTransformer):
|
|
314
308
|
|
315
309
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
316
310
|
|
317
|
-
|
311
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
318
312
|
if SNOWML_SPROC_ENV in os.environ:
|
319
313
|
statement_params = telemetry.get_function_usage_statement_params(
|
320
314
|
project=_PROJECT,
|
321
315
|
subproject=_SUBPROJECT,
|
322
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
316
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
317
|
+
inspect.currentframe(), FeatureAgglomeration.__class__.__name__
|
318
|
+
),
|
323
319
|
api_calls=[Session.call],
|
324
320
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
325
321
|
)
|
@@ -340,7 +336,7 @@ class FeatureAgglomeration(BaseTransformer):
|
|
340
336
|
)
|
341
337
|
self._sklearn_object = model_trainer.train()
|
342
338
|
self._is_fitted = True
|
343
|
-
self.
|
339
|
+
self._generate_model_signatures(dataset)
|
344
340
|
return self
|
345
341
|
|
346
342
|
def _batch_inference_validate_snowpark(
|
@@ -414,7 +410,9 @@ class FeatureAgglomeration(BaseTransformer):
|
|
414
410
|
# when it is classifier, infer the datatype from label columns
|
415
411
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
416
412
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
417
|
-
label_cols_signatures = [
|
413
|
+
label_cols_signatures = [
|
414
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
415
|
+
]
|
418
416
|
if len(label_cols_signatures) == 0:
|
419
417
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
420
418
|
raise exceptions.SnowflakeMLException(
|
@@ -422,25 +420,22 @@ class FeatureAgglomeration(BaseTransformer):
|
|
422
420
|
original_exception=ValueError(error_str),
|
423
421
|
)
|
424
422
|
|
425
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
426
|
-
label_cols_signatures[0].as_snowpark_type()
|
427
|
-
)
|
423
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
428
424
|
|
429
425
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
430
|
-
assert isinstance(
|
426
|
+
assert isinstance(
|
427
|
+
dataset._session, Session
|
428
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
431
429
|
|
432
430
|
transform_kwargs = dict(
|
433
|
-
session
|
434
|
-
dependencies
|
435
|
-
drop_input_cols
|
436
|
-
expected_output_cols_type
|
431
|
+
session=dataset._session,
|
432
|
+
dependencies=self._deps,
|
433
|
+
drop_input_cols=self._drop_input_cols,
|
434
|
+
expected_output_cols_type=expected_type_inferred,
|
437
435
|
)
|
438
436
|
|
439
437
|
elif isinstance(dataset, pd.DataFrame):
|
440
|
-
transform_kwargs = dict(
|
441
|
-
snowpark_input_cols = self._snowpark_cols,
|
442
|
-
drop_input_cols = self._drop_input_cols
|
443
|
-
)
|
438
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
444
439
|
|
445
440
|
transform_handlers = ModelTransformerBuilder.build(
|
446
441
|
dataset=dataset,
|
@@ -482,7 +477,7 @@ class FeatureAgglomeration(BaseTransformer):
|
|
482
477
|
Transformed dataset.
|
483
478
|
"""
|
484
479
|
super()._check_dataset_type(dataset)
|
485
|
-
inference_method="transform"
|
480
|
+
inference_method = "transform"
|
486
481
|
|
487
482
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
488
483
|
# are specific to the type of dataset used.
|
@@ -519,17 +514,14 @@ class FeatureAgglomeration(BaseTransformer):
|
|
519
514
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
520
515
|
|
521
516
|
transform_kwargs = dict(
|
522
|
-
session
|
523
|
-
dependencies
|
524
|
-
drop_input_cols
|
525
|
-
expected_output_cols_type
|
517
|
+
session=dataset._session,
|
518
|
+
dependencies=self._deps,
|
519
|
+
drop_input_cols=self._drop_input_cols,
|
520
|
+
expected_output_cols_type=expected_dtype,
|
526
521
|
)
|
527
522
|
|
528
523
|
elif isinstance(dataset, pd.DataFrame):
|
529
|
-
transform_kwargs = dict(
|
530
|
-
snowpark_input_cols = self._snowpark_cols,
|
531
|
-
drop_input_cols = self._drop_input_cols
|
532
|
-
)
|
524
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
533
525
|
|
534
526
|
transform_handlers = ModelTransformerBuilder.build(
|
535
527
|
dataset=dataset,
|
@@ -548,7 +540,11 @@ class FeatureAgglomeration(BaseTransformer):
|
|
548
540
|
return output_df
|
549
541
|
|
550
542
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
551
|
-
def fit_predict(
|
543
|
+
def fit_predict(
|
544
|
+
self,
|
545
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
546
|
+
output_cols_prefix: str = "fit_predict_",
|
547
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
552
548
|
""" Fit and return the result of each sample's clustering assignment
|
553
549
|
For more details on this function, see [sklearn.cluster.FeatureAgglomeration.fit_predict]
|
554
550
|
(https://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html#sklearn.cluster.FeatureAgglomeration.fit_predict)
|
@@ -575,7 +571,9 @@ class FeatureAgglomeration(BaseTransformer):
|
|
575
571
|
)
|
576
572
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
577
573
|
drop_input_cols=self._drop_input_cols,
|
578
|
-
expected_output_cols_list=
|
574
|
+
expected_output_cols_list=(
|
575
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
576
|
+
),
|
579
577
|
)
|
580
578
|
self._sklearn_object = fitted_estimator
|
581
579
|
self._is_fitted = True
|
@@ -592,6 +590,62 @@ class FeatureAgglomeration(BaseTransformer):
|
|
592
590
|
assert self._sklearn_object is not None
|
593
591
|
return self._sklearn_object.embedding_
|
594
592
|
|
593
|
+
|
594
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
595
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
596
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
597
|
+
"""
|
598
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
599
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
600
|
+
if output_cols:
|
601
|
+
output_cols = [
|
602
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
603
|
+
for c in output_cols
|
604
|
+
]
|
605
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
606
|
+
output_cols = [output_cols_prefix]
|
607
|
+
elif self._sklearn_object is not None:
|
608
|
+
classes = self._sklearn_object.classes_
|
609
|
+
if isinstance(classes, numpy.ndarray):
|
610
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
611
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
612
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
613
|
+
output_cols = []
|
614
|
+
for i, cl in enumerate(classes):
|
615
|
+
# For binary classification, there is only one output column for each class
|
616
|
+
# ndarray as the two classes are complementary.
|
617
|
+
if len(cl) == 2:
|
618
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
619
|
+
else:
|
620
|
+
output_cols.extend([
|
621
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
622
|
+
])
|
623
|
+
else:
|
624
|
+
output_cols = []
|
625
|
+
|
626
|
+
# Make sure column names are valid snowflake identifiers.
|
627
|
+
assert output_cols is not None # Make MyPy happy
|
628
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
629
|
+
|
630
|
+
return rv
|
631
|
+
|
632
|
+
def _align_expected_output_names(
|
633
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
634
|
+
) -> List[str]:
|
635
|
+
# in case the inferred output column names dimension is different
|
636
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
637
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
638
|
+
output_df_columns = list(output_df_pd.columns)
|
639
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
640
|
+
if self.sample_weight_col:
|
641
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
642
|
+
# if the dimension of inferred output column names is correct; use it
|
643
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
644
|
+
return expected_output_cols_list
|
645
|
+
# otherwise, use the sklearn estimator's output
|
646
|
+
else:
|
647
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
648
|
+
|
595
649
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
596
650
|
@telemetry.send_api_usage_telemetry(
|
597
651
|
project=_PROJECT,
|
@@ -622,24 +676,28 @@ class FeatureAgglomeration(BaseTransformer):
|
|
622
676
|
# are specific to the type of dataset used.
|
623
677
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
624
678
|
|
679
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
680
|
+
|
625
681
|
if isinstance(dataset, DataFrame):
|
626
682
|
self._deps = self._batch_inference_validate_snowpark(
|
627
683
|
dataset=dataset,
|
628
684
|
inference_method=inference_method,
|
629
685
|
)
|
630
|
-
assert isinstance(
|
686
|
+
assert isinstance(
|
687
|
+
dataset._session, Session
|
688
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
631
689
|
transform_kwargs = dict(
|
632
690
|
session=dataset._session,
|
633
691
|
dependencies=self._deps,
|
634
|
-
drop_input_cols
|
692
|
+
drop_input_cols=self._drop_input_cols,
|
635
693
|
expected_output_cols_type="float",
|
636
694
|
)
|
695
|
+
expected_output_cols = self._align_expected_output_names(
|
696
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
697
|
+
)
|
637
698
|
|
638
699
|
elif isinstance(dataset, pd.DataFrame):
|
639
|
-
transform_kwargs = dict(
|
640
|
-
snowpark_input_cols = self._snowpark_cols,
|
641
|
-
drop_input_cols = self._drop_input_cols
|
642
|
-
)
|
700
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
643
701
|
|
644
702
|
transform_handlers = ModelTransformerBuilder.build(
|
645
703
|
dataset=dataset,
|
@@ -651,7 +709,7 @@ class FeatureAgglomeration(BaseTransformer):
|
|
651
709
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
652
710
|
inference_method=inference_method,
|
653
711
|
input_cols=self.input_cols,
|
654
|
-
expected_output_cols=
|
712
|
+
expected_output_cols=expected_output_cols,
|
655
713
|
**transform_kwargs
|
656
714
|
)
|
657
715
|
return output_df
|
@@ -681,7 +739,8 @@ class FeatureAgglomeration(BaseTransformer):
|
|
681
739
|
Output dataset with log probability of the sample for each class in the model.
|
682
740
|
"""
|
683
741
|
super()._check_dataset_type(dataset)
|
684
|
-
inference_method="predict_log_proba"
|
742
|
+
inference_method = "predict_log_proba"
|
743
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
685
744
|
|
686
745
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
687
746
|
# are specific to the type of dataset used.
|
@@ -692,18 +751,20 @@ class FeatureAgglomeration(BaseTransformer):
|
|
692
751
|
dataset=dataset,
|
693
752
|
inference_method=inference_method,
|
694
753
|
)
|
695
|
-
assert isinstance(
|
754
|
+
assert isinstance(
|
755
|
+
dataset._session, Session
|
756
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
696
757
|
transform_kwargs = dict(
|
697
758
|
session=dataset._session,
|
698
759
|
dependencies=self._deps,
|
699
|
-
drop_input_cols
|
760
|
+
drop_input_cols=self._drop_input_cols,
|
700
761
|
expected_output_cols_type="float",
|
701
762
|
)
|
763
|
+
expected_output_cols = self._align_expected_output_names(
|
764
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
765
|
+
)
|
702
766
|
elif isinstance(dataset, pd.DataFrame):
|
703
|
-
transform_kwargs = dict(
|
704
|
-
snowpark_input_cols = self._snowpark_cols,
|
705
|
-
drop_input_cols = self._drop_input_cols
|
706
|
-
)
|
767
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
707
768
|
|
708
769
|
transform_handlers = ModelTransformerBuilder.build(
|
709
770
|
dataset=dataset,
|
@@ -716,7 +777,7 @@ class FeatureAgglomeration(BaseTransformer):
|
|
716
777
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
717
778
|
inference_method=inference_method,
|
718
779
|
input_cols=self.input_cols,
|
719
|
-
expected_output_cols=
|
780
|
+
expected_output_cols=expected_output_cols,
|
720
781
|
**transform_kwargs
|
721
782
|
)
|
722
783
|
return output_df
|
@@ -742,30 +803,34 @@ class FeatureAgglomeration(BaseTransformer):
|
|
742
803
|
Output dataset with results of the decision function for the samples in input dataset.
|
743
804
|
"""
|
744
805
|
super()._check_dataset_type(dataset)
|
745
|
-
inference_method="decision_function"
|
806
|
+
inference_method = "decision_function"
|
746
807
|
|
747
808
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
748
809
|
# are specific to the type of dataset used.
|
749
810
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
750
811
|
|
812
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
813
|
+
|
751
814
|
if isinstance(dataset, DataFrame):
|
752
815
|
self._deps = self._batch_inference_validate_snowpark(
|
753
816
|
dataset=dataset,
|
754
817
|
inference_method=inference_method,
|
755
818
|
)
|
756
|
-
assert isinstance(
|
819
|
+
assert isinstance(
|
820
|
+
dataset._session, Session
|
821
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
757
822
|
transform_kwargs = dict(
|
758
823
|
session=dataset._session,
|
759
824
|
dependencies=self._deps,
|
760
|
-
drop_input_cols
|
825
|
+
drop_input_cols=self._drop_input_cols,
|
761
826
|
expected_output_cols_type="float",
|
762
827
|
)
|
828
|
+
expected_output_cols = self._align_expected_output_names(
|
829
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
830
|
+
)
|
763
831
|
|
764
832
|
elif isinstance(dataset, pd.DataFrame):
|
765
|
-
transform_kwargs = dict(
|
766
|
-
snowpark_input_cols = self._snowpark_cols,
|
767
|
-
drop_input_cols = self._drop_input_cols
|
768
|
-
)
|
833
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
769
834
|
|
770
835
|
transform_handlers = ModelTransformerBuilder.build(
|
771
836
|
dataset=dataset,
|
@@ -778,7 +843,7 @@ class FeatureAgglomeration(BaseTransformer):
|
|
778
843
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
779
844
|
inference_method=inference_method,
|
780
845
|
input_cols=self.input_cols,
|
781
|
-
expected_output_cols=
|
846
|
+
expected_output_cols=expected_output_cols,
|
782
847
|
**transform_kwargs
|
783
848
|
)
|
784
849
|
return output_df
|
@@ -807,12 +872,14 @@ class FeatureAgglomeration(BaseTransformer):
|
|
807
872
|
Output dataset with probability of the sample for each class in the model.
|
808
873
|
"""
|
809
874
|
super()._check_dataset_type(dataset)
|
810
|
-
inference_method="score_samples"
|
875
|
+
inference_method = "score_samples"
|
811
876
|
|
812
877
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
813
878
|
# are specific to the type of dataset used.
|
814
879
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
815
880
|
|
881
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
882
|
+
|
816
883
|
if isinstance(dataset, DataFrame):
|
817
884
|
self._deps = self._batch_inference_validate_snowpark(
|
818
885
|
dataset=dataset,
|
@@ -825,6 +892,9 @@ class FeatureAgglomeration(BaseTransformer):
|
|
825
892
|
drop_input_cols = self._drop_input_cols,
|
826
893
|
expected_output_cols_type="float",
|
827
894
|
)
|
895
|
+
expected_output_cols = self._align_expected_output_names(
|
896
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
897
|
+
)
|
828
898
|
|
829
899
|
elif isinstance(dataset, pd.DataFrame):
|
830
900
|
transform_kwargs = dict(
|
@@ -843,7 +913,7 @@ class FeatureAgglomeration(BaseTransformer):
|
|
843
913
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
844
914
|
inference_method=inference_method,
|
845
915
|
input_cols=self.input_cols,
|
846
|
-
expected_output_cols=
|
916
|
+
expected_output_cols=expected_output_cols,
|
847
917
|
**transform_kwargs
|
848
918
|
)
|
849
919
|
return output_df
|
@@ -988,50 +1058,84 @@ class FeatureAgglomeration(BaseTransformer):
|
|
988
1058
|
)
|
989
1059
|
return output_df
|
990
1060
|
|
1061
|
+
|
1062
|
+
|
1063
|
+
def to_sklearn(self) -> Any:
|
1064
|
+
"""Get sklearn.cluster.FeatureAgglomeration object.
|
1065
|
+
"""
|
1066
|
+
if self._sklearn_object is None:
|
1067
|
+
self._sklearn_object = self._create_sklearn_object()
|
1068
|
+
return self._sklearn_object
|
1069
|
+
|
1070
|
+
def to_xgboost(self) -> Any:
|
1071
|
+
raise exceptions.SnowflakeMLException(
|
1072
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1073
|
+
original_exception=AttributeError(
|
1074
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1075
|
+
"to_xgboost()",
|
1076
|
+
"to_sklearn()"
|
1077
|
+
)
|
1078
|
+
),
|
1079
|
+
)
|
1080
|
+
|
1081
|
+
def to_lightgbm(self) -> Any:
|
1082
|
+
raise exceptions.SnowflakeMLException(
|
1083
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1084
|
+
original_exception=AttributeError(
|
1085
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1086
|
+
"to_lightgbm()",
|
1087
|
+
"to_sklearn()"
|
1088
|
+
)
|
1089
|
+
),
|
1090
|
+
)
|
991
1091
|
|
992
|
-
def
|
1092
|
+
def _get_dependencies(self) -> List[str]:
|
1093
|
+
return self._deps
|
1094
|
+
|
1095
|
+
|
1096
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
993
1097
|
self._model_signature_dict = dict()
|
994
1098
|
|
995
1099
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
996
1100
|
|
997
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1101
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
998
1102
|
outputs: List[BaseFeatureSpec] = []
|
999
1103
|
if hasattr(self, "predict"):
|
1000
1104
|
# keep mypy happy
|
1001
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1105
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1002
1106
|
# For classifier, the type of predict is the same as the type of label
|
1003
|
-
if self._sklearn_object._estimator_type ==
|
1004
|
-
|
1107
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1108
|
+
# label columns is the desired type for output
|
1005
1109
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1006
1110
|
# rename the output columns
|
1007
1111
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1008
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1009
|
-
|
1010
|
-
|
1112
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1113
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1114
|
+
)
|
1011
1115
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1012
1116
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1013
|
-
# Clusterer returns int64 cluster labels.
|
1117
|
+
# Clusterer returns int64 cluster labels.
|
1014
1118
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1015
1119
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1016
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1017
|
-
|
1018
|
-
|
1019
|
-
|
1120
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1121
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1122
|
+
)
|
1123
|
+
|
1020
1124
|
# For regressor, the type of predict is float64
|
1021
|
-
elif self._sklearn_object._estimator_type ==
|
1125
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1022
1126
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1023
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1127
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1128
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1129
|
+
)
|
1130
|
+
|
1027
1131
|
for prob_func in PROB_FUNCTIONS:
|
1028
1132
|
if hasattr(self, prob_func):
|
1029
1133
|
output_cols_prefix: str = f"{prob_func}_"
|
1030
1134
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1031
1135
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1032
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1033
|
-
|
1034
|
-
|
1136
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1137
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1138
|
+
)
|
1035
1139
|
|
1036
1140
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1037
1141
|
items = list(self._model_signature_dict.items())
|
@@ -1044,10 +1148,10 @@ class FeatureAgglomeration(BaseTransformer):
|
|
1044
1148
|
"""Returns model signature of current class.
|
1045
1149
|
|
1046
1150
|
Raises:
|
1047
|
-
|
1151
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1048
1152
|
|
1049
1153
|
Returns:
|
1050
|
-
Dict
|
1154
|
+
Dict with each method and its input output signature
|
1051
1155
|
"""
|
1052
1156
|
if self._model_signature_dict is None:
|
1053
1157
|
raise exceptions.SnowflakeMLException(
|
@@ -1055,35 +1159,3 @@ class FeatureAgglomeration(BaseTransformer):
|
|
1055
1159
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1056
1160
|
)
|
1057
1161
|
return self._model_signature_dict
|
1058
|
-
|
1059
|
-
def to_sklearn(self) -> Any:
|
1060
|
-
"""Get sklearn.cluster.FeatureAgglomeration object.
|
1061
|
-
"""
|
1062
|
-
if self._sklearn_object is None:
|
1063
|
-
self._sklearn_object = self._create_sklearn_object()
|
1064
|
-
return self._sklearn_object
|
1065
|
-
|
1066
|
-
def to_xgboost(self) -> Any:
|
1067
|
-
raise exceptions.SnowflakeMLException(
|
1068
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1069
|
-
original_exception=AttributeError(
|
1070
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1071
|
-
"to_xgboost()",
|
1072
|
-
"to_sklearn()"
|
1073
|
-
)
|
1074
|
-
),
|
1075
|
-
)
|
1076
|
-
|
1077
|
-
def to_lightgbm(self) -> Any:
|
1078
|
-
raise exceptions.SnowflakeMLException(
|
1079
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1080
|
-
original_exception=AttributeError(
|
1081
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1082
|
-
"to_lightgbm()",
|
1083
|
-
"to_sklearn()"
|
1084
|
-
)
|
1085
|
-
),
|
1086
|
-
)
|
1087
|
-
|
1088
|
-
def _get_dependencies(self) -> List[str]:
|
1089
|
-
return self._deps
|