snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -275,12 +274,7 @@ class AgglomerativeClustering(BaseTransformer):
|
|
275
274
|
)
|
276
275
|
return selected_cols
|
277
276
|
|
278
|
-
|
279
|
-
project=_PROJECT,
|
280
|
-
subproject=_SUBPROJECT,
|
281
|
-
custom_tags=dict([("autogen", True)]),
|
282
|
-
)
|
283
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "AgglomerativeClustering":
|
277
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "AgglomerativeClustering":
|
284
278
|
"""Fit the hierarchical clustering from features, or distance matrix
|
285
279
|
For more details on this function, see [sklearn.cluster.AgglomerativeClustering.fit]
|
286
280
|
(https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering.fit)
|
@@ -307,12 +301,14 @@ class AgglomerativeClustering(BaseTransformer):
|
|
307
301
|
|
308
302
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
309
303
|
|
310
|
-
|
304
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
311
305
|
if SNOWML_SPROC_ENV in os.environ:
|
312
306
|
statement_params = telemetry.get_function_usage_statement_params(
|
313
307
|
project=_PROJECT,
|
314
308
|
subproject=_SUBPROJECT,
|
315
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
309
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
310
|
+
inspect.currentframe(), AgglomerativeClustering.__class__.__name__
|
311
|
+
),
|
316
312
|
api_calls=[Session.call],
|
317
313
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
318
314
|
)
|
@@ -333,7 +329,7 @@ class AgglomerativeClustering(BaseTransformer):
|
|
333
329
|
)
|
334
330
|
self._sklearn_object = model_trainer.train()
|
335
331
|
self._is_fitted = True
|
336
|
-
self.
|
332
|
+
self._generate_model_signatures(dataset)
|
337
333
|
return self
|
338
334
|
|
339
335
|
def _batch_inference_validate_snowpark(
|
@@ -407,7 +403,9 @@ class AgglomerativeClustering(BaseTransformer):
|
|
407
403
|
# when it is classifier, infer the datatype from label columns
|
408
404
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
409
405
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
410
|
-
label_cols_signatures = [
|
406
|
+
label_cols_signatures = [
|
407
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
408
|
+
]
|
411
409
|
if len(label_cols_signatures) == 0:
|
412
410
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
413
411
|
raise exceptions.SnowflakeMLException(
|
@@ -415,25 +413,22 @@ class AgglomerativeClustering(BaseTransformer):
|
|
415
413
|
original_exception=ValueError(error_str),
|
416
414
|
)
|
417
415
|
|
418
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
419
|
-
label_cols_signatures[0].as_snowpark_type()
|
420
|
-
)
|
416
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
421
417
|
|
422
418
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
423
|
-
assert isinstance(
|
419
|
+
assert isinstance(
|
420
|
+
dataset._session, Session
|
421
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
424
422
|
|
425
423
|
transform_kwargs = dict(
|
426
|
-
session
|
427
|
-
dependencies
|
428
|
-
drop_input_cols
|
429
|
-
expected_output_cols_type
|
424
|
+
session=dataset._session,
|
425
|
+
dependencies=self._deps,
|
426
|
+
drop_input_cols=self._drop_input_cols,
|
427
|
+
expected_output_cols_type=expected_type_inferred,
|
430
428
|
)
|
431
429
|
|
432
430
|
elif isinstance(dataset, pd.DataFrame):
|
433
|
-
transform_kwargs = dict(
|
434
|
-
snowpark_input_cols = self._snowpark_cols,
|
435
|
-
drop_input_cols = self._drop_input_cols
|
436
|
-
)
|
431
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
437
432
|
|
438
433
|
transform_handlers = ModelTransformerBuilder.build(
|
439
434
|
dataset=dataset,
|
@@ -473,7 +468,7 @@ class AgglomerativeClustering(BaseTransformer):
|
|
473
468
|
Transformed dataset.
|
474
469
|
"""
|
475
470
|
super()._check_dataset_type(dataset)
|
476
|
-
inference_method="transform"
|
471
|
+
inference_method = "transform"
|
477
472
|
|
478
473
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
479
474
|
# are specific to the type of dataset used.
|
@@ -510,17 +505,14 @@ class AgglomerativeClustering(BaseTransformer):
|
|
510
505
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
511
506
|
|
512
507
|
transform_kwargs = dict(
|
513
|
-
session
|
514
|
-
dependencies
|
515
|
-
drop_input_cols
|
516
|
-
expected_output_cols_type
|
508
|
+
session=dataset._session,
|
509
|
+
dependencies=self._deps,
|
510
|
+
drop_input_cols=self._drop_input_cols,
|
511
|
+
expected_output_cols_type=expected_dtype,
|
517
512
|
)
|
518
513
|
|
519
514
|
elif isinstance(dataset, pd.DataFrame):
|
520
|
-
transform_kwargs = dict(
|
521
|
-
snowpark_input_cols = self._snowpark_cols,
|
522
|
-
drop_input_cols = self._drop_input_cols
|
523
|
-
)
|
515
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
524
516
|
|
525
517
|
transform_handlers = ModelTransformerBuilder.build(
|
526
518
|
dataset=dataset,
|
@@ -539,7 +531,11 @@ class AgglomerativeClustering(BaseTransformer):
|
|
539
531
|
return output_df
|
540
532
|
|
541
533
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
542
|
-
def fit_predict(
|
534
|
+
def fit_predict(
|
535
|
+
self,
|
536
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
537
|
+
output_cols_prefix: str = "fit_predict_",
|
538
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
543
539
|
""" Fit and return the result of each sample's clustering assignment
|
544
540
|
For more details on this function, see [sklearn.cluster.AgglomerativeClustering.fit_predict]
|
545
541
|
(https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering.fit_predict)
|
@@ -566,7 +562,9 @@ class AgglomerativeClustering(BaseTransformer):
|
|
566
562
|
)
|
567
563
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
568
564
|
drop_input_cols=self._drop_input_cols,
|
569
|
-
expected_output_cols_list=
|
565
|
+
expected_output_cols_list=(
|
566
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
567
|
+
),
|
570
568
|
)
|
571
569
|
self._sklearn_object = fitted_estimator
|
572
570
|
self._is_fitted = True
|
@@ -583,6 +581,62 @@ class AgglomerativeClustering(BaseTransformer):
|
|
583
581
|
assert self._sklearn_object is not None
|
584
582
|
return self._sklearn_object.embedding_
|
585
583
|
|
584
|
+
|
585
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
586
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
587
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
588
|
+
"""
|
589
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
590
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
591
|
+
if output_cols:
|
592
|
+
output_cols = [
|
593
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
594
|
+
for c in output_cols
|
595
|
+
]
|
596
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
597
|
+
output_cols = [output_cols_prefix]
|
598
|
+
elif self._sklearn_object is not None:
|
599
|
+
classes = self._sklearn_object.classes_
|
600
|
+
if isinstance(classes, numpy.ndarray):
|
601
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
602
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
603
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
604
|
+
output_cols = []
|
605
|
+
for i, cl in enumerate(classes):
|
606
|
+
# For binary classification, there is only one output column for each class
|
607
|
+
# ndarray as the two classes are complementary.
|
608
|
+
if len(cl) == 2:
|
609
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
610
|
+
else:
|
611
|
+
output_cols.extend([
|
612
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
613
|
+
])
|
614
|
+
else:
|
615
|
+
output_cols = []
|
616
|
+
|
617
|
+
# Make sure column names are valid snowflake identifiers.
|
618
|
+
assert output_cols is not None # Make MyPy happy
|
619
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
620
|
+
|
621
|
+
return rv
|
622
|
+
|
623
|
+
def _align_expected_output_names(
|
624
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
625
|
+
) -> List[str]:
|
626
|
+
# in case the inferred output column names dimension is different
|
627
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
628
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
629
|
+
output_df_columns = list(output_df_pd.columns)
|
630
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
631
|
+
if self.sample_weight_col:
|
632
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
633
|
+
# if the dimension of inferred output column names is correct; use it
|
634
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
635
|
+
return expected_output_cols_list
|
636
|
+
# otherwise, use the sklearn estimator's output
|
637
|
+
else:
|
638
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
639
|
+
|
586
640
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
587
641
|
@telemetry.send_api_usage_telemetry(
|
588
642
|
project=_PROJECT,
|
@@ -613,24 +667,28 @@ class AgglomerativeClustering(BaseTransformer):
|
|
613
667
|
# are specific to the type of dataset used.
|
614
668
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
615
669
|
|
670
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
671
|
+
|
616
672
|
if isinstance(dataset, DataFrame):
|
617
673
|
self._deps = self._batch_inference_validate_snowpark(
|
618
674
|
dataset=dataset,
|
619
675
|
inference_method=inference_method,
|
620
676
|
)
|
621
|
-
assert isinstance(
|
677
|
+
assert isinstance(
|
678
|
+
dataset._session, Session
|
679
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
622
680
|
transform_kwargs = dict(
|
623
681
|
session=dataset._session,
|
624
682
|
dependencies=self._deps,
|
625
|
-
drop_input_cols
|
683
|
+
drop_input_cols=self._drop_input_cols,
|
626
684
|
expected_output_cols_type="float",
|
627
685
|
)
|
686
|
+
expected_output_cols = self._align_expected_output_names(
|
687
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
688
|
+
)
|
628
689
|
|
629
690
|
elif isinstance(dataset, pd.DataFrame):
|
630
|
-
transform_kwargs = dict(
|
631
|
-
snowpark_input_cols = self._snowpark_cols,
|
632
|
-
drop_input_cols = self._drop_input_cols
|
633
|
-
)
|
691
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
634
692
|
|
635
693
|
transform_handlers = ModelTransformerBuilder.build(
|
636
694
|
dataset=dataset,
|
@@ -642,7 +700,7 @@ class AgglomerativeClustering(BaseTransformer):
|
|
642
700
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
643
701
|
inference_method=inference_method,
|
644
702
|
input_cols=self.input_cols,
|
645
|
-
expected_output_cols=
|
703
|
+
expected_output_cols=expected_output_cols,
|
646
704
|
**transform_kwargs
|
647
705
|
)
|
648
706
|
return output_df
|
@@ -672,7 +730,8 @@ class AgglomerativeClustering(BaseTransformer):
|
|
672
730
|
Output dataset with log probability of the sample for each class in the model.
|
673
731
|
"""
|
674
732
|
super()._check_dataset_type(dataset)
|
675
|
-
inference_method="predict_log_proba"
|
733
|
+
inference_method = "predict_log_proba"
|
734
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
676
735
|
|
677
736
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
678
737
|
# are specific to the type of dataset used.
|
@@ -683,18 +742,20 @@ class AgglomerativeClustering(BaseTransformer):
|
|
683
742
|
dataset=dataset,
|
684
743
|
inference_method=inference_method,
|
685
744
|
)
|
686
|
-
assert isinstance(
|
745
|
+
assert isinstance(
|
746
|
+
dataset._session, Session
|
747
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
687
748
|
transform_kwargs = dict(
|
688
749
|
session=dataset._session,
|
689
750
|
dependencies=self._deps,
|
690
|
-
drop_input_cols
|
751
|
+
drop_input_cols=self._drop_input_cols,
|
691
752
|
expected_output_cols_type="float",
|
692
753
|
)
|
754
|
+
expected_output_cols = self._align_expected_output_names(
|
755
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
756
|
+
)
|
693
757
|
elif isinstance(dataset, pd.DataFrame):
|
694
|
-
transform_kwargs = dict(
|
695
|
-
snowpark_input_cols = self._snowpark_cols,
|
696
|
-
drop_input_cols = self._drop_input_cols
|
697
|
-
)
|
758
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
698
759
|
|
699
760
|
transform_handlers = ModelTransformerBuilder.build(
|
700
761
|
dataset=dataset,
|
@@ -707,7 +768,7 @@ class AgglomerativeClustering(BaseTransformer):
|
|
707
768
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
708
769
|
inference_method=inference_method,
|
709
770
|
input_cols=self.input_cols,
|
710
|
-
expected_output_cols=
|
771
|
+
expected_output_cols=expected_output_cols,
|
711
772
|
**transform_kwargs
|
712
773
|
)
|
713
774
|
return output_df
|
@@ -733,30 +794,34 @@ class AgglomerativeClustering(BaseTransformer):
|
|
733
794
|
Output dataset with results of the decision function for the samples in input dataset.
|
734
795
|
"""
|
735
796
|
super()._check_dataset_type(dataset)
|
736
|
-
inference_method="decision_function"
|
797
|
+
inference_method = "decision_function"
|
737
798
|
|
738
799
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
739
800
|
# are specific to the type of dataset used.
|
740
801
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
741
802
|
|
803
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
804
|
+
|
742
805
|
if isinstance(dataset, DataFrame):
|
743
806
|
self._deps = self._batch_inference_validate_snowpark(
|
744
807
|
dataset=dataset,
|
745
808
|
inference_method=inference_method,
|
746
809
|
)
|
747
|
-
assert isinstance(
|
810
|
+
assert isinstance(
|
811
|
+
dataset._session, Session
|
812
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
748
813
|
transform_kwargs = dict(
|
749
814
|
session=dataset._session,
|
750
815
|
dependencies=self._deps,
|
751
|
-
drop_input_cols
|
816
|
+
drop_input_cols=self._drop_input_cols,
|
752
817
|
expected_output_cols_type="float",
|
753
818
|
)
|
819
|
+
expected_output_cols = self._align_expected_output_names(
|
820
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
821
|
+
)
|
754
822
|
|
755
823
|
elif isinstance(dataset, pd.DataFrame):
|
756
|
-
transform_kwargs = dict(
|
757
|
-
snowpark_input_cols = self._snowpark_cols,
|
758
|
-
drop_input_cols = self._drop_input_cols
|
759
|
-
)
|
824
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
760
825
|
|
761
826
|
transform_handlers = ModelTransformerBuilder.build(
|
762
827
|
dataset=dataset,
|
@@ -769,7 +834,7 @@ class AgglomerativeClustering(BaseTransformer):
|
|
769
834
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
770
835
|
inference_method=inference_method,
|
771
836
|
input_cols=self.input_cols,
|
772
|
-
expected_output_cols=
|
837
|
+
expected_output_cols=expected_output_cols,
|
773
838
|
**transform_kwargs
|
774
839
|
)
|
775
840
|
return output_df
|
@@ -798,12 +863,14 @@ class AgglomerativeClustering(BaseTransformer):
|
|
798
863
|
Output dataset with probability of the sample for each class in the model.
|
799
864
|
"""
|
800
865
|
super()._check_dataset_type(dataset)
|
801
|
-
inference_method="score_samples"
|
866
|
+
inference_method = "score_samples"
|
802
867
|
|
803
868
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
804
869
|
# are specific to the type of dataset used.
|
805
870
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
806
871
|
|
872
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
873
|
+
|
807
874
|
if isinstance(dataset, DataFrame):
|
808
875
|
self._deps = self._batch_inference_validate_snowpark(
|
809
876
|
dataset=dataset,
|
@@ -816,6 +883,9 @@ class AgglomerativeClustering(BaseTransformer):
|
|
816
883
|
drop_input_cols = self._drop_input_cols,
|
817
884
|
expected_output_cols_type="float",
|
818
885
|
)
|
886
|
+
expected_output_cols = self._align_expected_output_names(
|
887
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
888
|
+
)
|
819
889
|
|
820
890
|
elif isinstance(dataset, pd.DataFrame):
|
821
891
|
transform_kwargs = dict(
|
@@ -834,7 +904,7 @@ class AgglomerativeClustering(BaseTransformer):
|
|
834
904
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
835
905
|
inference_method=inference_method,
|
836
906
|
input_cols=self.input_cols,
|
837
|
-
expected_output_cols=
|
907
|
+
expected_output_cols=expected_output_cols,
|
838
908
|
**transform_kwargs
|
839
909
|
)
|
840
910
|
return output_df
|
@@ -979,50 +1049,84 @@ class AgglomerativeClustering(BaseTransformer):
|
|
979
1049
|
)
|
980
1050
|
return output_df
|
981
1051
|
|
1052
|
+
|
1053
|
+
|
1054
|
+
def to_sklearn(self) -> Any:
|
1055
|
+
"""Get sklearn.cluster.AgglomerativeClustering object.
|
1056
|
+
"""
|
1057
|
+
if self._sklearn_object is None:
|
1058
|
+
self._sklearn_object = self._create_sklearn_object()
|
1059
|
+
return self._sklearn_object
|
1060
|
+
|
1061
|
+
def to_xgboost(self) -> Any:
|
1062
|
+
raise exceptions.SnowflakeMLException(
|
1063
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1064
|
+
original_exception=AttributeError(
|
1065
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1066
|
+
"to_xgboost()",
|
1067
|
+
"to_sklearn()"
|
1068
|
+
)
|
1069
|
+
),
|
1070
|
+
)
|
1071
|
+
|
1072
|
+
def to_lightgbm(self) -> Any:
|
1073
|
+
raise exceptions.SnowflakeMLException(
|
1074
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1075
|
+
original_exception=AttributeError(
|
1076
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1077
|
+
"to_lightgbm()",
|
1078
|
+
"to_sklearn()"
|
1079
|
+
)
|
1080
|
+
),
|
1081
|
+
)
|
982
1082
|
|
983
|
-
def
|
1083
|
+
def _get_dependencies(self) -> List[str]:
|
1084
|
+
return self._deps
|
1085
|
+
|
1086
|
+
|
1087
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
984
1088
|
self._model_signature_dict = dict()
|
985
1089
|
|
986
1090
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
987
1091
|
|
988
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1092
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
989
1093
|
outputs: List[BaseFeatureSpec] = []
|
990
1094
|
if hasattr(self, "predict"):
|
991
1095
|
# keep mypy happy
|
992
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1096
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
993
1097
|
# For classifier, the type of predict is the same as the type of label
|
994
|
-
if self._sklearn_object._estimator_type ==
|
995
|
-
|
1098
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1099
|
+
# label columns is the desired type for output
|
996
1100
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
997
1101
|
# rename the output columns
|
998
1102
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
999
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1000
|
-
|
1001
|
-
|
1103
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1104
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1105
|
+
)
|
1002
1106
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1003
1107
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1004
|
-
# Clusterer returns int64 cluster labels.
|
1108
|
+
# Clusterer returns int64 cluster labels.
|
1005
1109
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1006
1110
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1007
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1008
|
-
|
1009
|
-
|
1010
|
-
|
1111
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1112
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1113
|
+
)
|
1114
|
+
|
1011
1115
|
# For regressor, the type of predict is float64
|
1012
|
-
elif self._sklearn_object._estimator_type ==
|
1116
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1013
1117
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1014
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1015
|
-
|
1016
|
-
|
1017
|
-
|
1118
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1119
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1120
|
+
)
|
1121
|
+
|
1018
1122
|
for prob_func in PROB_FUNCTIONS:
|
1019
1123
|
if hasattr(self, prob_func):
|
1020
1124
|
output_cols_prefix: str = f"{prob_func}_"
|
1021
1125
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1022
1126
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1023
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1024
|
-
|
1025
|
-
|
1127
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1128
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1129
|
+
)
|
1026
1130
|
|
1027
1131
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1028
1132
|
items = list(self._model_signature_dict.items())
|
@@ -1035,10 +1139,10 @@ class AgglomerativeClustering(BaseTransformer):
|
|
1035
1139
|
"""Returns model signature of current class.
|
1036
1140
|
|
1037
1141
|
Raises:
|
1038
|
-
|
1142
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1039
1143
|
|
1040
1144
|
Returns:
|
1041
|
-
Dict
|
1145
|
+
Dict with each method and its input output signature
|
1042
1146
|
"""
|
1043
1147
|
if self._model_signature_dict is None:
|
1044
1148
|
raise exceptions.SnowflakeMLException(
|
@@ -1046,35 +1150,3 @@ class AgglomerativeClustering(BaseTransformer):
|
|
1046
1150
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1047
1151
|
)
|
1048
1152
|
return self._model_signature_dict
|
1049
|
-
|
1050
|
-
def to_sklearn(self) -> Any:
|
1051
|
-
"""Get sklearn.cluster.AgglomerativeClustering object.
|
1052
|
-
"""
|
1053
|
-
if self._sklearn_object is None:
|
1054
|
-
self._sklearn_object = self._create_sklearn_object()
|
1055
|
-
return self._sklearn_object
|
1056
|
-
|
1057
|
-
def to_xgboost(self) -> Any:
|
1058
|
-
raise exceptions.SnowflakeMLException(
|
1059
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1060
|
-
original_exception=AttributeError(
|
1061
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1062
|
-
"to_xgboost()",
|
1063
|
-
"to_sklearn()"
|
1064
|
-
)
|
1065
|
-
),
|
1066
|
-
)
|
1067
|
-
|
1068
|
-
def to_lightgbm(self) -> Any:
|
1069
|
-
raise exceptions.SnowflakeMLException(
|
1070
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1071
|
-
original_exception=AttributeError(
|
1072
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1073
|
-
"to_lightgbm()",
|
1074
|
-
"to_sklearn()"
|
1075
|
-
)
|
1076
|
-
),
|
1077
|
-
)
|
1078
|
-
|
1079
|
-
def _get_dependencies(self) -> List[str]:
|
1080
|
-
return self._deps
|