snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -267,12 +266,7 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
267
266
|
)
|
268
267
|
return selected_cols
|
269
268
|
|
270
|
-
|
271
|
-
project=_PROJECT,
|
272
|
-
subproject=_SUBPROJECT,
|
273
|
-
custom_tags=dict([("autogen", True)]),
|
274
|
-
)
|
275
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "CalibratedClassifierCV":
|
269
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "CalibratedClassifierCV":
|
276
270
|
"""Fit the calibrated model
|
277
271
|
For more details on this function, see [sklearn.calibration.CalibratedClassifierCV.fit]
|
278
272
|
(https://scikit-learn.org/stable/modules/generated/sklearn.calibration.CalibratedClassifierCV.html#sklearn.calibration.CalibratedClassifierCV.fit)
|
@@ -299,12 +293,14 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
299
293
|
|
300
294
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
301
295
|
|
302
|
-
|
296
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
303
297
|
if SNOWML_SPROC_ENV in os.environ:
|
304
298
|
statement_params = telemetry.get_function_usage_statement_params(
|
305
299
|
project=_PROJECT,
|
306
300
|
subproject=_SUBPROJECT,
|
307
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
301
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
302
|
+
inspect.currentframe(), CalibratedClassifierCV.__class__.__name__
|
303
|
+
),
|
308
304
|
api_calls=[Session.call],
|
309
305
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
310
306
|
)
|
@@ -325,7 +321,7 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
325
321
|
)
|
326
322
|
self._sklearn_object = model_trainer.train()
|
327
323
|
self._is_fitted = True
|
328
|
-
self.
|
324
|
+
self._generate_model_signatures(dataset)
|
329
325
|
return self
|
330
326
|
|
331
327
|
def _batch_inference_validate_snowpark(
|
@@ -401,7 +397,9 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
401
397
|
# when it is classifier, infer the datatype from label columns
|
402
398
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
403
399
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
404
|
-
label_cols_signatures = [
|
400
|
+
label_cols_signatures = [
|
401
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
402
|
+
]
|
405
403
|
if len(label_cols_signatures) == 0:
|
406
404
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
407
405
|
raise exceptions.SnowflakeMLException(
|
@@ -409,25 +407,22 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
409
407
|
original_exception=ValueError(error_str),
|
410
408
|
)
|
411
409
|
|
412
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
413
|
-
label_cols_signatures[0].as_snowpark_type()
|
414
|
-
)
|
410
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
415
411
|
|
416
412
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
417
|
-
assert isinstance(
|
413
|
+
assert isinstance(
|
414
|
+
dataset._session, Session
|
415
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
418
416
|
|
419
417
|
transform_kwargs = dict(
|
420
|
-
session
|
421
|
-
dependencies
|
422
|
-
drop_input_cols
|
423
|
-
expected_output_cols_type
|
418
|
+
session=dataset._session,
|
419
|
+
dependencies=self._deps,
|
420
|
+
drop_input_cols=self._drop_input_cols,
|
421
|
+
expected_output_cols_type=expected_type_inferred,
|
424
422
|
)
|
425
423
|
|
426
424
|
elif isinstance(dataset, pd.DataFrame):
|
427
|
-
transform_kwargs = dict(
|
428
|
-
snowpark_input_cols = self._snowpark_cols,
|
429
|
-
drop_input_cols = self._drop_input_cols
|
430
|
-
)
|
425
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
431
426
|
|
432
427
|
transform_handlers = ModelTransformerBuilder.build(
|
433
428
|
dataset=dataset,
|
@@ -467,7 +462,7 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
467
462
|
Transformed dataset.
|
468
463
|
"""
|
469
464
|
super()._check_dataset_type(dataset)
|
470
|
-
inference_method="transform"
|
465
|
+
inference_method = "transform"
|
471
466
|
|
472
467
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
473
468
|
# are specific to the type of dataset used.
|
@@ -504,17 +499,14 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
504
499
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
505
500
|
|
506
501
|
transform_kwargs = dict(
|
507
|
-
session
|
508
|
-
dependencies
|
509
|
-
drop_input_cols
|
510
|
-
expected_output_cols_type
|
502
|
+
session=dataset._session,
|
503
|
+
dependencies=self._deps,
|
504
|
+
drop_input_cols=self._drop_input_cols,
|
505
|
+
expected_output_cols_type=expected_dtype,
|
511
506
|
)
|
512
507
|
|
513
508
|
elif isinstance(dataset, pd.DataFrame):
|
514
|
-
transform_kwargs = dict(
|
515
|
-
snowpark_input_cols = self._snowpark_cols,
|
516
|
-
drop_input_cols = self._drop_input_cols
|
517
|
-
)
|
509
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
518
510
|
|
519
511
|
transform_handlers = ModelTransformerBuilder.build(
|
520
512
|
dataset=dataset,
|
@@ -533,7 +525,11 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
533
525
|
return output_df
|
534
526
|
|
535
527
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
536
|
-
def fit_predict(
|
528
|
+
def fit_predict(
|
529
|
+
self,
|
530
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
531
|
+
output_cols_prefix: str = "fit_predict_",
|
532
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
537
533
|
""" Method not supported for this class.
|
538
534
|
|
539
535
|
|
@@ -558,7 +554,9 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
558
554
|
)
|
559
555
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
560
556
|
drop_input_cols=self._drop_input_cols,
|
561
|
-
expected_output_cols_list=
|
557
|
+
expected_output_cols_list=(
|
558
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
559
|
+
),
|
562
560
|
)
|
563
561
|
self._sklearn_object = fitted_estimator
|
564
562
|
self._is_fitted = True
|
@@ -575,6 +573,62 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
575
573
|
assert self._sklearn_object is not None
|
576
574
|
return self._sklearn_object.embedding_
|
577
575
|
|
576
|
+
|
577
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
578
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
579
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
580
|
+
"""
|
581
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
582
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
583
|
+
if output_cols:
|
584
|
+
output_cols = [
|
585
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
586
|
+
for c in output_cols
|
587
|
+
]
|
588
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
589
|
+
output_cols = [output_cols_prefix]
|
590
|
+
elif self._sklearn_object is not None:
|
591
|
+
classes = self._sklearn_object.classes_
|
592
|
+
if isinstance(classes, numpy.ndarray):
|
593
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
594
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
595
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
596
|
+
output_cols = []
|
597
|
+
for i, cl in enumerate(classes):
|
598
|
+
# For binary classification, there is only one output column for each class
|
599
|
+
# ndarray as the two classes are complementary.
|
600
|
+
if len(cl) == 2:
|
601
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
602
|
+
else:
|
603
|
+
output_cols.extend([
|
604
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
605
|
+
])
|
606
|
+
else:
|
607
|
+
output_cols = []
|
608
|
+
|
609
|
+
# Make sure column names are valid snowflake identifiers.
|
610
|
+
assert output_cols is not None # Make MyPy happy
|
611
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
612
|
+
|
613
|
+
return rv
|
614
|
+
|
615
|
+
def _align_expected_output_names(
|
616
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
617
|
+
) -> List[str]:
|
618
|
+
# in case the inferred output column names dimension is different
|
619
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
620
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
621
|
+
output_df_columns = list(output_df_pd.columns)
|
622
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
623
|
+
if self.sample_weight_col:
|
624
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
625
|
+
# if the dimension of inferred output column names is correct; use it
|
626
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
627
|
+
return expected_output_cols_list
|
628
|
+
# otherwise, use the sklearn estimator's output
|
629
|
+
else:
|
630
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
631
|
+
|
578
632
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
579
633
|
@telemetry.send_api_usage_telemetry(
|
580
634
|
project=_PROJECT,
|
@@ -607,24 +661,28 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
607
661
|
# are specific to the type of dataset used.
|
608
662
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
609
663
|
|
664
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
665
|
+
|
610
666
|
if isinstance(dataset, DataFrame):
|
611
667
|
self._deps = self._batch_inference_validate_snowpark(
|
612
668
|
dataset=dataset,
|
613
669
|
inference_method=inference_method,
|
614
670
|
)
|
615
|
-
assert isinstance(
|
671
|
+
assert isinstance(
|
672
|
+
dataset._session, Session
|
673
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
616
674
|
transform_kwargs = dict(
|
617
675
|
session=dataset._session,
|
618
676
|
dependencies=self._deps,
|
619
|
-
drop_input_cols
|
677
|
+
drop_input_cols=self._drop_input_cols,
|
620
678
|
expected_output_cols_type="float",
|
621
679
|
)
|
680
|
+
expected_output_cols = self._align_expected_output_names(
|
681
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
682
|
+
)
|
622
683
|
|
623
684
|
elif isinstance(dataset, pd.DataFrame):
|
624
|
-
transform_kwargs = dict(
|
625
|
-
snowpark_input_cols = self._snowpark_cols,
|
626
|
-
drop_input_cols = self._drop_input_cols
|
627
|
-
)
|
685
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
628
686
|
|
629
687
|
transform_handlers = ModelTransformerBuilder.build(
|
630
688
|
dataset=dataset,
|
@@ -636,7 +694,7 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
636
694
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
637
695
|
inference_method=inference_method,
|
638
696
|
input_cols=self.input_cols,
|
639
|
-
expected_output_cols=
|
697
|
+
expected_output_cols=expected_output_cols,
|
640
698
|
**transform_kwargs
|
641
699
|
)
|
642
700
|
return output_df
|
@@ -668,7 +726,8 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
668
726
|
Output dataset with log probability of the sample for each class in the model.
|
669
727
|
"""
|
670
728
|
super()._check_dataset_type(dataset)
|
671
|
-
inference_method="predict_log_proba"
|
729
|
+
inference_method = "predict_log_proba"
|
730
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
672
731
|
|
673
732
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
674
733
|
# are specific to the type of dataset used.
|
@@ -679,18 +738,20 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
679
738
|
dataset=dataset,
|
680
739
|
inference_method=inference_method,
|
681
740
|
)
|
682
|
-
assert isinstance(
|
741
|
+
assert isinstance(
|
742
|
+
dataset._session, Session
|
743
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
683
744
|
transform_kwargs = dict(
|
684
745
|
session=dataset._session,
|
685
746
|
dependencies=self._deps,
|
686
|
-
drop_input_cols
|
747
|
+
drop_input_cols=self._drop_input_cols,
|
687
748
|
expected_output_cols_type="float",
|
688
749
|
)
|
750
|
+
expected_output_cols = self._align_expected_output_names(
|
751
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
752
|
+
)
|
689
753
|
elif isinstance(dataset, pd.DataFrame):
|
690
|
-
transform_kwargs = dict(
|
691
|
-
snowpark_input_cols = self._snowpark_cols,
|
692
|
-
drop_input_cols = self._drop_input_cols
|
693
|
-
)
|
754
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
694
755
|
|
695
756
|
transform_handlers = ModelTransformerBuilder.build(
|
696
757
|
dataset=dataset,
|
@@ -703,7 +764,7 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
703
764
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
704
765
|
inference_method=inference_method,
|
705
766
|
input_cols=self.input_cols,
|
706
|
-
expected_output_cols=
|
767
|
+
expected_output_cols=expected_output_cols,
|
707
768
|
**transform_kwargs
|
708
769
|
)
|
709
770
|
return output_df
|
@@ -729,30 +790,34 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
729
790
|
Output dataset with results of the decision function for the samples in input dataset.
|
730
791
|
"""
|
731
792
|
super()._check_dataset_type(dataset)
|
732
|
-
inference_method="decision_function"
|
793
|
+
inference_method = "decision_function"
|
733
794
|
|
734
795
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
735
796
|
# are specific to the type of dataset used.
|
736
797
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
737
798
|
|
799
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
800
|
+
|
738
801
|
if isinstance(dataset, DataFrame):
|
739
802
|
self._deps = self._batch_inference_validate_snowpark(
|
740
803
|
dataset=dataset,
|
741
804
|
inference_method=inference_method,
|
742
805
|
)
|
743
|
-
assert isinstance(
|
806
|
+
assert isinstance(
|
807
|
+
dataset._session, Session
|
808
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
744
809
|
transform_kwargs = dict(
|
745
810
|
session=dataset._session,
|
746
811
|
dependencies=self._deps,
|
747
|
-
drop_input_cols
|
812
|
+
drop_input_cols=self._drop_input_cols,
|
748
813
|
expected_output_cols_type="float",
|
749
814
|
)
|
815
|
+
expected_output_cols = self._align_expected_output_names(
|
816
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
817
|
+
)
|
750
818
|
|
751
819
|
elif isinstance(dataset, pd.DataFrame):
|
752
|
-
transform_kwargs = dict(
|
753
|
-
snowpark_input_cols = self._snowpark_cols,
|
754
|
-
drop_input_cols = self._drop_input_cols
|
755
|
-
)
|
820
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
756
821
|
|
757
822
|
transform_handlers = ModelTransformerBuilder.build(
|
758
823
|
dataset=dataset,
|
@@ -765,7 +830,7 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
765
830
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
766
831
|
inference_method=inference_method,
|
767
832
|
input_cols=self.input_cols,
|
768
|
-
expected_output_cols=
|
833
|
+
expected_output_cols=expected_output_cols,
|
769
834
|
**transform_kwargs
|
770
835
|
)
|
771
836
|
return output_df
|
@@ -794,12 +859,14 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
794
859
|
Output dataset with probability of the sample for each class in the model.
|
795
860
|
"""
|
796
861
|
super()._check_dataset_type(dataset)
|
797
|
-
inference_method="score_samples"
|
862
|
+
inference_method = "score_samples"
|
798
863
|
|
799
864
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
800
865
|
# are specific to the type of dataset used.
|
801
866
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
802
867
|
|
868
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
869
|
+
|
803
870
|
if isinstance(dataset, DataFrame):
|
804
871
|
self._deps = self._batch_inference_validate_snowpark(
|
805
872
|
dataset=dataset,
|
@@ -812,6 +879,9 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
812
879
|
drop_input_cols = self._drop_input_cols,
|
813
880
|
expected_output_cols_type="float",
|
814
881
|
)
|
882
|
+
expected_output_cols = self._align_expected_output_names(
|
883
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
884
|
+
)
|
815
885
|
|
816
886
|
elif isinstance(dataset, pd.DataFrame):
|
817
887
|
transform_kwargs = dict(
|
@@ -830,7 +900,7 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
830
900
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
831
901
|
inference_method=inference_method,
|
832
902
|
input_cols=self.input_cols,
|
833
|
-
expected_output_cols=
|
903
|
+
expected_output_cols=expected_output_cols,
|
834
904
|
**transform_kwargs
|
835
905
|
)
|
836
906
|
return output_df
|
@@ -977,50 +1047,84 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
977
1047
|
)
|
978
1048
|
return output_df
|
979
1049
|
|
1050
|
+
|
1051
|
+
|
1052
|
+
def to_sklearn(self) -> Any:
|
1053
|
+
"""Get sklearn.calibration.CalibratedClassifierCV object.
|
1054
|
+
"""
|
1055
|
+
if self._sklearn_object is None:
|
1056
|
+
self._sklearn_object = self._create_sklearn_object()
|
1057
|
+
return self._sklearn_object
|
1058
|
+
|
1059
|
+
def to_xgboost(self) -> Any:
|
1060
|
+
raise exceptions.SnowflakeMLException(
|
1061
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1062
|
+
original_exception=AttributeError(
|
1063
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1064
|
+
"to_xgboost()",
|
1065
|
+
"to_sklearn()"
|
1066
|
+
)
|
1067
|
+
),
|
1068
|
+
)
|
1069
|
+
|
1070
|
+
def to_lightgbm(self) -> Any:
|
1071
|
+
raise exceptions.SnowflakeMLException(
|
1072
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1073
|
+
original_exception=AttributeError(
|
1074
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1075
|
+
"to_lightgbm()",
|
1076
|
+
"to_sklearn()"
|
1077
|
+
)
|
1078
|
+
),
|
1079
|
+
)
|
980
1080
|
|
981
|
-
def
|
1081
|
+
def _get_dependencies(self) -> List[str]:
|
1082
|
+
return self._deps
|
1083
|
+
|
1084
|
+
|
1085
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
982
1086
|
self._model_signature_dict = dict()
|
983
1087
|
|
984
1088
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
985
1089
|
|
986
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1090
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
987
1091
|
outputs: List[BaseFeatureSpec] = []
|
988
1092
|
if hasattr(self, "predict"):
|
989
1093
|
# keep mypy happy
|
990
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1094
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
991
1095
|
# For classifier, the type of predict is the same as the type of label
|
992
|
-
if self._sklearn_object._estimator_type ==
|
993
|
-
|
1096
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1097
|
+
# label columns is the desired type for output
|
994
1098
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
995
1099
|
# rename the output columns
|
996
1100
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
997
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
998
|
-
|
999
|
-
|
1101
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1102
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1103
|
+
)
|
1000
1104
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1001
1105
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1002
|
-
# Clusterer returns int64 cluster labels.
|
1106
|
+
# Clusterer returns int64 cluster labels.
|
1003
1107
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1004
1108
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1005
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1006
|
-
|
1007
|
-
|
1008
|
-
|
1109
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1110
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1111
|
+
)
|
1112
|
+
|
1009
1113
|
# For regressor, the type of predict is float64
|
1010
|
-
elif self._sklearn_object._estimator_type ==
|
1114
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1011
1115
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1012
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1013
|
-
|
1014
|
-
|
1015
|
-
|
1116
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1117
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1118
|
+
)
|
1119
|
+
|
1016
1120
|
for prob_func in PROB_FUNCTIONS:
|
1017
1121
|
if hasattr(self, prob_func):
|
1018
1122
|
output_cols_prefix: str = f"{prob_func}_"
|
1019
1123
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1020
1124
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1021
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1022
|
-
|
1023
|
-
|
1125
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1126
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1127
|
+
)
|
1024
1128
|
|
1025
1129
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1026
1130
|
items = list(self._model_signature_dict.items())
|
@@ -1033,10 +1137,10 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
1033
1137
|
"""Returns model signature of current class.
|
1034
1138
|
|
1035
1139
|
Raises:
|
1036
|
-
|
1140
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1037
1141
|
|
1038
1142
|
Returns:
|
1039
|
-
Dict
|
1143
|
+
Dict with each method and its input output signature
|
1040
1144
|
"""
|
1041
1145
|
if self._model_signature_dict is None:
|
1042
1146
|
raise exceptions.SnowflakeMLException(
|
@@ -1044,35 +1148,3 @@ class CalibratedClassifierCV(BaseTransformer):
|
|
1044
1148
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1045
1149
|
)
|
1046
1150
|
return self._model_signature_dict
|
1047
|
-
|
1048
|
-
def to_sklearn(self) -> Any:
|
1049
|
-
"""Get sklearn.calibration.CalibratedClassifierCV object.
|
1050
|
-
"""
|
1051
|
-
if self._sklearn_object is None:
|
1052
|
-
self._sklearn_object = self._create_sklearn_object()
|
1053
|
-
return self._sklearn_object
|
1054
|
-
|
1055
|
-
def to_xgboost(self) -> Any:
|
1056
|
-
raise exceptions.SnowflakeMLException(
|
1057
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1058
|
-
original_exception=AttributeError(
|
1059
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1060
|
-
"to_xgboost()",
|
1061
|
-
"to_sklearn()"
|
1062
|
-
)
|
1063
|
-
),
|
1064
|
-
)
|
1065
|
-
|
1066
|
-
def to_lightgbm(self) -> Any:
|
1067
|
-
raise exceptions.SnowflakeMLException(
|
1068
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1069
|
-
original_exception=AttributeError(
|
1070
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1071
|
-
"to_lightgbm()",
|
1072
|
-
"to_sklearn()"
|
1073
|
-
)
|
1074
|
-
),
|
1075
|
-
)
|
1076
|
-
|
1077
|
-
def _get_dependencies(self) -> List[str]:
|
1078
|
-
return self._deps
|