snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +77 -32
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/dataset/__init__.py +10 -0
- snowflake/ml/dataset/dataset.py +454 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +202 -0
- snowflake/ml/feature_store/feature_store.py +531 -332
- snowflake/ml/feature_store/feature_view.py +40 -23
- snowflake/ml/fileset/embedded_stage_fs.py +146 -0
- snowflake/ml/fileset/sfcfs.py +56 -54
- snowflake/ml/fileset/snowfs.py +159 -0
- snowflake/ml/fileset/stage_fs.py +49 -17
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +27 -0
- snowflake/ml/model/_client/model/model_version_impl.py +137 -50
- snowflake/ml/model/_client/ops/model_ops.py +159 -40
- snowflake/ml/model/_client/sql/model.py +25 -2
- snowflake/ml/model/_client/sql/model_version.py +131 -2
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -5
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
- snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
- snowflake/ml/modeling/cluster/birch.py +248 -175
- snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
- snowflake/ml/modeling/cluster/dbscan.py +246 -175
- snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
- snowflake/ml/modeling/cluster/k_means.py +248 -175
- snowflake/ml/modeling/cluster/mean_shift.py +246 -175
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
- snowflake/ml/modeling/cluster/optics.py +246 -175
- snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
- snowflake/ml/modeling/compose/column_transformer.py +248 -175
- snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
- snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
- snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
- snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
- snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
- snowflake/ml/modeling/covariance/oas.py +246 -175
- snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
- snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
- snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
- snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
- snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/pca.py +248 -175
- snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
- snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
- snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
- snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
- snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
- snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +72 -37
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
- snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
- snowflake/ml/modeling/impute/knn_imputer.py +248 -175
- snowflake/ml/modeling/impute/missing_indicator.py +248 -175
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/lars.py +246 -175
- snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
- snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/perceptron.py +246 -175
- snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ridge.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
- snowflake/ml/modeling/manifold/isomap.py +248 -175
- snowflake/ml/modeling/manifold/mds.py +248 -175
- snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
- snowflake/ml/modeling/manifold/tsne.py +248 -175
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
- snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
- snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
- snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
- snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
- snowflake/ml/modeling/pipeline/pipeline.py +517 -35
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
- snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
- snowflake/ml/modeling/svm/linear_svc.py +246 -175
- snowflake/ml/modeling/svm/linear_svr.py +246 -175
- snowflake/ml/modeling/svm/nu_svc.py +246 -175
- snowflake/ml/modeling/svm/nu_svr.py +246 -175
- snowflake/ml/modeling/svm/svc.py +246 -175
- snowflake/ml/modeling/svm/svr.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
- snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -61,12 +60,6 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.ensemble".replace("sklea
|
|
61
60
|
|
62
61
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
63
62
|
|
64
|
-
def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
|
65
|
-
def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
|
66
|
-
return False and callable(getattr(self._sklearn_object, "fit_transform", None))
|
67
|
-
return check
|
68
|
-
|
69
|
-
|
70
63
|
class GradientBoostingClassifier(BaseTransformer):
|
71
64
|
r"""Gradient Boosting for classification
|
72
65
|
For more details on this class, see [sklearn.ensemble.GradientBoostingClassifier]
|
@@ -391,12 +384,7 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
391
384
|
)
|
392
385
|
return selected_cols
|
393
386
|
|
394
|
-
|
395
|
-
project=_PROJECT,
|
396
|
-
subproject=_SUBPROJECT,
|
397
|
-
custom_tags=dict([("autogen", True)]),
|
398
|
-
)
|
399
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "GradientBoostingClassifier":
|
387
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "GradientBoostingClassifier":
|
400
388
|
"""Fit the gradient boosting model
|
401
389
|
For more details on this function, see [sklearn.ensemble.GradientBoostingClassifier.fit]
|
402
390
|
(https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier.fit)
|
@@ -423,12 +411,14 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
423
411
|
|
424
412
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
425
413
|
|
426
|
-
|
414
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
427
415
|
if SNOWML_SPROC_ENV in os.environ:
|
428
416
|
statement_params = telemetry.get_function_usage_statement_params(
|
429
417
|
project=_PROJECT,
|
430
418
|
subproject=_SUBPROJECT,
|
431
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
419
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
420
|
+
inspect.currentframe(), GradientBoostingClassifier.__class__.__name__
|
421
|
+
),
|
432
422
|
api_calls=[Session.call],
|
433
423
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
434
424
|
)
|
@@ -449,27 +439,24 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
449
439
|
)
|
450
440
|
self._sklearn_object = model_trainer.train()
|
451
441
|
self._is_fitted = True
|
452
|
-
self.
|
442
|
+
self._generate_model_signatures(dataset)
|
453
443
|
return self
|
454
444
|
|
455
445
|
def _batch_inference_validate_snowpark(
|
456
446
|
self,
|
457
447
|
dataset: DataFrame,
|
458
448
|
inference_method: str,
|
459
|
-
) ->
|
460
|
-
"""Util method to run validate that batch inference can be run on a snowpark dataframe
|
461
|
-
return the available package that exists in the snowflake anaconda channel
|
449
|
+
) -> None:
|
450
|
+
"""Util method to run validate that batch inference can be run on a snowpark dataframe.
|
462
451
|
|
463
452
|
Args:
|
464
453
|
dataset: snowpark dataframe
|
465
454
|
inference_method: the inference method such as predict, score...
|
466
|
-
|
455
|
+
|
467
456
|
Raises:
|
468
457
|
SnowflakeMLException: If the estimator is not fitted, raise error
|
469
458
|
SnowflakeMLException: If the session is None, raise error
|
470
459
|
|
471
|
-
Returns:
|
472
|
-
A list of available package that exists in the snowflake anaconda channel
|
473
460
|
"""
|
474
461
|
if not self._is_fitted:
|
475
462
|
raise exceptions.SnowflakeMLException(
|
@@ -487,9 +474,7 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
487
474
|
"Session must not specified for snowpark dataset."
|
488
475
|
),
|
489
476
|
)
|
490
|
-
|
491
|
-
return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
|
492
|
-
pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
|
477
|
+
|
493
478
|
|
494
479
|
@available_if(original_estimator_has_callable("predict")) # type: ignore[misc]
|
495
480
|
@telemetry.send_api_usage_telemetry(
|
@@ -525,7 +510,9 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
525
510
|
# when it is classifier, infer the datatype from label columns
|
526
511
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
527
512
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
528
|
-
label_cols_signatures = [
|
513
|
+
label_cols_signatures = [
|
514
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
515
|
+
]
|
529
516
|
if len(label_cols_signatures) == 0:
|
530
517
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
531
518
|
raise exceptions.SnowflakeMLException(
|
@@ -533,25 +520,23 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
533
520
|
original_exception=ValueError(error_str),
|
534
521
|
)
|
535
522
|
|
536
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
537
|
-
label_cols_signatures[0].as_snowpark_type()
|
538
|
-
)
|
523
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
539
524
|
|
540
|
-
self.
|
541
|
-
|
525
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
526
|
+
self._deps = self._get_dependencies()
|
527
|
+
assert isinstance(
|
528
|
+
dataset._session, Session
|
529
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
542
530
|
|
543
531
|
transform_kwargs = dict(
|
544
|
-
session
|
545
|
-
dependencies
|
546
|
-
drop_input_cols
|
547
|
-
expected_output_cols_type
|
532
|
+
session=dataset._session,
|
533
|
+
dependencies=self._deps,
|
534
|
+
drop_input_cols=self._drop_input_cols,
|
535
|
+
expected_output_cols_type=expected_type_inferred,
|
548
536
|
)
|
549
537
|
|
550
538
|
elif isinstance(dataset, pd.DataFrame):
|
551
|
-
transform_kwargs = dict(
|
552
|
-
snowpark_input_cols = self._snowpark_cols,
|
553
|
-
drop_input_cols = self._drop_input_cols
|
554
|
-
)
|
539
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
555
540
|
|
556
541
|
transform_handlers = ModelTransformerBuilder.build(
|
557
542
|
dataset=dataset,
|
@@ -591,7 +576,7 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
591
576
|
Transformed dataset.
|
592
577
|
"""
|
593
578
|
super()._check_dataset_type(dataset)
|
594
|
-
inference_method="transform"
|
579
|
+
inference_method = "transform"
|
595
580
|
|
596
581
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
597
582
|
# are specific to the type of dataset used.
|
@@ -621,24 +606,19 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
621
606
|
if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
|
622
607
|
expected_dtype = convert_sp_to_sf_type(output_types[0])
|
623
608
|
|
624
|
-
self.
|
625
|
-
|
626
|
-
inference_method=inference_method,
|
627
|
-
)
|
609
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
610
|
+
self._deps = self._get_dependencies()
|
628
611
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
629
612
|
|
630
613
|
transform_kwargs = dict(
|
631
|
-
session
|
632
|
-
dependencies
|
633
|
-
drop_input_cols
|
634
|
-
expected_output_cols_type
|
614
|
+
session=dataset._session,
|
615
|
+
dependencies=self._deps,
|
616
|
+
drop_input_cols=self._drop_input_cols,
|
617
|
+
expected_output_cols_type=expected_dtype,
|
635
618
|
)
|
636
619
|
|
637
620
|
elif isinstance(dataset, pd.DataFrame):
|
638
|
-
transform_kwargs = dict(
|
639
|
-
snowpark_input_cols = self._snowpark_cols,
|
640
|
-
drop_input_cols = self._drop_input_cols
|
641
|
-
)
|
621
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
642
622
|
|
643
623
|
transform_handlers = ModelTransformerBuilder.build(
|
644
624
|
dataset=dataset,
|
@@ -657,7 +637,11 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
657
637
|
return output_df
|
658
638
|
|
659
639
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
660
|
-
def fit_predict(
|
640
|
+
def fit_predict(
|
641
|
+
self,
|
642
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
643
|
+
output_cols_prefix: str = "fit_predict_",
|
644
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
661
645
|
""" Method not supported for this class.
|
662
646
|
|
663
647
|
|
@@ -682,22 +666,104 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
682
666
|
)
|
683
667
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
684
668
|
drop_input_cols=self._drop_input_cols,
|
685
|
-
expected_output_cols_list=
|
669
|
+
expected_output_cols_list=(
|
670
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
671
|
+
),
|
686
672
|
)
|
687
673
|
self._sklearn_object = fitted_estimator
|
688
674
|
self._is_fitted = True
|
689
675
|
return output_result
|
690
676
|
|
677
|
+
|
678
|
+
@available_if(original_estimator_has_callable("fit_transform")) # type: ignore[misc]
|
679
|
+
def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_transform_",) -> Union[DataFrame, pd.DataFrame]:
|
680
|
+
""" Method not supported for this class.
|
681
|
+
|
691
682
|
|
692
|
-
|
693
|
-
|
694
|
-
|
683
|
+
Raises:
|
684
|
+
TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame.
|
685
|
+
|
686
|
+
Args:
|
687
|
+
dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
|
688
|
+
Snowpark or Pandas DataFrame.
|
689
|
+
output_cols_prefix: Prefix for the response columns
|
695
690
|
Returns:
|
696
691
|
Transformed dataset.
|
697
692
|
"""
|
698
|
-
self.
|
699
|
-
|
700
|
-
|
693
|
+
self._infer_input_output_cols(dataset)
|
694
|
+
super()._check_dataset_type(dataset)
|
695
|
+
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
696
|
+
estimator=self._sklearn_object,
|
697
|
+
dataset=dataset,
|
698
|
+
input_cols=self.input_cols,
|
699
|
+
label_cols=self.label_cols,
|
700
|
+
sample_weight_col=self.sample_weight_col,
|
701
|
+
autogenerated=self._autogenerated,
|
702
|
+
subproject=_SUBPROJECT,
|
703
|
+
)
|
704
|
+
output_result, fitted_estimator = model_trainer.train_fit_transform(
|
705
|
+
drop_input_cols=self._drop_input_cols,
|
706
|
+
expected_output_cols_list=self.output_cols,
|
707
|
+
)
|
708
|
+
self._sklearn_object = fitted_estimator
|
709
|
+
self._is_fitted = True
|
710
|
+
return output_result
|
711
|
+
|
712
|
+
|
713
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
714
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
715
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
716
|
+
"""
|
717
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
718
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
719
|
+
if output_cols:
|
720
|
+
output_cols = [
|
721
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
722
|
+
for c in output_cols
|
723
|
+
]
|
724
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
725
|
+
output_cols = [output_cols_prefix]
|
726
|
+
elif self._sklearn_object is not None:
|
727
|
+
classes = self._sklearn_object.classes_
|
728
|
+
if isinstance(classes, numpy.ndarray):
|
729
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
730
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
731
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
732
|
+
output_cols = []
|
733
|
+
for i, cl in enumerate(classes):
|
734
|
+
# For binary classification, there is only one output column for each class
|
735
|
+
# ndarray as the two classes are complementary.
|
736
|
+
if len(cl) == 2:
|
737
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
738
|
+
else:
|
739
|
+
output_cols.extend([
|
740
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
741
|
+
])
|
742
|
+
else:
|
743
|
+
output_cols = []
|
744
|
+
|
745
|
+
# Make sure column names are valid snowflake identifiers.
|
746
|
+
assert output_cols is not None # Make MyPy happy
|
747
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
748
|
+
|
749
|
+
return rv
|
750
|
+
|
751
|
+
def _align_expected_output_names(
|
752
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
753
|
+
) -> List[str]:
|
754
|
+
# in case the inferred output column names dimension is different
|
755
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
756
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
757
|
+
output_df_columns = list(output_df_pd.columns)
|
758
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
759
|
+
if self.sample_weight_col:
|
760
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
761
|
+
# if the dimension of inferred output column names is correct; use it
|
762
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
763
|
+
return expected_output_cols_list
|
764
|
+
# otherwise, use the sklearn estimator's output
|
765
|
+
else:
|
766
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
701
767
|
|
702
768
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
703
769
|
@telemetry.send_api_usage_telemetry(
|
@@ -731,24 +797,26 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
731
797
|
# are specific to the type of dataset used.
|
732
798
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
733
799
|
|
800
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
801
|
+
|
734
802
|
if isinstance(dataset, DataFrame):
|
735
|
-
self.
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
803
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
804
|
+
self._deps = self._get_dependencies()
|
805
|
+
assert isinstance(
|
806
|
+
dataset._session, Session
|
807
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
740
808
|
transform_kwargs = dict(
|
741
809
|
session=dataset._session,
|
742
810
|
dependencies=self._deps,
|
743
|
-
drop_input_cols
|
811
|
+
drop_input_cols=self._drop_input_cols,
|
744
812
|
expected_output_cols_type="float",
|
745
813
|
)
|
814
|
+
expected_output_cols = self._align_expected_output_names(
|
815
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
816
|
+
)
|
746
817
|
|
747
818
|
elif isinstance(dataset, pd.DataFrame):
|
748
|
-
transform_kwargs = dict(
|
749
|
-
snowpark_input_cols = self._snowpark_cols,
|
750
|
-
drop_input_cols = self._drop_input_cols
|
751
|
-
)
|
819
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
752
820
|
|
753
821
|
transform_handlers = ModelTransformerBuilder.build(
|
754
822
|
dataset=dataset,
|
@@ -760,7 +828,7 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
760
828
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
761
829
|
inference_method=inference_method,
|
762
830
|
input_cols=self.input_cols,
|
763
|
-
expected_output_cols=
|
831
|
+
expected_output_cols=expected_output_cols,
|
764
832
|
**transform_kwargs
|
765
833
|
)
|
766
834
|
return output_df
|
@@ -792,29 +860,30 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
792
860
|
Output dataset with log probability of the sample for each class in the model.
|
793
861
|
"""
|
794
862
|
super()._check_dataset_type(dataset)
|
795
|
-
inference_method="predict_log_proba"
|
863
|
+
inference_method = "predict_log_proba"
|
864
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
796
865
|
|
797
866
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
798
867
|
# are specific to the type of dataset used.
|
799
868
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
800
869
|
|
801
870
|
if isinstance(dataset, DataFrame):
|
802
|
-
self.
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
871
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
872
|
+
self._deps = self._get_dependencies()
|
873
|
+
assert isinstance(
|
874
|
+
dataset._session, Session
|
875
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
807
876
|
transform_kwargs = dict(
|
808
877
|
session=dataset._session,
|
809
878
|
dependencies=self._deps,
|
810
|
-
drop_input_cols
|
879
|
+
drop_input_cols=self._drop_input_cols,
|
811
880
|
expected_output_cols_type="float",
|
812
881
|
)
|
882
|
+
expected_output_cols = self._align_expected_output_names(
|
883
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
884
|
+
)
|
813
885
|
elif isinstance(dataset, pd.DataFrame):
|
814
|
-
transform_kwargs = dict(
|
815
|
-
snowpark_input_cols = self._snowpark_cols,
|
816
|
-
drop_input_cols = self._drop_input_cols
|
817
|
-
)
|
886
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
818
887
|
|
819
888
|
transform_handlers = ModelTransformerBuilder.build(
|
820
889
|
dataset=dataset,
|
@@ -827,7 +896,7 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
827
896
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
828
897
|
inference_method=inference_method,
|
829
898
|
input_cols=self.input_cols,
|
830
|
-
expected_output_cols=
|
899
|
+
expected_output_cols=expected_output_cols,
|
831
900
|
**transform_kwargs
|
832
901
|
)
|
833
902
|
return output_df
|
@@ -855,30 +924,32 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
855
924
|
Output dataset with results of the decision function for the samples in input dataset.
|
856
925
|
"""
|
857
926
|
super()._check_dataset_type(dataset)
|
858
|
-
inference_method="decision_function"
|
927
|
+
inference_method = "decision_function"
|
859
928
|
|
860
929
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
861
930
|
# are specific to the type of dataset used.
|
862
931
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
863
932
|
|
933
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
934
|
+
|
864
935
|
if isinstance(dataset, DataFrame):
|
865
|
-
self.
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
|
936
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
937
|
+
self._deps = self._get_dependencies()
|
938
|
+
assert isinstance(
|
939
|
+
dataset._session, Session
|
940
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
870
941
|
transform_kwargs = dict(
|
871
942
|
session=dataset._session,
|
872
943
|
dependencies=self._deps,
|
873
|
-
drop_input_cols
|
944
|
+
drop_input_cols=self._drop_input_cols,
|
874
945
|
expected_output_cols_type="float",
|
875
946
|
)
|
947
|
+
expected_output_cols = self._align_expected_output_names(
|
948
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
949
|
+
)
|
876
950
|
|
877
951
|
elif isinstance(dataset, pd.DataFrame):
|
878
|
-
transform_kwargs = dict(
|
879
|
-
snowpark_input_cols = self._snowpark_cols,
|
880
|
-
drop_input_cols = self._drop_input_cols
|
881
|
-
)
|
952
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
882
953
|
|
883
954
|
transform_handlers = ModelTransformerBuilder.build(
|
884
955
|
dataset=dataset,
|
@@ -891,7 +962,7 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
891
962
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
892
963
|
inference_method=inference_method,
|
893
964
|
input_cols=self.input_cols,
|
894
|
-
expected_output_cols=
|
965
|
+
expected_output_cols=expected_output_cols,
|
895
966
|
**transform_kwargs
|
896
967
|
)
|
897
968
|
return output_df
|
@@ -920,17 +991,17 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
920
991
|
Output dataset with probability of the sample for each class in the model.
|
921
992
|
"""
|
922
993
|
super()._check_dataset_type(dataset)
|
923
|
-
inference_method="score_samples"
|
994
|
+
inference_method = "score_samples"
|
924
995
|
|
925
996
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
926
997
|
# are specific to the type of dataset used.
|
927
998
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
928
999
|
|
1000
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
1001
|
+
|
929
1002
|
if isinstance(dataset, DataFrame):
|
930
|
-
self.
|
931
|
-
|
932
|
-
inference_method=inference_method,
|
933
|
-
)
|
1003
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
1004
|
+
self._deps = self._get_dependencies()
|
934
1005
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
935
1006
|
transform_kwargs = dict(
|
936
1007
|
session=dataset._session,
|
@@ -938,6 +1009,9 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
938
1009
|
drop_input_cols = self._drop_input_cols,
|
939
1010
|
expected_output_cols_type="float",
|
940
1011
|
)
|
1012
|
+
expected_output_cols = self._align_expected_output_names(
|
1013
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
1014
|
+
)
|
941
1015
|
|
942
1016
|
elif isinstance(dataset, pd.DataFrame):
|
943
1017
|
transform_kwargs = dict(
|
@@ -956,7 +1030,7 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
956
1030
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
957
1031
|
inference_method=inference_method,
|
958
1032
|
input_cols=self.input_cols,
|
959
|
-
expected_output_cols=
|
1033
|
+
expected_output_cols=expected_output_cols,
|
960
1034
|
**transform_kwargs
|
961
1035
|
)
|
962
1036
|
return output_df
|
@@ -991,17 +1065,15 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
991
1065
|
transform_kwargs: ScoreKwargsTypedDict = dict()
|
992
1066
|
|
993
1067
|
if isinstance(dataset, DataFrame):
|
994
|
-
self.
|
995
|
-
|
996
|
-
inference_method="score",
|
997
|
-
)
|
1068
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method="score")
|
1069
|
+
self._deps = self._get_dependencies()
|
998
1070
|
selected_cols = self._get_active_columns()
|
999
1071
|
if len(selected_cols) > 0:
|
1000
1072
|
dataset = dataset.select(selected_cols)
|
1001
1073
|
assert isinstance(dataset._session, Session) # keep mypy happy
|
1002
1074
|
transform_kwargs = dict(
|
1003
1075
|
session=dataset._session,
|
1004
|
-
dependencies=
|
1076
|
+
dependencies=self._deps,
|
1005
1077
|
score_sproc_imports=['sklearn'],
|
1006
1078
|
)
|
1007
1079
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -1066,11 +1138,8 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
1066
1138
|
|
1067
1139
|
if isinstance(dataset, DataFrame):
|
1068
1140
|
|
1069
|
-
self.
|
1070
|
-
|
1071
|
-
inference_method=inference_method,
|
1072
|
-
|
1073
|
-
)
|
1141
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
1142
|
+
self._deps = self._get_dependencies()
|
1074
1143
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
1075
1144
|
transform_kwargs = dict(
|
1076
1145
|
session = dataset._session,
|
@@ -1103,50 +1172,84 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
1103
1172
|
)
|
1104
1173
|
return output_df
|
1105
1174
|
|
1175
|
+
|
1176
|
+
|
1177
|
+
def to_sklearn(self) -> Any:
|
1178
|
+
"""Get sklearn.ensemble.GradientBoostingClassifier object.
|
1179
|
+
"""
|
1180
|
+
if self._sklearn_object is None:
|
1181
|
+
self._sklearn_object = self._create_sklearn_object()
|
1182
|
+
return self._sklearn_object
|
1183
|
+
|
1184
|
+
def to_xgboost(self) -> Any:
|
1185
|
+
raise exceptions.SnowflakeMLException(
|
1186
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1187
|
+
original_exception=AttributeError(
|
1188
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1189
|
+
"to_xgboost()",
|
1190
|
+
"to_sklearn()"
|
1191
|
+
)
|
1192
|
+
),
|
1193
|
+
)
|
1194
|
+
|
1195
|
+
def to_lightgbm(self) -> Any:
|
1196
|
+
raise exceptions.SnowflakeMLException(
|
1197
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1198
|
+
original_exception=AttributeError(
|
1199
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1200
|
+
"to_lightgbm()",
|
1201
|
+
"to_sklearn()"
|
1202
|
+
)
|
1203
|
+
),
|
1204
|
+
)
|
1205
|
+
|
1206
|
+
def _get_dependencies(self) -> List[str]:
|
1207
|
+
return self._deps
|
1208
|
+
|
1106
1209
|
|
1107
|
-
def
|
1210
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
1108
1211
|
self._model_signature_dict = dict()
|
1109
1212
|
|
1110
1213
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1111
1214
|
|
1112
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1215
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1113
1216
|
outputs: List[BaseFeatureSpec] = []
|
1114
1217
|
if hasattr(self, "predict"):
|
1115
1218
|
# keep mypy happy
|
1116
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1219
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1117
1220
|
# For classifier, the type of predict is the same as the type of label
|
1118
|
-
if self._sklearn_object._estimator_type ==
|
1119
|
-
|
1221
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1222
|
+
# label columns is the desired type for output
|
1120
1223
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1121
1224
|
# rename the output columns
|
1122
1225
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1123
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1124
|
-
|
1125
|
-
|
1226
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1227
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1228
|
+
)
|
1126
1229
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1127
1230
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1128
|
-
# Clusterer returns int64 cluster labels.
|
1231
|
+
# Clusterer returns int64 cluster labels.
|
1129
1232
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1130
1233
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1131
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1132
|
-
|
1133
|
-
|
1134
|
-
|
1234
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1235
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1236
|
+
)
|
1237
|
+
|
1135
1238
|
# For regressor, the type of predict is float64
|
1136
|
-
elif self._sklearn_object._estimator_type ==
|
1239
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1137
1240
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1138
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1139
|
-
|
1140
|
-
|
1141
|
-
|
1241
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1242
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1243
|
+
)
|
1244
|
+
|
1142
1245
|
for prob_func in PROB_FUNCTIONS:
|
1143
1246
|
if hasattr(self, prob_func):
|
1144
1247
|
output_cols_prefix: str = f"{prob_func}_"
|
1145
1248
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1146
1249
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1147
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1148
|
-
|
1149
|
-
|
1250
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1251
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1252
|
+
)
|
1150
1253
|
|
1151
1254
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1152
1255
|
items = list(self._model_signature_dict.items())
|
@@ -1159,10 +1262,10 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
1159
1262
|
"""Returns model signature of current class.
|
1160
1263
|
|
1161
1264
|
Raises:
|
1162
|
-
|
1265
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1163
1266
|
|
1164
1267
|
Returns:
|
1165
|
-
Dict
|
1268
|
+
Dict with each method and its input output signature
|
1166
1269
|
"""
|
1167
1270
|
if self._model_signature_dict is None:
|
1168
1271
|
raise exceptions.SnowflakeMLException(
|
@@ -1170,35 +1273,3 @@ class GradientBoostingClassifier(BaseTransformer):
|
|
1170
1273
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1171
1274
|
)
|
1172
1275
|
return self._model_signature_dict
|
1173
|
-
|
1174
|
-
def to_sklearn(self) -> Any:
|
1175
|
-
"""Get sklearn.ensemble.GradientBoostingClassifier object.
|
1176
|
-
"""
|
1177
|
-
if self._sklearn_object is None:
|
1178
|
-
self._sklearn_object = self._create_sklearn_object()
|
1179
|
-
return self._sklearn_object
|
1180
|
-
|
1181
|
-
def to_xgboost(self) -> Any:
|
1182
|
-
raise exceptions.SnowflakeMLException(
|
1183
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1184
|
-
original_exception=AttributeError(
|
1185
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1186
|
-
"to_xgboost()",
|
1187
|
-
"to_sklearn()"
|
1188
|
-
)
|
1189
|
-
),
|
1190
|
-
)
|
1191
|
-
|
1192
|
-
def to_lightgbm(self) -> Any:
|
1193
|
-
raise exceptions.SnowflakeMLException(
|
1194
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1195
|
-
original_exception=AttributeError(
|
1196
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1197
|
-
"to_lightgbm()",
|
1198
|
-
"to_sklearn()"
|
1199
|
-
)
|
1200
|
-
),
|
1201
|
-
)
|
1202
|
-
|
1203
|
-
def _get_dependencies(self) -> List[str]:
|
1204
|
-
return self._deps
|