snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +77 -32
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/dataset/__init__.py +10 -0
- snowflake/ml/dataset/dataset.py +454 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +202 -0
- snowflake/ml/feature_store/feature_store.py +531 -332
- snowflake/ml/feature_store/feature_view.py +40 -23
- snowflake/ml/fileset/embedded_stage_fs.py +146 -0
- snowflake/ml/fileset/sfcfs.py +56 -54
- snowflake/ml/fileset/snowfs.py +159 -0
- snowflake/ml/fileset/stage_fs.py +49 -17
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +27 -0
- snowflake/ml/model/_client/model/model_version_impl.py +137 -50
- snowflake/ml/model/_client/ops/model_ops.py +159 -40
- snowflake/ml/model/_client/sql/model.py +25 -2
- snowflake/ml/model/_client/sql/model_version.py +131 -2
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -5
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
- snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
- snowflake/ml/modeling/cluster/birch.py +248 -175
- snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
- snowflake/ml/modeling/cluster/dbscan.py +246 -175
- snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
- snowflake/ml/modeling/cluster/k_means.py +248 -175
- snowflake/ml/modeling/cluster/mean_shift.py +246 -175
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
- snowflake/ml/modeling/cluster/optics.py +246 -175
- snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
- snowflake/ml/modeling/compose/column_transformer.py +248 -175
- snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
- snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
- snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
- snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
- snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
- snowflake/ml/modeling/covariance/oas.py +246 -175
- snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
- snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
- snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
- snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
- snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/pca.py +248 -175
- snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
- snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
- snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
- snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
- snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
- snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +72 -37
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
- snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
- snowflake/ml/modeling/impute/knn_imputer.py +248 -175
- snowflake/ml/modeling/impute/missing_indicator.py +248 -175
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/lars.py +246 -175
- snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
- snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/perceptron.py +246 -175
- snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ridge.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
- snowflake/ml/modeling/manifold/isomap.py +248 -175
- snowflake/ml/modeling/manifold/mds.py +248 -175
- snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
- snowflake/ml/modeling/manifold/tsne.py +248 -175
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
- snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
- snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
- snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
- snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
- snowflake/ml/modeling/pipeline/pipeline.py +517 -35
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
- snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
- snowflake/ml/modeling/svm/linear_svc.py +246 -175
- snowflake/ml/modeling/svm/linear_svr.py +246 -175
- snowflake/ml/modeling/svm/nu_svc.py +246 -175
- snowflake/ml/modeling/svm/nu_svr.py +246 -175
- snowflake/ml/modeling/svm/svc.py +246 -175
- snowflake/ml/modeling/svm/svr.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
- snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
@@ -32,6 +32,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
32
32
|
BatchInferenceKwargsTypedDict,
|
33
33
|
ScoreKwargsTypedDict
|
34
34
|
)
|
35
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
36
|
+
from snowflake.ml.model.model_signature import (
|
37
|
+
BaseFeatureSpec,
|
38
|
+
DataType,
|
39
|
+
FeatureSpec,
|
40
|
+
ModelSignature,
|
41
|
+
_infer_signature,
|
42
|
+
_rename_signature_with_snowflake_identifiers,
|
43
|
+
)
|
35
44
|
|
36
45
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
37
46
|
|
@@ -42,16 +51,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
42
51
|
validate_sklearn_args,
|
43
52
|
)
|
44
53
|
|
45
|
-
from snowflake.ml.model.model_signature import (
|
46
|
-
DataType,
|
47
|
-
FeatureSpec,
|
48
|
-
ModelSignature,
|
49
|
-
_infer_signature,
|
50
|
-
_rename_signature_with_snowflake_identifiers,
|
51
|
-
BaseFeatureSpec,
|
52
|
-
)
|
53
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
54
|
-
|
55
54
|
_PROJECT = "ModelDevelopment"
|
56
55
|
# Derive subproject from module name by removing "sklearn"
|
57
56
|
# and converting module name from underscore to CamelCase
|
@@ -60,12 +59,6 @@ _SUBPROJECT = "".join([s.capitalize() for s in "xgboost".replace("sklearn.", "")
|
|
60
59
|
|
61
60
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
62
61
|
|
63
|
-
def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
|
64
|
-
def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
|
65
|
-
return False and callable(getattr(self._sklearn_object, "fit_transform", None))
|
66
|
-
return check
|
67
|
-
|
68
|
-
|
69
62
|
class XGBClassifier(BaseTransformer):
|
70
63
|
r"""Implementation of the scikit-learn API for XGBoost classification
|
71
64
|
For more details on this class, see [xgboost.XGBClassifier]
|
@@ -422,12 +415,7 @@ class XGBClassifier(BaseTransformer):
|
|
422
415
|
)
|
423
416
|
return selected_cols
|
424
417
|
|
425
|
-
|
426
|
-
project=_PROJECT,
|
427
|
-
subproject=_SUBPROJECT,
|
428
|
-
custom_tags=dict([("autogen", True)]),
|
429
|
-
)
|
430
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "XGBClassifier":
|
418
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "XGBClassifier":
|
431
419
|
"""Fit gradient boosting classifier
|
432
420
|
For more details on this function, see [xgboost.XGBClassifier.fit]
|
433
421
|
(https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier.fit)
|
@@ -454,12 +442,14 @@ class XGBClassifier(BaseTransformer):
|
|
454
442
|
|
455
443
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
456
444
|
|
457
|
-
|
445
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
458
446
|
if SNOWML_SPROC_ENV in os.environ:
|
459
447
|
statement_params = telemetry.get_function_usage_statement_params(
|
460
448
|
project=_PROJECT,
|
461
449
|
subproject=_SUBPROJECT,
|
462
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
450
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
451
|
+
inspect.currentframe(), XGBClassifier.__class__.__name__
|
452
|
+
),
|
463
453
|
api_calls=[Session.call],
|
464
454
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
465
455
|
)
|
@@ -480,27 +470,24 @@ class XGBClassifier(BaseTransformer):
|
|
480
470
|
)
|
481
471
|
self._sklearn_object = model_trainer.train()
|
482
472
|
self._is_fitted = True
|
483
|
-
self.
|
473
|
+
self._generate_model_signatures(dataset)
|
484
474
|
return self
|
485
475
|
|
486
476
|
def _batch_inference_validate_snowpark(
|
487
477
|
self,
|
488
478
|
dataset: DataFrame,
|
489
479
|
inference_method: str,
|
490
|
-
) ->
|
491
|
-
"""Util method to run validate that batch inference can be run on a snowpark dataframe
|
492
|
-
return the available package that exists in the snowflake anaconda channel
|
480
|
+
) -> None:
|
481
|
+
"""Util method to run validate that batch inference can be run on a snowpark dataframe.
|
493
482
|
|
494
483
|
Args:
|
495
484
|
dataset: snowpark dataframe
|
496
485
|
inference_method: the inference method such as predict, score...
|
497
|
-
|
486
|
+
|
498
487
|
Raises:
|
499
488
|
SnowflakeMLException: If the estimator is not fitted, raise error
|
500
489
|
SnowflakeMLException: If the session is None, raise error
|
501
490
|
|
502
|
-
Returns:
|
503
|
-
A list of available package that exists in the snowflake anaconda channel
|
504
491
|
"""
|
505
492
|
if not self._is_fitted:
|
506
493
|
raise exceptions.SnowflakeMLException(
|
@@ -518,9 +505,7 @@ class XGBClassifier(BaseTransformer):
|
|
518
505
|
"Session must not specified for snowpark dataset."
|
519
506
|
),
|
520
507
|
)
|
521
|
-
|
522
|
-
return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
|
523
|
-
pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
|
508
|
+
|
524
509
|
|
525
510
|
@available_if(original_estimator_has_callable("predict")) # type: ignore[misc]
|
526
511
|
@telemetry.send_api_usage_telemetry(
|
@@ -556,7 +541,9 @@ class XGBClassifier(BaseTransformer):
|
|
556
541
|
# when it is classifier, infer the datatype from label columns
|
557
542
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
558
543
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
559
|
-
label_cols_signatures = [
|
544
|
+
label_cols_signatures = [
|
545
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
546
|
+
]
|
560
547
|
if len(label_cols_signatures) == 0:
|
561
548
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
562
549
|
raise exceptions.SnowflakeMLException(
|
@@ -564,25 +551,23 @@ class XGBClassifier(BaseTransformer):
|
|
564
551
|
original_exception=ValueError(error_str),
|
565
552
|
)
|
566
553
|
|
567
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
568
|
-
label_cols_signatures[0].as_snowpark_type()
|
569
|
-
)
|
554
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
570
555
|
|
571
|
-
self.
|
572
|
-
|
556
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
557
|
+
self._deps = self._get_dependencies()
|
558
|
+
assert isinstance(
|
559
|
+
dataset._session, Session
|
560
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
573
561
|
|
574
562
|
transform_kwargs = dict(
|
575
|
-
session
|
576
|
-
dependencies
|
577
|
-
drop_input_cols
|
578
|
-
expected_output_cols_type
|
563
|
+
session=dataset._session,
|
564
|
+
dependencies=self._deps,
|
565
|
+
drop_input_cols=self._drop_input_cols,
|
566
|
+
expected_output_cols_type=expected_type_inferred,
|
579
567
|
)
|
580
568
|
|
581
569
|
elif isinstance(dataset, pd.DataFrame):
|
582
|
-
transform_kwargs = dict(
|
583
|
-
snowpark_input_cols = self._snowpark_cols,
|
584
|
-
drop_input_cols = self._drop_input_cols
|
585
|
-
)
|
570
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
586
571
|
|
587
572
|
transform_handlers = ModelTransformerBuilder.build(
|
588
573
|
dataset=dataset,
|
@@ -622,7 +607,7 @@ class XGBClassifier(BaseTransformer):
|
|
622
607
|
Transformed dataset.
|
623
608
|
"""
|
624
609
|
super()._check_dataset_type(dataset)
|
625
|
-
inference_method="transform"
|
610
|
+
inference_method = "transform"
|
626
611
|
|
627
612
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
628
613
|
# are specific to the type of dataset used.
|
@@ -652,24 +637,19 @@ class XGBClassifier(BaseTransformer):
|
|
652
637
|
if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
|
653
638
|
expected_dtype = convert_sp_to_sf_type(output_types[0])
|
654
639
|
|
655
|
-
self.
|
656
|
-
|
657
|
-
inference_method=inference_method,
|
658
|
-
)
|
640
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
641
|
+
self._deps = self._get_dependencies()
|
659
642
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
660
643
|
|
661
644
|
transform_kwargs = dict(
|
662
|
-
session
|
663
|
-
dependencies
|
664
|
-
drop_input_cols
|
665
|
-
expected_output_cols_type
|
645
|
+
session=dataset._session,
|
646
|
+
dependencies=self._deps,
|
647
|
+
drop_input_cols=self._drop_input_cols,
|
648
|
+
expected_output_cols_type=expected_dtype,
|
666
649
|
)
|
667
650
|
|
668
651
|
elif isinstance(dataset, pd.DataFrame):
|
669
|
-
transform_kwargs = dict(
|
670
|
-
snowpark_input_cols = self._snowpark_cols,
|
671
|
-
drop_input_cols = self._drop_input_cols
|
672
|
-
)
|
652
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
673
653
|
|
674
654
|
transform_handlers = ModelTransformerBuilder.build(
|
675
655
|
dataset=dataset,
|
@@ -688,7 +668,11 @@ class XGBClassifier(BaseTransformer):
|
|
688
668
|
return output_df
|
689
669
|
|
690
670
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
691
|
-
def fit_predict(
|
671
|
+
def fit_predict(
|
672
|
+
self,
|
673
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
674
|
+
output_cols_prefix: str = "fit_predict_",
|
675
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
692
676
|
""" Method not supported for this class.
|
693
677
|
|
694
678
|
|
@@ -713,22 +697,104 @@ class XGBClassifier(BaseTransformer):
|
|
713
697
|
)
|
714
698
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
715
699
|
drop_input_cols=self._drop_input_cols,
|
716
|
-
expected_output_cols_list=
|
700
|
+
expected_output_cols_list=(
|
701
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
702
|
+
),
|
717
703
|
)
|
718
704
|
self._sklearn_object = fitted_estimator
|
719
705
|
self._is_fitted = True
|
720
706
|
return output_result
|
721
707
|
|
708
|
+
|
709
|
+
@available_if(original_estimator_has_callable("fit_transform")) # type: ignore[misc]
|
710
|
+
def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_transform_",) -> Union[DataFrame, pd.DataFrame]:
|
711
|
+
""" Method not supported for this class.
|
712
|
+
|
722
713
|
|
723
|
-
|
724
|
-
|
725
|
-
|
714
|
+
Raises:
|
715
|
+
TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame.
|
716
|
+
|
717
|
+
Args:
|
718
|
+
dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
|
719
|
+
Snowpark or Pandas DataFrame.
|
720
|
+
output_cols_prefix: Prefix for the response columns
|
726
721
|
Returns:
|
727
722
|
Transformed dataset.
|
728
723
|
"""
|
729
|
-
self.
|
730
|
-
|
731
|
-
|
724
|
+
self._infer_input_output_cols(dataset)
|
725
|
+
super()._check_dataset_type(dataset)
|
726
|
+
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
727
|
+
estimator=self._sklearn_object,
|
728
|
+
dataset=dataset,
|
729
|
+
input_cols=self.input_cols,
|
730
|
+
label_cols=self.label_cols,
|
731
|
+
sample_weight_col=self.sample_weight_col,
|
732
|
+
autogenerated=self._autogenerated,
|
733
|
+
subproject=_SUBPROJECT,
|
734
|
+
)
|
735
|
+
output_result, fitted_estimator = model_trainer.train_fit_transform(
|
736
|
+
drop_input_cols=self._drop_input_cols,
|
737
|
+
expected_output_cols_list=self.output_cols,
|
738
|
+
)
|
739
|
+
self._sklearn_object = fitted_estimator
|
740
|
+
self._is_fitted = True
|
741
|
+
return output_result
|
742
|
+
|
743
|
+
|
744
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
745
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
746
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
747
|
+
"""
|
748
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
749
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
750
|
+
if output_cols:
|
751
|
+
output_cols = [
|
752
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
753
|
+
for c in output_cols
|
754
|
+
]
|
755
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
756
|
+
output_cols = [output_cols_prefix]
|
757
|
+
elif self._sklearn_object is not None:
|
758
|
+
classes = self._sklearn_object.classes_
|
759
|
+
if isinstance(classes, numpy.ndarray):
|
760
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
761
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
762
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
763
|
+
output_cols = []
|
764
|
+
for i, cl in enumerate(classes):
|
765
|
+
# For binary classification, there is only one output column for each class
|
766
|
+
# ndarray as the two classes are complementary.
|
767
|
+
if len(cl) == 2:
|
768
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
769
|
+
else:
|
770
|
+
output_cols.extend([
|
771
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
772
|
+
])
|
773
|
+
else:
|
774
|
+
output_cols = []
|
775
|
+
|
776
|
+
# Make sure column names are valid snowflake identifiers.
|
777
|
+
assert output_cols is not None # Make MyPy happy
|
778
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
779
|
+
|
780
|
+
return rv
|
781
|
+
|
782
|
+
def _align_expected_output_names(
|
783
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
784
|
+
) -> List[str]:
|
785
|
+
# in case the inferred output column names dimension is different
|
786
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
787
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
788
|
+
output_df_columns = list(output_df_pd.columns)
|
789
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
790
|
+
if self.sample_weight_col:
|
791
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
792
|
+
# if the dimension of inferred output column names is correct; use it
|
793
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
794
|
+
return expected_output_cols_list
|
795
|
+
# otherwise, use the sklearn estimator's output
|
796
|
+
else:
|
797
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
732
798
|
|
733
799
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
734
800
|
@telemetry.send_api_usage_telemetry(
|
@@ -762,24 +828,26 @@ class XGBClassifier(BaseTransformer):
|
|
762
828
|
# are specific to the type of dataset used.
|
763
829
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
764
830
|
|
831
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
832
|
+
|
765
833
|
if isinstance(dataset, DataFrame):
|
766
|
-
self.
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
834
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
835
|
+
self._deps = self._get_dependencies()
|
836
|
+
assert isinstance(
|
837
|
+
dataset._session, Session
|
838
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
771
839
|
transform_kwargs = dict(
|
772
840
|
session=dataset._session,
|
773
841
|
dependencies=self._deps,
|
774
|
-
drop_input_cols
|
842
|
+
drop_input_cols=self._drop_input_cols,
|
775
843
|
expected_output_cols_type="float",
|
776
844
|
)
|
845
|
+
expected_output_cols = self._align_expected_output_names(
|
846
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
847
|
+
)
|
777
848
|
|
778
849
|
elif isinstance(dataset, pd.DataFrame):
|
779
|
-
transform_kwargs = dict(
|
780
|
-
snowpark_input_cols = self._snowpark_cols,
|
781
|
-
drop_input_cols = self._drop_input_cols
|
782
|
-
)
|
850
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
783
851
|
|
784
852
|
transform_handlers = ModelTransformerBuilder.build(
|
785
853
|
dataset=dataset,
|
@@ -791,7 +859,7 @@ class XGBClassifier(BaseTransformer):
|
|
791
859
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
792
860
|
inference_method=inference_method,
|
793
861
|
input_cols=self.input_cols,
|
794
|
-
expected_output_cols=
|
862
|
+
expected_output_cols=expected_output_cols,
|
795
863
|
**transform_kwargs
|
796
864
|
)
|
797
865
|
return output_df
|
@@ -823,29 +891,30 @@ class XGBClassifier(BaseTransformer):
|
|
823
891
|
Output dataset with log probability of the sample for each class in the model.
|
824
892
|
"""
|
825
893
|
super()._check_dataset_type(dataset)
|
826
|
-
inference_method="predict_log_proba"
|
894
|
+
inference_method = "predict_log_proba"
|
895
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
827
896
|
|
828
897
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
829
898
|
# are specific to the type of dataset used.
|
830
899
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
831
900
|
|
832
901
|
if isinstance(dataset, DataFrame):
|
833
|
-
self.
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
|
902
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
903
|
+
self._deps = self._get_dependencies()
|
904
|
+
assert isinstance(
|
905
|
+
dataset._session, Session
|
906
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
838
907
|
transform_kwargs = dict(
|
839
908
|
session=dataset._session,
|
840
909
|
dependencies=self._deps,
|
841
|
-
drop_input_cols
|
910
|
+
drop_input_cols=self._drop_input_cols,
|
842
911
|
expected_output_cols_type="float",
|
843
912
|
)
|
913
|
+
expected_output_cols = self._align_expected_output_names(
|
914
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
915
|
+
)
|
844
916
|
elif isinstance(dataset, pd.DataFrame):
|
845
|
-
transform_kwargs = dict(
|
846
|
-
snowpark_input_cols = self._snowpark_cols,
|
847
|
-
drop_input_cols = self._drop_input_cols
|
848
|
-
)
|
917
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
849
918
|
|
850
919
|
transform_handlers = ModelTransformerBuilder.build(
|
851
920
|
dataset=dataset,
|
@@ -858,7 +927,7 @@ class XGBClassifier(BaseTransformer):
|
|
858
927
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
859
928
|
inference_method=inference_method,
|
860
929
|
input_cols=self.input_cols,
|
861
|
-
expected_output_cols=
|
930
|
+
expected_output_cols=expected_output_cols,
|
862
931
|
**transform_kwargs
|
863
932
|
)
|
864
933
|
return output_df
|
@@ -884,30 +953,32 @@ class XGBClassifier(BaseTransformer):
|
|
884
953
|
Output dataset with results of the decision function for the samples in input dataset.
|
885
954
|
"""
|
886
955
|
super()._check_dataset_type(dataset)
|
887
|
-
inference_method="decision_function"
|
956
|
+
inference_method = "decision_function"
|
888
957
|
|
889
958
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
890
959
|
# are specific to the type of dataset used.
|
891
960
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
892
961
|
|
962
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
963
|
+
|
893
964
|
if isinstance(dataset, DataFrame):
|
894
|
-
self.
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
965
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
966
|
+
self._deps = self._get_dependencies()
|
967
|
+
assert isinstance(
|
968
|
+
dataset._session, Session
|
969
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
899
970
|
transform_kwargs = dict(
|
900
971
|
session=dataset._session,
|
901
972
|
dependencies=self._deps,
|
902
|
-
drop_input_cols
|
973
|
+
drop_input_cols=self._drop_input_cols,
|
903
974
|
expected_output_cols_type="float",
|
904
975
|
)
|
976
|
+
expected_output_cols = self._align_expected_output_names(
|
977
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
978
|
+
)
|
905
979
|
|
906
980
|
elif isinstance(dataset, pd.DataFrame):
|
907
|
-
transform_kwargs = dict(
|
908
|
-
snowpark_input_cols = self._snowpark_cols,
|
909
|
-
drop_input_cols = self._drop_input_cols
|
910
|
-
)
|
981
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
911
982
|
|
912
983
|
transform_handlers = ModelTransformerBuilder.build(
|
913
984
|
dataset=dataset,
|
@@ -920,7 +991,7 @@ class XGBClassifier(BaseTransformer):
|
|
920
991
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
921
992
|
inference_method=inference_method,
|
922
993
|
input_cols=self.input_cols,
|
923
|
-
expected_output_cols=
|
994
|
+
expected_output_cols=expected_output_cols,
|
924
995
|
**transform_kwargs
|
925
996
|
)
|
926
997
|
return output_df
|
@@ -949,17 +1020,17 @@ class XGBClassifier(BaseTransformer):
|
|
949
1020
|
Output dataset with probability of the sample for each class in the model.
|
950
1021
|
"""
|
951
1022
|
super()._check_dataset_type(dataset)
|
952
|
-
inference_method="score_samples"
|
1023
|
+
inference_method = "score_samples"
|
953
1024
|
|
954
1025
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
955
1026
|
# are specific to the type of dataset used.
|
956
1027
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
957
1028
|
|
1029
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
1030
|
+
|
958
1031
|
if isinstance(dataset, DataFrame):
|
959
|
-
self.
|
960
|
-
|
961
|
-
inference_method=inference_method,
|
962
|
-
)
|
1032
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
1033
|
+
self._deps = self._get_dependencies()
|
963
1034
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
964
1035
|
transform_kwargs = dict(
|
965
1036
|
session=dataset._session,
|
@@ -967,6 +1038,9 @@ class XGBClassifier(BaseTransformer):
|
|
967
1038
|
drop_input_cols = self._drop_input_cols,
|
968
1039
|
expected_output_cols_type="float",
|
969
1040
|
)
|
1041
|
+
expected_output_cols = self._align_expected_output_names(
|
1042
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
1043
|
+
)
|
970
1044
|
|
971
1045
|
elif isinstance(dataset, pd.DataFrame):
|
972
1046
|
transform_kwargs = dict(
|
@@ -985,7 +1059,7 @@ class XGBClassifier(BaseTransformer):
|
|
985
1059
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
986
1060
|
inference_method=inference_method,
|
987
1061
|
input_cols=self.input_cols,
|
988
|
-
expected_output_cols=
|
1062
|
+
expected_output_cols=expected_output_cols,
|
989
1063
|
**transform_kwargs
|
990
1064
|
)
|
991
1065
|
return output_df
|
@@ -1020,17 +1094,15 @@ class XGBClassifier(BaseTransformer):
|
|
1020
1094
|
transform_kwargs: ScoreKwargsTypedDict = dict()
|
1021
1095
|
|
1022
1096
|
if isinstance(dataset, DataFrame):
|
1023
|
-
self.
|
1024
|
-
|
1025
|
-
inference_method="score",
|
1026
|
-
)
|
1097
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method="score")
|
1098
|
+
self._deps = self._get_dependencies()
|
1027
1099
|
selected_cols = self._get_active_columns()
|
1028
1100
|
if len(selected_cols) > 0:
|
1029
1101
|
dataset = dataset.select(selected_cols)
|
1030
1102
|
assert isinstance(dataset._session, Session) # keep mypy happy
|
1031
1103
|
transform_kwargs = dict(
|
1032
1104
|
session=dataset._session,
|
1033
|
-
dependencies=
|
1105
|
+
dependencies=self._deps,
|
1034
1106
|
score_sproc_imports=['xgboost'],
|
1035
1107
|
)
|
1036
1108
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -1095,11 +1167,8 @@ class XGBClassifier(BaseTransformer):
|
|
1095
1167
|
|
1096
1168
|
if isinstance(dataset, DataFrame):
|
1097
1169
|
|
1098
|
-
self.
|
1099
|
-
|
1100
|
-
inference_method=inference_method,
|
1101
|
-
|
1102
|
-
)
|
1170
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
1171
|
+
self._deps = self._get_dependencies()
|
1103
1172
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
1104
1173
|
transform_kwargs = dict(
|
1105
1174
|
session = dataset._session,
|
@@ -1132,50 +1201,84 @@ class XGBClassifier(BaseTransformer):
|
|
1132
1201
|
)
|
1133
1202
|
return output_df
|
1134
1203
|
|
1204
|
+
|
1205
|
+
|
1206
|
+
def to_xgboost(self) -> Any:
|
1207
|
+
"""Get xgboost.XGBClassifier object.
|
1208
|
+
"""
|
1209
|
+
if self._sklearn_object is None:
|
1210
|
+
self._sklearn_object = self._create_sklearn_object()
|
1211
|
+
return self._sklearn_object
|
1212
|
+
|
1213
|
+
def to_sklearn(self) -> Any:
|
1214
|
+
raise exceptions.SnowflakeMLException(
|
1215
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1216
|
+
original_exception=AttributeError(
|
1217
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1218
|
+
"to_sklearn()",
|
1219
|
+
"to_xgboost()"
|
1220
|
+
)
|
1221
|
+
),
|
1222
|
+
)
|
1223
|
+
|
1224
|
+
def to_lightgbm(self) -> Any:
|
1225
|
+
raise exceptions.SnowflakeMLException(
|
1226
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1227
|
+
original_exception=AttributeError(
|
1228
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1229
|
+
"to_lightgbm()",
|
1230
|
+
"to_xgboost()"
|
1231
|
+
)
|
1232
|
+
),
|
1233
|
+
)
|
1234
|
+
|
1235
|
+
def _get_dependencies(self) -> List[str]:
|
1236
|
+
return self._deps
|
1237
|
+
|
1135
1238
|
|
1136
|
-
def
|
1239
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
1137
1240
|
self._model_signature_dict = dict()
|
1138
1241
|
|
1139
1242
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1140
1243
|
|
1141
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1244
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1142
1245
|
outputs: List[BaseFeatureSpec] = []
|
1143
1246
|
if hasattr(self, "predict"):
|
1144
1247
|
# keep mypy happy
|
1145
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1248
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1146
1249
|
# For classifier, the type of predict is the same as the type of label
|
1147
|
-
if self._sklearn_object._estimator_type ==
|
1148
|
-
|
1250
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1251
|
+
# label columns is the desired type for output
|
1149
1252
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1150
1253
|
# rename the output columns
|
1151
1254
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1152
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1153
|
-
|
1154
|
-
|
1255
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1256
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1257
|
+
)
|
1155
1258
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1156
1259
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1157
|
-
# Clusterer returns int64 cluster labels.
|
1260
|
+
# Clusterer returns int64 cluster labels.
|
1158
1261
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1159
1262
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1160
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1263
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1264
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1265
|
+
)
|
1266
|
+
|
1164
1267
|
# For regressor, the type of predict is float64
|
1165
|
-
elif self._sklearn_object._estimator_type ==
|
1268
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1166
1269
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1167
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1168
|
-
|
1169
|
-
|
1170
|
-
|
1270
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1271
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1272
|
+
)
|
1273
|
+
|
1171
1274
|
for prob_func in PROB_FUNCTIONS:
|
1172
1275
|
if hasattr(self, prob_func):
|
1173
1276
|
output_cols_prefix: str = f"{prob_func}_"
|
1174
1277
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1175
1278
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1176
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1177
|
-
|
1178
|
-
|
1279
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1280
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1281
|
+
)
|
1179
1282
|
|
1180
1283
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1181
1284
|
items = list(self._model_signature_dict.items())
|
@@ -1188,10 +1291,10 @@ class XGBClassifier(BaseTransformer):
|
|
1188
1291
|
"""Returns model signature of current class.
|
1189
1292
|
|
1190
1293
|
Raises:
|
1191
|
-
|
1294
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1192
1295
|
|
1193
1296
|
Returns:
|
1194
|
-
Dict
|
1297
|
+
Dict with each method and its input output signature
|
1195
1298
|
"""
|
1196
1299
|
if self._model_signature_dict is None:
|
1197
1300
|
raise exceptions.SnowflakeMLException(
|
@@ -1199,35 +1302,3 @@ class XGBClassifier(BaseTransformer):
|
|
1199
1302
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1200
1303
|
)
|
1201
1304
|
return self._model_signature_dict
|
1202
|
-
|
1203
|
-
def to_xgboost(self) -> Any:
|
1204
|
-
"""Get xgboost.XGBClassifier object.
|
1205
|
-
"""
|
1206
|
-
if self._sklearn_object is None:
|
1207
|
-
self._sklearn_object = self._create_sklearn_object()
|
1208
|
-
return self._sklearn_object
|
1209
|
-
|
1210
|
-
def to_sklearn(self) -> Any:
|
1211
|
-
raise exceptions.SnowflakeMLException(
|
1212
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1213
|
-
original_exception=AttributeError(
|
1214
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1215
|
-
"to_sklearn()",
|
1216
|
-
"to_xgboost()"
|
1217
|
-
)
|
1218
|
-
),
|
1219
|
-
)
|
1220
|
-
|
1221
|
-
def to_lightgbm(self) -> Any:
|
1222
|
-
raise exceptions.SnowflakeMLException(
|
1223
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1224
|
-
original_exception=AttributeError(
|
1225
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1226
|
-
"to_lightgbm()",
|
1227
|
-
"to_xgboost()"
|
1228
|
-
)
|
1229
|
-
),
|
1230
|
-
)
|
1231
|
-
|
1232
|
-
def _get_dependencies(self) -> List[str]:
|
1233
|
-
return self._deps
|