snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
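Among the files listed above, the most substantive additions are new model packager handlers for CatBoost and LightGBM (`snowflake/ml/model/_packager/model_handlers/catboost.py`, `.../lightgbm.py`) and a relocated model runtime under `snowflake/ml/model/_packager/model_runtime/`. As a rough, hypothetical illustration of where such handlers come into play (the Snowpark session and the model/version names below are assumptions, not taken from this diff), a native LightGBM model could be logged through the registry API:

```python
# Hypothetical sketch only: `session` is assumed to be an existing
# snowflake.snowpark.Session; model and version names are illustrative.
import lightgbm
from sklearn.datasets import make_classification
from snowflake.ml.registry import Registry

# Train a plain, in-memory LightGBM classifier on synthetic data.
X, y = make_classification(n_samples=200, n_features=4, random_state=0)
clf = lightgbm.LGBMClassifier().fit(X, y)

# The new lightgbm handler in the packager presumably lets this native
# model object be packaged and logged as-is.
reg = Registry(session=session)
mv = reg.log_model(clf, model_name="LGBM_DEMO", version_name="v1")
```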
snowflake/ml/modeling/xgboost/xgb_regressor.py

@@ -32,6 +32,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
     BatchInferenceKwargsTypedDict,
     ScoreKwargsTypedDict
 )
+from snowflake.ml.model._signatures import utils as model_signature_utils
+from snowflake.ml.model.model_signature import (
+    BaseFeatureSpec,
+    DataType,
+    FeatureSpec,
+    ModelSignature,
+    _infer_signature,
+    _rename_signature_with_snowflake_identifiers,
+)

 from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder

@@ -42,16 +51,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     validate_sklearn_args,
 )

-from snowflake.ml.model.model_signature import (
-    DataType,
-    FeatureSpec,
-    ModelSignature,
-    _infer_signature,
-    _rename_signature_with_snowflake_identifiers,
-    BaseFeatureSpec,
-)
-from snowflake.ml.model._signatures import utils as model_signature_utils
-
 _PROJECT = "ModelDevelopment"
 # Derive subproject from module name by removing "sklearn"
 # and converting module name from underscore to CamelCase
@@ -421,12 +420,7 @@ class XGBRegressor(BaseTransformer):
         )
         return selected_cols

-    @telemetry.send_api_usage_telemetry(
-        project=_PROJECT,
-        subproject=_SUBPROJECT,
-        custom_tags=dict([("autogen", True)]),
-    )
-    def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "XGBRegressor":
+    def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "XGBRegressor":
         """Fit gradient boosting model
         For more details on this function, see [xgboost.XGBRegressor.fit]
         (https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor.fit)
@@ -453,12 +447,14 @@ class XGBRegressor(BaseTransformer):

         self._snowpark_cols = dataset.select(self.input_cols).columns

-
+        # If we are already in a stored procedure, no need to kick off another one.
         if SNOWML_SPROC_ENV in os.environ:
             statement_params = telemetry.get_function_usage_statement_params(
                 project=_PROJECT,
                 subproject=_SUBPROJECT,
-                function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), XGBRegressor.__class__.__name__),
+                function_name=telemetry.get_statement_params_full_func_name(
+                    inspect.currentframe(), XGBRegressor.__class__.__name__
+                ),
                 api_calls=[Session.call],
                 custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
             )
@@ -479,7 +475,7 @@ class XGBRegressor(BaseTransformer):
         )
         self._sklearn_object = model_trainer.train()
         self._is_fitted = True
-        self._get_model_signatures(dataset)
+        self._generate_model_signatures(dataset)
         return self

     def _batch_inference_validate_snowpark(
@@ -555,7 +551,9 @@ class XGBRegressor(BaseTransformer):
             # when it is classifier, infer the datatype from label columns
             if expected_type_inferred == "" and 'predict' in self.model_signatures:
                 # Batch inference takes a single expected output column type. Use the first columns type for now.
-                label_cols_signatures = [row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols]
+                label_cols_signatures = [
+                    row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
+                ]
                 if len(label_cols_signatures) == 0:
                     error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
                     raise exceptions.SnowflakeMLException(
@@ -563,25 +561,22 @@ class XGBRegressor(BaseTransformer):
                         original_exception=ValueError(error_str),
                     )

-                expected_type_inferred = convert_sp_to_sf_type(
-                    label_cols_signatures[0].as_snowpark_type()
-                )
+                expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())

             self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()

             transform_kwargs = dict(
-                session = dataset._session,
-                dependencies = self._deps,
-                drop_input_cols = self._drop_input_cols,
-                expected_output_cols_type = expected_type_inferred,
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_type_inferred,
             )

         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)

         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -621,7 +616,7 @@ class XGBRegressor(BaseTransformer):
             Transformed dataset.
         """
         super()._check_dataset_type(dataset)
-        inference_method="transform"
+        inference_method = "transform"

         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
@@ -658,17 +653,14 @@ class XGBRegressor(BaseTransformer):
             assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()

             transform_kwargs = dict(
-                session = dataset._session,
-                dependencies = self._deps,
-                drop_input_cols = self._drop_input_cols,
-                expected_output_cols_type = expected_dtype,
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_dtype,
             )

         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)

         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -687,7 +679,11 @@ class XGBRegressor(BaseTransformer):
         return output_df

     @available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
-    def fit_predict(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_predict_",) -> Union[DataFrame, pd.DataFrame]:
+    def fit_predict(
+        self,
+        dataset: Union[DataFrame, pd.DataFrame],
+        output_cols_prefix: str = "fit_predict_",
+    ) -> Union[DataFrame, pd.DataFrame]:
         """ Method not supported for this class.


@@ -712,7 +708,9 @@ class XGBRegressor(BaseTransformer):
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
             drop_input_cols=self._drop_input_cols,
-            expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
+            expected_output_cols_list=(
+                self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
+            ),
         )
         self._sklearn_object = fitted_estimator
         self._is_fitted = True
@@ -729,6 +727,62 @@ class XGBRegressor(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_

+
+    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
+        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
+        """
+        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
+        # The following condition is introduced for kneighbors methods, and not used in other methods
+        if output_cols:
+            output_cols = [
+                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
+                for c in output_cols
+            ]
+        elif getattr(self._sklearn_object, "classes_", None) is None:
+            output_cols = [output_cols_prefix]
+        elif self._sklearn_object is not None:
+            classes = self._sklearn_object.classes_
+            if isinstance(classes, numpy.ndarray):
+                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
+            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
+                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
+                output_cols = []
+                for i, cl in enumerate(classes):
+                    # For binary classification, there is only one output column for each class
+                    # ndarray as the two classes are complementary.
+                    if len(cl) == 2:
+                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
+                    else:
+                        output_cols.extend([
+                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
+                        ])
+            else:
+                output_cols = []
+
+        # Make sure column names are valid snowflake identifiers.
+        assert output_cols is not None # Make MyPy happy
+        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
+
+        return rv
+
+    def _align_expected_output_names(
+        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
+    ) -> List[str]:
+        # in case the inferred output column names dimension is different
+        # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
+        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        output_df_columns = list(output_df_pd.columns)
+        output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
+        if self.sample_weight_col:
+            output_df_columns_set -= set(self.sample_weight_col)
+        # if the dimension of inferred output column names is correct; use it
+        if len(expected_output_cols_list) == len(output_df_columns_set):
+            return expected_output_cols_list
+        # otherwise, use the sklearn estimator's output
+        else:
+            return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
+
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -759,24 +813,28 @@ class XGBRegressor(BaseTransformer):
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()

+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )

         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)

         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -788,7 +846,7 @@ class XGBRegressor(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -818,7 +876,8 @@ class XGBRegressor(BaseTransformer):
             Output dataset with log probability of the sample for each class in the model.
         """
         super()._check_dataset_type(dataset)
-        inference_method="predict_log_proba"
+        inference_method = "predict_log_proba"
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)

         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
@@ -829,18 +888,20 @@ class XGBRegressor(BaseTransformer):
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)

         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -853,7 +914,7 @@ class XGBRegressor(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -879,30 +940,34 @@ class XGBRegressor(BaseTransformer):
             Output dataset with results of the decision function for the samples in input dataset.
         """
         super()._check_dataset_type(dataset)
-        inference_method="decision_function"
+        inference_method = "decision_function"

         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()

+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )

         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)

         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -915,7 +980,7 @@ class XGBRegressor(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -944,12 +1009,14 @@ class XGBRegressor(BaseTransformer):
             Output dataset with probability of the sample for each class in the model.
         """
         super()._check_dataset_type(dataset)
-        inference_method="score_samples"
+        inference_method = "score_samples"

         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()

+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
@@ -962,6 +1029,9 @@ class XGBRegressor(BaseTransformer):
                 drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )

         elif isinstance(dataset, pd.DataFrame):
             transform_kwargs = dict(
@@ -980,7 +1050,7 @@ class XGBRegressor(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -1127,50 +1197,84 @@ class XGBRegressor(BaseTransformer):
         )
         return output_df

+
+
+    def to_xgboost(self) -> Any:
+        """Get xgboost.XGBRegressor object.
+        """
+        if self._sklearn_object is None:
+            self._sklearn_object = self._create_sklearn_object()
+        return self._sklearn_object
+
+    def to_sklearn(self) -> Any:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.METHOD_NOT_ALLOWED,
+            original_exception=AttributeError(
+                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
+                    "to_sklearn()",
+                    "to_xgboost()"
+                )
+            ),
+        )
+
+    def to_lightgbm(self) -> Any:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.METHOD_NOT_ALLOWED,
+            original_exception=AttributeError(
+                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
+                    "to_lightgbm()",
+                    "to_xgboost()"
+                )
+            ),
+        )

-    def _get_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
+    def _get_dependencies(self) -> List[str]:
+        return self._deps
+
+
+    def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         self._model_signature_dict = dict()

         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

-        inputs = list(_infer_signature(dataset[self.input_cols], "input"))
+        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
-            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
+            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
             # For classifier, the type of predict is the same as the type of label
-            if self._sklearn_object._estimator_type == 'classifier':
-
+            if self._sklearn_object._estimator_type == "classifier":
+                # label columns is the desired type for output
                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
             # For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
             # For outlier models, returns -1 for outliers and 1 for inliers.
-            # Clusterer returns int64 cluster labels.
+            # Clusterer returns int64 cluster labels.
             elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
                 outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
             # For regressor, the type of predict is float64
-            elif self._sklearn_object._estimator_type == 'regressor':
+            elif self._sklearn_object._estimator_type == "regressor":
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(
-
-
+                self._model_signature_dict[prob_func] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )

         # Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
         items = list(self._model_signature_dict.items())
@@ -1183,10 +1287,10 @@ class XGBRegressor(BaseTransformer):
         """Returns model signature of current class.

         Raises:
-
+            SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred

         Returns:
-            Dict
+            Dict with each method and its input output signature
         """
         if self._model_signature_dict is None:
             raise exceptions.SnowflakeMLException(
@@ -1194,35 +1298,3 @@ class XGBRegressor(BaseTransformer):
                 original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
             )
         return self._model_signature_dict
-
-    def to_xgboost(self) -> Any:
-        """Get xgboost.XGBRegressor object.
-        """
-        if self._sklearn_object is None:
-            self._sklearn_object = self._create_sklearn_object()
-        return self._sklearn_object
-
-    def to_sklearn(self) -> Any:
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.METHOD_NOT_ALLOWED,
-            original_exception=AttributeError(
-                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
-                    "to_sklearn()",
-                    "to_xgboost()"
-                )
-            ),
-        )
-
-    def to_lightgbm(self) -> Any:
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.METHOD_NOT_ALLOWED,
-            original_exception=AttributeError(
-                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
-                    "to_lightgbm()",
-                    "to_xgboost()"
-                )
-            ),
-        )
-
-    def _get_dependencies(self) -> List[str]:
-        return self._deps