snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +77 -32
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/dataset/__init__.py +10 -0
- snowflake/ml/dataset/dataset.py +454 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +202 -0
- snowflake/ml/feature_store/feature_store.py +531 -332
- snowflake/ml/feature_store/feature_view.py +40 -23
- snowflake/ml/fileset/embedded_stage_fs.py +146 -0
- snowflake/ml/fileset/sfcfs.py +56 -54
- snowflake/ml/fileset/snowfs.py +159 -0
- snowflake/ml/fileset/stage_fs.py +49 -17
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +27 -0
- snowflake/ml/model/_client/model/model_version_impl.py +137 -50
- snowflake/ml/model/_client/ops/model_ops.py +159 -40
- snowflake/ml/model/_client/sql/model.py +25 -2
- snowflake/ml/model/_client/sql/model_version.py +131 -2
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -5
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
- snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
- snowflake/ml/modeling/cluster/birch.py +248 -175
- snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
- snowflake/ml/modeling/cluster/dbscan.py +246 -175
- snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
- snowflake/ml/modeling/cluster/k_means.py +248 -175
- snowflake/ml/modeling/cluster/mean_shift.py +246 -175
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
- snowflake/ml/modeling/cluster/optics.py +246 -175
- snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
- snowflake/ml/modeling/compose/column_transformer.py +248 -175
- snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
- snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
- snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
- snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
- snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
- snowflake/ml/modeling/covariance/oas.py +246 -175
- snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
- snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
- snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
- snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
- snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/pca.py +248 -175
- snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
- snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
- snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
- snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
- snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
- snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +72 -37
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
- snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
- snowflake/ml/modeling/impute/knn_imputer.py +248 -175
- snowflake/ml/modeling/impute/missing_indicator.py +248 -175
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/lars.py +246 -175
- snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
- snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/perceptron.py +246 -175
- snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ridge.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
- snowflake/ml/modeling/manifold/isomap.py +248 -175
- snowflake/ml/modeling/manifold/mds.py +248 -175
- snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
- snowflake/ml/modeling/manifold/tsne.py +248 -175
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
- snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
- snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
- snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
- snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
- snowflake/ml/modeling/pipeline/pipeline.py +517 -35
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
- snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
- snowflake/ml/modeling/svm/linear_svc.py +246 -175
- snowflake/ml/modeling/svm/linear_svr.py +246 -175
- snowflake/ml/modeling/svm/nu_svc.py +246 -175
- snowflake/ml/modeling/svm/nu_svr.py +246 -175
- snowflake/ml/modeling/svm/svc.py +246 -175
- snowflake/ml/modeling/svm/svr.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
- snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -61,12 +60,6 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.ensemble".replace("sklea
|
|
61
60
|
|
62
61
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
63
62
|
|
64
|
-
def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
|
65
|
-
def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
|
66
|
-
return False and callable(getattr(self._sklearn_object, "fit_transform", None))
|
67
|
-
return check
|
68
|
-
|
69
|
-
|
70
63
|
class HistGradientBoostingRegressor(BaseTransformer):
|
71
64
|
r"""Histogram-based Gradient Boosting Regression Tree
|
72
65
|
For more details on this class, see [sklearn.ensemble.HistGradientBoostingRegressor]
|
@@ -363,12 +356,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
363
356
|
)
|
364
357
|
return selected_cols
|
365
358
|
|
366
|
-
|
367
|
-
project=_PROJECT,
|
368
|
-
subproject=_SUBPROJECT,
|
369
|
-
custom_tags=dict([("autogen", True)]),
|
370
|
-
)
|
371
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "HistGradientBoostingRegressor":
|
359
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "HistGradientBoostingRegressor":
|
372
360
|
"""Fit the gradient boosting model
|
373
361
|
For more details on this function, see [sklearn.ensemble.HistGradientBoostingRegressor.fit]
|
374
362
|
(https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html#sklearn.ensemble.HistGradientBoostingRegressor.fit)
|
@@ -395,12 +383,14 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
395
383
|
|
396
384
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
397
385
|
|
398
|
-
|
386
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
399
387
|
if SNOWML_SPROC_ENV in os.environ:
|
400
388
|
statement_params = telemetry.get_function_usage_statement_params(
|
401
389
|
project=_PROJECT,
|
402
390
|
subproject=_SUBPROJECT,
|
403
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
391
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
392
|
+
inspect.currentframe(), HistGradientBoostingRegressor.__class__.__name__
|
393
|
+
),
|
404
394
|
api_calls=[Session.call],
|
405
395
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
406
396
|
)
|
@@ -421,27 +411,24 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
421
411
|
)
|
422
412
|
self._sklearn_object = model_trainer.train()
|
423
413
|
self._is_fitted = True
|
424
|
-
self.
|
414
|
+
self._generate_model_signatures(dataset)
|
425
415
|
return self
|
426
416
|
|
427
417
|
def _batch_inference_validate_snowpark(
|
428
418
|
self,
|
429
419
|
dataset: DataFrame,
|
430
420
|
inference_method: str,
|
431
|
-
) ->
|
432
|
-
"""Util method to run validate that batch inference can be run on a snowpark dataframe
|
433
|
-
return the available package that exists in the snowflake anaconda channel
|
421
|
+
) -> None:
|
422
|
+
"""Util method to run validate that batch inference can be run on a snowpark dataframe.
|
434
423
|
|
435
424
|
Args:
|
436
425
|
dataset: snowpark dataframe
|
437
426
|
inference_method: the inference method such as predict, score...
|
438
|
-
|
427
|
+
|
439
428
|
Raises:
|
440
429
|
SnowflakeMLException: If the estimator is not fitted, raise error
|
441
430
|
SnowflakeMLException: If the session is None, raise error
|
442
431
|
|
443
|
-
Returns:
|
444
|
-
A list of available package that exists in the snowflake anaconda channel
|
445
432
|
"""
|
446
433
|
if not self._is_fitted:
|
447
434
|
raise exceptions.SnowflakeMLException(
|
@@ -459,9 +446,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
459
446
|
"Session must not specified for snowpark dataset."
|
460
447
|
),
|
461
448
|
)
|
462
|
-
|
463
|
-
return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
|
464
|
-
pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
|
449
|
+
|
465
450
|
|
466
451
|
@available_if(original_estimator_has_callable("predict")) # type: ignore[misc]
|
467
452
|
@telemetry.send_api_usage_telemetry(
|
@@ -497,7 +482,9 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
497
482
|
# when it is classifier, infer the datatype from label columns
|
498
483
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
499
484
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
500
|
-
label_cols_signatures = [
|
485
|
+
label_cols_signatures = [
|
486
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
487
|
+
]
|
501
488
|
if len(label_cols_signatures) == 0:
|
502
489
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
503
490
|
raise exceptions.SnowflakeMLException(
|
@@ -505,25 +492,23 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
505
492
|
original_exception=ValueError(error_str),
|
506
493
|
)
|
507
494
|
|
508
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
509
|
-
label_cols_signatures[0].as_snowpark_type()
|
510
|
-
)
|
495
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
511
496
|
|
512
|
-
self.
|
513
|
-
|
497
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
498
|
+
self._deps = self._get_dependencies()
|
499
|
+
assert isinstance(
|
500
|
+
dataset._session, Session
|
501
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
514
502
|
|
515
503
|
transform_kwargs = dict(
|
516
|
-
session
|
517
|
-
dependencies
|
518
|
-
drop_input_cols
|
519
|
-
expected_output_cols_type
|
504
|
+
session=dataset._session,
|
505
|
+
dependencies=self._deps,
|
506
|
+
drop_input_cols=self._drop_input_cols,
|
507
|
+
expected_output_cols_type=expected_type_inferred,
|
520
508
|
)
|
521
509
|
|
522
510
|
elif isinstance(dataset, pd.DataFrame):
|
523
|
-
transform_kwargs = dict(
|
524
|
-
snowpark_input_cols = self._snowpark_cols,
|
525
|
-
drop_input_cols = self._drop_input_cols
|
526
|
-
)
|
511
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
527
512
|
|
528
513
|
transform_handlers = ModelTransformerBuilder.build(
|
529
514
|
dataset=dataset,
|
@@ -563,7 +548,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
563
548
|
Transformed dataset.
|
564
549
|
"""
|
565
550
|
super()._check_dataset_type(dataset)
|
566
|
-
inference_method="transform"
|
551
|
+
inference_method = "transform"
|
567
552
|
|
568
553
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
569
554
|
# are specific to the type of dataset used.
|
@@ -593,24 +578,19 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
593
578
|
if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
|
594
579
|
expected_dtype = convert_sp_to_sf_type(output_types[0])
|
595
580
|
|
596
|
-
self.
|
597
|
-
|
598
|
-
inference_method=inference_method,
|
599
|
-
)
|
581
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
582
|
+
self._deps = self._get_dependencies()
|
600
583
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
601
584
|
|
602
585
|
transform_kwargs = dict(
|
603
|
-
session
|
604
|
-
dependencies
|
605
|
-
drop_input_cols
|
606
|
-
expected_output_cols_type
|
586
|
+
session=dataset._session,
|
587
|
+
dependencies=self._deps,
|
588
|
+
drop_input_cols=self._drop_input_cols,
|
589
|
+
expected_output_cols_type=expected_dtype,
|
607
590
|
)
|
608
591
|
|
609
592
|
elif isinstance(dataset, pd.DataFrame):
|
610
|
-
transform_kwargs = dict(
|
611
|
-
snowpark_input_cols = self._snowpark_cols,
|
612
|
-
drop_input_cols = self._drop_input_cols
|
613
|
-
)
|
593
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
614
594
|
|
615
595
|
transform_handlers = ModelTransformerBuilder.build(
|
616
596
|
dataset=dataset,
|
@@ -629,7 +609,11 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
629
609
|
return output_df
|
630
610
|
|
631
611
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
632
|
-
def fit_predict(
|
612
|
+
def fit_predict(
|
613
|
+
self,
|
614
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
615
|
+
output_cols_prefix: str = "fit_predict_",
|
616
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
633
617
|
""" Method not supported for this class.
|
634
618
|
|
635
619
|
|
@@ -654,22 +638,104 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
654
638
|
)
|
655
639
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
656
640
|
drop_input_cols=self._drop_input_cols,
|
657
|
-
expected_output_cols_list=
|
641
|
+
expected_output_cols_list=(
|
642
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
643
|
+
),
|
658
644
|
)
|
659
645
|
self._sklearn_object = fitted_estimator
|
660
646
|
self._is_fitted = True
|
661
647
|
return output_result
|
662
648
|
|
649
|
+
|
650
|
+
@available_if(original_estimator_has_callable("fit_transform")) # type: ignore[misc]
|
651
|
+
def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_transform_",) -> Union[DataFrame, pd.DataFrame]:
|
652
|
+
""" Method not supported for this class.
|
653
|
+
|
663
654
|
|
664
|
-
|
665
|
-
|
666
|
-
|
655
|
+
Raises:
|
656
|
+
TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame.
|
657
|
+
|
658
|
+
Args:
|
659
|
+
dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
|
660
|
+
Snowpark or Pandas DataFrame.
|
661
|
+
output_cols_prefix: Prefix for the response columns
|
667
662
|
Returns:
|
668
663
|
Transformed dataset.
|
669
664
|
"""
|
670
|
-
self.
|
671
|
-
|
672
|
-
|
665
|
+
self._infer_input_output_cols(dataset)
|
666
|
+
super()._check_dataset_type(dataset)
|
667
|
+
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
668
|
+
estimator=self._sklearn_object,
|
669
|
+
dataset=dataset,
|
670
|
+
input_cols=self.input_cols,
|
671
|
+
label_cols=self.label_cols,
|
672
|
+
sample_weight_col=self.sample_weight_col,
|
673
|
+
autogenerated=self._autogenerated,
|
674
|
+
subproject=_SUBPROJECT,
|
675
|
+
)
|
676
|
+
output_result, fitted_estimator = model_trainer.train_fit_transform(
|
677
|
+
drop_input_cols=self._drop_input_cols,
|
678
|
+
expected_output_cols_list=self.output_cols,
|
679
|
+
)
|
680
|
+
self._sklearn_object = fitted_estimator
|
681
|
+
self._is_fitted = True
|
682
|
+
return output_result
|
683
|
+
|
684
|
+
|
685
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
686
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
687
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
688
|
+
"""
|
689
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
690
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
691
|
+
if output_cols:
|
692
|
+
output_cols = [
|
693
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
694
|
+
for c in output_cols
|
695
|
+
]
|
696
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
697
|
+
output_cols = [output_cols_prefix]
|
698
|
+
elif self._sklearn_object is not None:
|
699
|
+
classes = self._sklearn_object.classes_
|
700
|
+
if isinstance(classes, numpy.ndarray):
|
701
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
702
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
703
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
704
|
+
output_cols = []
|
705
|
+
for i, cl in enumerate(classes):
|
706
|
+
# For binary classification, there is only one output column for each class
|
707
|
+
# ndarray as the two classes are complementary.
|
708
|
+
if len(cl) == 2:
|
709
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
710
|
+
else:
|
711
|
+
output_cols.extend([
|
712
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
713
|
+
])
|
714
|
+
else:
|
715
|
+
output_cols = []
|
716
|
+
|
717
|
+
# Make sure column names are valid snowflake identifiers.
|
718
|
+
assert output_cols is not None # Make MyPy happy
|
719
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
720
|
+
|
721
|
+
return rv
|
722
|
+
|
723
|
+
def _align_expected_output_names(
|
724
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
725
|
+
) -> List[str]:
|
726
|
+
# in case the inferred output column names dimension is different
|
727
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
728
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
729
|
+
output_df_columns = list(output_df_pd.columns)
|
730
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
731
|
+
if self.sample_weight_col:
|
732
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
733
|
+
# if the dimension of inferred output column names is correct; use it
|
734
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
735
|
+
return expected_output_cols_list
|
736
|
+
# otherwise, use the sklearn estimator's output
|
737
|
+
else:
|
738
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
673
739
|
|
674
740
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
675
741
|
@telemetry.send_api_usage_telemetry(
|
@@ -701,24 +767,26 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
701
767
|
# are specific to the type of dataset used.
|
702
768
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
703
769
|
|
770
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
771
|
+
|
704
772
|
if isinstance(dataset, DataFrame):
|
705
|
-
self.
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
773
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
774
|
+
self._deps = self._get_dependencies()
|
775
|
+
assert isinstance(
|
776
|
+
dataset._session, Session
|
777
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
710
778
|
transform_kwargs = dict(
|
711
779
|
session=dataset._session,
|
712
780
|
dependencies=self._deps,
|
713
|
-
drop_input_cols
|
781
|
+
drop_input_cols=self._drop_input_cols,
|
714
782
|
expected_output_cols_type="float",
|
715
783
|
)
|
784
|
+
expected_output_cols = self._align_expected_output_names(
|
785
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
786
|
+
)
|
716
787
|
|
717
788
|
elif isinstance(dataset, pd.DataFrame):
|
718
|
-
transform_kwargs = dict(
|
719
|
-
snowpark_input_cols = self._snowpark_cols,
|
720
|
-
drop_input_cols = self._drop_input_cols
|
721
|
-
)
|
789
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
722
790
|
|
723
791
|
transform_handlers = ModelTransformerBuilder.build(
|
724
792
|
dataset=dataset,
|
@@ -730,7 +798,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
730
798
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
731
799
|
inference_method=inference_method,
|
732
800
|
input_cols=self.input_cols,
|
733
|
-
expected_output_cols=
|
801
|
+
expected_output_cols=expected_output_cols,
|
734
802
|
**transform_kwargs
|
735
803
|
)
|
736
804
|
return output_df
|
@@ -760,29 +828,30 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
760
828
|
Output dataset with log probability of the sample for each class in the model.
|
761
829
|
"""
|
762
830
|
super()._check_dataset_type(dataset)
|
763
|
-
inference_method="predict_log_proba"
|
831
|
+
inference_method = "predict_log_proba"
|
832
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
764
833
|
|
765
834
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
766
835
|
# are specific to the type of dataset used.
|
767
836
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
768
837
|
|
769
838
|
if isinstance(dataset, DataFrame):
|
770
|
-
self.
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
839
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
840
|
+
self._deps = self._get_dependencies()
|
841
|
+
assert isinstance(
|
842
|
+
dataset._session, Session
|
843
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
775
844
|
transform_kwargs = dict(
|
776
845
|
session=dataset._session,
|
777
846
|
dependencies=self._deps,
|
778
|
-
drop_input_cols
|
847
|
+
drop_input_cols=self._drop_input_cols,
|
779
848
|
expected_output_cols_type="float",
|
780
849
|
)
|
850
|
+
expected_output_cols = self._align_expected_output_names(
|
851
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
852
|
+
)
|
781
853
|
elif isinstance(dataset, pd.DataFrame):
|
782
|
-
transform_kwargs = dict(
|
783
|
-
snowpark_input_cols = self._snowpark_cols,
|
784
|
-
drop_input_cols = self._drop_input_cols
|
785
|
-
)
|
854
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
786
855
|
|
787
856
|
transform_handlers = ModelTransformerBuilder.build(
|
788
857
|
dataset=dataset,
|
@@ -795,7 +864,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
795
864
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
796
865
|
inference_method=inference_method,
|
797
866
|
input_cols=self.input_cols,
|
798
|
-
expected_output_cols=
|
867
|
+
expected_output_cols=expected_output_cols,
|
799
868
|
**transform_kwargs
|
800
869
|
)
|
801
870
|
return output_df
|
@@ -821,30 +890,32 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
821
890
|
Output dataset with results of the decision function for the samples in input dataset.
|
822
891
|
"""
|
823
892
|
super()._check_dataset_type(dataset)
|
824
|
-
inference_method="decision_function"
|
893
|
+
inference_method = "decision_function"
|
825
894
|
|
826
895
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
827
896
|
# are specific to the type of dataset used.
|
828
897
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
829
898
|
|
899
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
900
|
+
|
830
901
|
if isinstance(dataset, DataFrame):
|
831
|
-
self.
|
832
|
-
|
833
|
-
|
834
|
-
|
835
|
-
|
902
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
903
|
+
self._deps = self._get_dependencies()
|
904
|
+
assert isinstance(
|
905
|
+
dataset._session, Session
|
906
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
836
907
|
transform_kwargs = dict(
|
837
908
|
session=dataset._session,
|
838
909
|
dependencies=self._deps,
|
839
|
-
drop_input_cols
|
910
|
+
drop_input_cols=self._drop_input_cols,
|
840
911
|
expected_output_cols_type="float",
|
841
912
|
)
|
913
|
+
expected_output_cols = self._align_expected_output_names(
|
914
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
915
|
+
)
|
842
916
|
|
843
917
|
elif isinstance(dataset, pd.DataFrame):
|
844
|
-
transform_kwargs = dict(
|
845
|
-
snowpark_input_cols = self._snowpark_cols,
|
846
|
-
drop_input_cols = self._drop_input_cols
|
847
|
-
)
|
918
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
848
919
|
|
849
920
|
transform_handlers = ModelTransformerBuilder.build(
|
850
921
|
dataset=dataset,
|
@@ -857,7 +928,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
857
928
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
858
929
|
inference_method=inference_method,
|
859
930
|
input_cols=self.input_cols,
|
860
|
-
expected_output_cols=
|
931
|
+
expected_output_cols=expected_output_cols,
|
861
932
|
**transform_kwargs
|
862
933
|
)
|
863
934
|
return output_df
|
@@ -886,17 +957,17 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
886
957
|
Output dataset with probability of the sample for each class in the model.
|
887
958
|
"""
|
888
959
|
super()._check_dataset_type(dataset)
|
889
|
-
inference_method="score_samples"
|
960
|
+
inference_method = "score_samples"
|
890
961
|
|
891
962
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
892
963
|
# are specific to the type of dataset used.
|
893
964
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
894
965
|
|
966
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
967
|
+
|
895
968
|
if isinstance(dataset, DataFrame):
|
896
|
-
self.
|
897
|
-
|
898
|
-
inference_method=inference_method,
|
899
|
-
)
|
969
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
970
|
+
self._deps = self._get_dependencies()
|
900
971
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
901
972
|
transform_kwargs = dict(
|
902
973
|
session=dataset._session,
|
@@ -904,6 +975,9 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
904
975
|
drop_input_cols = self._drop_input_cols,
|
905
976
|
expected_output_cols_type="float",
|
906
977
|
)
|
978
|
+
expected_output_cols = self._align_expected_output_names(
|
979
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
980
|
+
)
|
907
981
|
|
908
982
|
elif isinstance(dataset, pd.DataFrame):
|
909
983
|
transform_kwargs = dict(
|
@@ -922,7 +996,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
922
996
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
923
997
|
inference_method=inference_method,
|
924
998
|
input_cols=self.input_cols,
|
925
|
-
expected_output_cols=
|
999
|
+
expected_output_cols=expected_output_cols,
|
926
1000
|
**transform_kwargs
|
927
1001
|
)
|
928
1002
|
return output_df
|
@@ -957,17 +1031,15 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
957
1031
|
transform_kwargs: ScoreKwargsTypedDict = dict()
|
958
1032
|
|
959
1033
|
if isinstance(dataset, DataFrame):
|
960
|
-
self.
|
961
|
-
|
962
|
-
inference_method="score",
|
963
|
-
)
|
1034
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method="score")
|
1035
|
+
self._deps = self._get_dependencies()
|
964
1036
|
selected_cols = self._get_active_columns()
|
965
1037
|
if len(selected_cols) > 0:
|
966
1038
|
dataset = dataset.select(selected_cols)
|
967
1039
|
assert isinstance(dataset._session, Session) # keep mypy happy
|
968
1040
|
transform_kwargs = dict(
|
969
1041
|
session=dataset._session,
|
970
|
-
dependencies=
|
1042
|
+
dependencies=self._deps,
|
971
1043
|
score_sproc_imports=['sklearn'],
|
972
1044
|
)
|
973
1045
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -1032,11 +1104,8 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
1032
1104
|
|
1033
1105
|
if isinstance(dataset, DataFrame):
|
1034
1106
|
|
1035
|
-
self.
|
1036
|
-
|
1037
|
-
inference_method=inference_method,
|
1038
|
-
|
1039
|
-
)
|
1107
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
1108
|
+
self._deps = self._get_dependencies()
|
1040
1109
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
1041
1110
|
transform_kwargs = dict(
|
1042
1111
|
session = dataset._session,
|
@@ -1069,50 +1138,84 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
1069
1138
|
)
|
1070
1139
|
return output_df
|
1071
1140
|
|
1141
|
+
|
1142
|
+
|
1143
|
+
def to_sklearn(self) -> Any:
|
1144
|
+
"""Get sklearn.ensemble.HistGradientBoostingRegressor object.
|
1145
|
+
"""
|
1146
|
+
if self._sklearn_object is None:
|
1147
|
+
self._sklearn_object = self._create_sklearn_object()
|
1148
|
+
return self._sklearn_object
|
1149
|
+
|
1150
|
+
def to_xgboost(self) -> Any:
|
1151
|
+
raise exceptions.SnowflakeMLException(
|
1152
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1153
|
+
original_exception=AttributeError(
|
1154
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1155
|
+
"to_xgboost()",
|
1156
|
+
"to_sklearn()"
|
1157
|
+
)
|
1158
|
+
),
|
1159
|
+
)
|
1160
|
+
|
1161
|
+
def to_lightgbm(self) -> Any:
|
1162
|
+
raise exceptions.SnowflakeMLException(
|
1163
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1164
|
+
original_exception=AttributeError(
|
1165
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1166
|
+
"to_lightgbm()",
|
1167
|
+
"to_sklearn()"
|
1168
|
+
)
|
1169
|
+
),
|
1170
|
+
)
|
1171
|
+
|
1172
|
+
def _get_dependencies(self) -> List[str]:
|
1173
|
+
return self._deps
|
1174
|
+
|
1072
1175
|
|
1073
|
-
def
|
1176
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
1074
1177
|
self._model_signature_dict = dict()
|
1075
1178
|
|
1076
1179
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1077
1180
|
|
1078
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1181
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1079
1182
|
outputs: List[BaseFeatureSpec] = []
|
1080
1183
|
if hasattr(self, "predict"):
|
1081
1184
|
# keep mypy happy
|
1082
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1185
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1083
1186
|
# For classifier, the type of predict is the same as the type of label
|
1084
|
-
if self._sklearn_object._estimator_type ==
|
1085
|
-
|
1187
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1188
|
+
# label columns is the desired type for output
|
1086
1189
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1087
1190
|
# rename the output columns
|
1088
1191
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1089
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1090
|
-
|
1091
|
-
|
1192
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1193
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1194
|
+
)
|
1092
1195
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1093
1196
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1094
|
-
# Clusterer returns int64 cluster labels.
|
1197
|
+
# Clusterer returns int64 cluster labels.
|
1095
1198
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1096
1199
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1097
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1200
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1201
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1202
|
+
)
|
1203
|
+
|
1101
1204
|
# For regressor, the type of predict is float64
|
1102
|
-
elif self._sklearn_object._estimator_type ==
|
1205
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1103
1206
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1104
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1105
|
-
|
1106
|
-
|
1107
|
-
|
1207
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1208
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1209
|
+
)
|
1210
|
+
|
1108
1211
|
for prob_func in PROB_FUNCTIONS:
|
1109
1212
|
if hasattr(self, prob_func):
|
1110
1213
|
output_cols_prefix: str = f"{prob_func}_"
|
1111
1214
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1112
1215
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1113
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1114
|
-
|
1115
|
-
|
1216
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1217
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1218
|
+
)
|
1116
1219
|
|
1117
1220
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1118
1221
|
items = list(self._model_signature_dict.items())
|
@@ -1125,10 +1228,10 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
1125
1228
|
"""Returns model signature of current class.
|
1126
1229
|
|
1127
1230
|
Raises:
|
1128
|
-
|
1231
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1129
1232
|
|
1130
1233
|
Returns:
|
1131
|
-
Dict
|
1234
|
+
Dict with each method and its input output signature
|
1132
1235
|
"""
|
1133
1236
|
if self._model_signature_dict is None:
|
1134
1237
|
raise exceptions.SnowflakeMLException(
|
@@ -1136,35 +1239,3 @@ class HistGradientBoostingRegressor(BaseTransformer):
|
|
1136
1239
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1137
1240
|
)
|
1138
1241
|
return self._model_signature_dict
|
1139
|
-
|
1140
|
-
def to_sklearn(self) -> Any:
|
1141
|
-
"""Get sklearn.ensemble.HistGradientBoostingRegressor object.
|
1142
|
-
"""
|
1143
|
-
if self._sklearn_object is None:
|
1144
|
-
self._sklearn_object = self._create_sklearn_object()
|
1145
|
-
return self._sklearn_object
|
1146
|
-
|
1147
|
-
def to_xgboost(self) -> Any:
|
1148
|
-
raise exceptions.SnowflakeMLException(
|
1149
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1150
|
-
original_exception=AttributeError(
|
1151
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1152
|
-
"to_xgboost()",
|
1153
|
-
"to_sklearn()"
|
1154
|
-
)
|
1155
|
-
),
|
1156
|
-
)
|
1157
|
-
|
1158
|
-
def to_lightgbm(self) -> Any:
|
1159
|
-
raise exceptions.SnowflakeMLException(
|
1160
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1161
|
-
original_exception=AttributeError(
|
1162
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1163
|
-
"to_lightgbm()",
|
1164
|
-
"to_sklearn()"
|
1165
|
-
)
|
1166
|
-
),
|
1167
|
-
)
|
1168
|
-
|
1169
|
-
def _get_dependencies(self) -> List[str]:
|
1170
|
-
return self._deps
|