snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -372,12 +371,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
372
371
|
)
|
373
372
|
return selected_cols
|
374
373
|
|
375
|
-
|
376
|
-
project=_PROJECT,
|
377
|
-
subproject=_SUBPROJECT,
|
378
|
-
custom_tags=dict([("autogen", True)]),
|
379
|
-
)
|
380
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "HistGradientBoostingClassifier":
|
374
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "HistGradientBoostingClassifier":
|
381
375
|
"""Fit the gradient boosting model
|
382
376
|
For more details on this function, see [sklearn.ensemble.HistGradientBoostingClassifier.fit]
|
383
377
|
(https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html#sklearn.ensemble.HistGradientBoostingClassifier.fit)
|
@@ -404,12 +398,14 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
404
398
|
|
405
399
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
406
400
|
|
407
|
-
|
401
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
408
402
|
if SNOWML_SPROC_ENV in os.environ:
|
409
403
|
statement_params = telemetry.get_function_usage_statement_params(
|
410
404
|
project=_PROJECT,
|
411
405
|
subproject=_SUBPROJECT,
|
412
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
406
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
407
|
+
inspect.currentframe(), HistGradientBoostingClassifier.__class__.__name__
|
408
|
+
),
|
413
409
|
api_calls=[Session.call],
|
414
410
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
415
411
|
)
|
@@ -430,7 +426,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
430
426
|
)
|
431
427
|
self._sklearn_object = model_trainer.train()
|
432
428
|
self._is_fitted = True
|
433
|
-
self.
|
429
|
+
self._generate_model_signatures(dataset)
|
434
430
|
return self
|
435
431
|
|
436
432
|
def _batch_inference_validate_snowpark(
|
@@ -506,7 +502,9 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
506
502
|
# when it is classifier, infer the datatype from label columns
|
507
503
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
508
504
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
509
|
-
label_cols_signatures = [
|
505
|
+
label_cols_signatures = [
|
506
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
507
|
+
]
|
510
508
|
if len(label_cols_signatures) == 0:
|
511
509
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
512
510
|
raise exceptions.SnowflakeMLException(
|
@@ -514,25 +512,22 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
514
512
|
original_exception=ValueError(error_str),
|
515
513
|
)
|
516
514
|
|
517
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
518
|
-
label_cols_signatures[0].as_snowpark_type()
|
519
|
-
)
|
515
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
520
516
|
|
521
517
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
522
|
-
assert isinstance(
|
518
|
+
assert isinstance(
|
519
|
+
dataset._session, Session
|
520
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
523
521
|
|
524
522
|
transform_kwargs = dict(
|
525
|
-
session
|
526
|
-
dependencies
|
527
|
-
drop_input_cols
|
528
|
-
expected_output_cols_type
|
523
|
+
session=dataset._session,
|
524
|
+
dependencies=self._deps,
|
525
|
+
drop_input_cols=self._drop_input_cols,
|
526
|
+
expected_output_cols_type=expected_type_inferred,
|
529
527
|
)
|
530
528
|
|
531
529
|
elif isinstance(dataset, pd.DataFrame):
|
532
|
-
transform_kwargs = dict(
|
533
|
-
snowpark_input_cols = self._snowpark_cols,
|
534
|
-
drop_input_cols = self._drop_input_cols
|
535
|
-
)
|
530
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
536
531
|
|
537
532
|
transform_handlers = ModelTransformerBuilder.build(
|
538
533
|
dataset=dataset,
|
@@ -572,7 +567,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
572
567
|
Transformed dataset.
|
573
568
|
"""
|
574
569
|
super()._check_dataset_type(dataset)
|
575
|
-
inference_method="transform"
|
570
|
+
inference_method = "transform"
|
576
571
|
|
577
572
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
578
573
|
# are specific to the type of dataset used.
|
@@ -609,17 +604,14 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
609
604
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
610
605
|
|
611
606
|
transform_kwargs = dict(
|
612
|
-
session
|
613
|
-
dependencies
|
614
|
-
drop_input_cols
|
615
|
-
expected_output_cols_type
|
607
|
+
session=dataset._session,
|
608
|
+
dependencies=self._deps,
|
609
|
+
drop_input_cols=self._drop_input_cols,
|
610
|
+
expected_output_cols_type=expected_dtype,
|
616
611
|
)
|
617
612
|
|
618
613
|
elif isinstance(dataset, pd.DataFrame):
|
619
|
-
transform_kwargs = dict(
|
620
|
-
snowpark_input_cols = self._snowpark_cols,
|
621
|
-
drop_input_cols = self._drop_input_cols
|
622
|
-
)
|
614
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
623
615
|
|
624
616
|
transform_handlers = ModelTransformerBuilder.build(
|
625
617
|
dataset=dataset,
|
@@ -638,7 +630,11 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
638
630
|
return output_df
|
639
631
|
|
640
632
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
641
|
-
def fit_predict(
|
633
|
+
def fit_predict(
|
634
|
+
self,
|
635
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
636
|
+
output_cols_prefix: str = "fit_predict_",
|
637
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
642
638
|
""" Method not supported for this class.
|
643
639
|
|
644
640
|
|
@@ -663,7 +659,9 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
663
659
|
)
|
664
660
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
665
661
|
drop_input_cols=self._drop_input_cols,
|
666
|
-
expected_output_cols_list=
|
662
|
+
expected_output_cols_list=(
|
663
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
664
|
+
),
|
667
665
|
)
|
668
666
|
self._sklearn_object = fitted_estimator
|
669
667
|
self._is_fitted = True
|
@@ -680,6 +678,62 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
680
678
|
assert self._sklearn_object is not None
|
681
679
|
return self._sklearn_object.embedding_
|
682
680
|
|
681
|
+
|
682
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
683
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
684
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
685
|
+
"""
|
686
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
687
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
688
|
+
if output_cols:
|
689
|
+
output_cols = [
|
690
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
691
|
+
for c in output_cols
|
692
|
+
]
|
693
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
694
|
+
output_cols = [output_cols_prefix]
|
695
|
+
elif self._sklearn_object is not None:
|
696
|
+
classes = self._sklearn_object.classes_
|
697
|
+
if isinstance(classes, numpy.ndarray):
|
698
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
699
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
700
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
701
|
+
output_cols = []
|
702
|
+
for i, cl in enumerate(classes):
|
703
|
+
# For binary classification, there is only one output column for each class
|
704
|
+
# ndarray as the two classes are complementary.
|
705
|
+
if len(cl) == 2:
|
706
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
707
|
+
else:
|
708
|
+
output_cols.extend([
|
709
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
710
|
+
])
|
711
|
+
else:
|
712
|
+
output_cols = []
|
713
|
+
|
714
|
+
# Make sure column names are valid snowflake identifiers.
|
715
|
+
assert output_cols is not None # Make MyPy happy
|
716
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
717
|
+
|
718
|
+
return rv
|
719
|
+
|
720
|
+
def _align_expected_output_names(
|
721
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
722
|
+
) -> List[str]:
|
723
|
+
# in case the inferred output column names dimension is different
|
724
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
725
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
726
|
+
output_df_columns = list(output_df_pd.columns)
|
727
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
728
|
+
if self.sample_weight_col:
|
729
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
730
|
+
# if the dimension of inferred output column names is correct; use it
|
731
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
732
|
+
return expected_output_cols_list
|
733
|
+
# otherwise, use the sklearn estimator's output
|
734
|
+
else:
|
735
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
736
|
+
|
683
737
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
684
738
|
@telemetry.send_api_usage_telemetry(
|
685
739
|
project=_PROJECT,
|
@@ -712,24 +766,28 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
712
766
|
# are specific to the type of dataset used.
|
713
767
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
714
768
|
|
769
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
770
|
+
|
715
771
|
if isinstance(dataset, DataFrame):
|
716
772
|
self._deps = self._batch_inference_validate_snowpark(
|
717
773
|
dataset=dataset,
|
718
774
|
inference_method=inference_method,
|
719
775
|
)
|
720
|
-
assert isinstance(
|
776
|
+
assert isinstance(
|
777
|
+
dataset._session, Session
|
778
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
721
779
|
transform_kwargs = dict(
|
722
780
|
session=dataset._session,
|
723
781
|
dependencies=self._deps,
|
724
|
-
drop_input_cols
|
782
|
+
drop_input_cols=self._drop_input_cols,
|
725
783
|
expected_output_cols_type="float",
|
726
784
|
)
|
785
|
+
expected_output_cols = self._align_expected_output_names(
|
786
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
787
|
+
)
|
727
788
|
|
728
789
|
elif isinstance(dataset, pd.DataFrame):
|
729
|
-
transform_kwargs = dict(
|
730
|
-
snowpark_input_cols = self._snowpark_cols,
|
731
|
-
drop_input_cols = self._drop_input_cols
|
732
|
-
)
|
790
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
733
791
|
|
734
792
|
transform_handlers = ModelTransformerBuilder.build(
|
735
793
|
dataset=dataset,
|
@@ -741,7 +799,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
741
799
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
742
800
|
inference_method=inference_method,
|
743
801
|
input_cols=self.input_cols,
|
744
|
-
expected_output_cols=
|
802
|
+
expected_output_cols=expected_output_cols,
|
745
803
|
**transform_kwargs
|
746
804
|
)
|
747
805
|
return output_df
|
@@ -773,7 +831,8 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
773
831
|
Output dataset with log probability of the sample for each class in the model.
|
774
832
|
"""
|
775
833
|
super()._check_dataset_type(dataset)
|
776
|
-
inference_method="predict_log_proba"
|
834
|
+
inference_method = "predict_log_proba"
|
835
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
777
836
|
|
778
837
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
779
838
|
# are specific to the type of dataset used.
|
@@ -784,18 +843,20 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
784
843
|
dataset=dataset,
|
785
844
|
inference_method=inference_method,
|
786
845
|
)
|
787
|
-
assert isinstance(
|
846
|
+
assert isinstance(
|
847
|
+
dataset._session, Session
|
848
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
788
849
|
transform_kwargs = dict(
|
789
850
|
session=dataset._session,
|
790
851
|
dependencies=self._deps,
|
791
|
-
drop_input_cols
|
852
|
+
drop_input_cols=self._drop_input_cols,
|
792
853
|
expected_output_cols_type="float",
|
793
854
|
)
|
855
|
+
expected_output_cols = self._align_expected_output_names(
|
856
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
857
|
+
)
|
794
858
|
elif isinstance(dataset, pd.DataFrame):
|
795
|
-
transform_kwargs = dict(
|
796
|
-
snowpark_input_cols = self._snowpark_cols,
|
797
|
-
drop_input_cols = self._drop_input_cols
|
798
|
-
)
|
859
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
799
860
|
|
800
861
|
transform_handlers = ModelTransformerBuilder.build(
|
801
862
|
dataset=dataset,
|
@@ -808,7 +869,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
808
869
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
809
870
|
inference_method=inference_method,
|
810
871
|
input_cols=self.input_cols,
|
811
|
-
expected_output_cols=
|
872
|
+
expected_output_cols=expected_output_cols,
|
812
873
|
**transform_kwargs
|
813
874
|
)
|
814
875
|
return output_df
|
@@ -836,30 +897,34 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
836
897
|
Output dataset with results of the decision function for the samples in input dataset.
|
837
898
|
"""
|
838
899
|
super()._check_dataset_type(dataset)
|
839
|
-
inference_method="decision_function"
|
900
|
+
inference_method = "decision_function"
|
840
901
|
|
841
902
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
842
903
|
# are specific to the type of dataset used.
|
843
904
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
844
905
|
|
906
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
907
|
+
|
845
908
|
if isinstance(dataset, DataFrame):
|
846
909
|
self._deps = self._batch_inference_validate_snowpark(
|
847
910
|
dataset=dataset,
|
848
911
|
inference_method=inference_method,
|
849
912
|
)
|
850
|
-
assert isinstance(
|
913
|
+
assert isinstance(
|
914
|
+
dataset._session, Session
|
915
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
851
916
|
transform_kwargs = dict(
|
852
917
|
session=dataset._session,
|
853
918
|
dependencies=self._deps,
|
854
|
-
drop_input_cols
|
919
|
+
drop_input_cols=self._drop_input_cols,
|
855
920
|
expected_output_cols_type="float",
|
856
921
|
)
|
922
|
+
expected_output_cols = self._align_expected_output_names(
|
923
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
924
|
+
)
|
857
925
|
|
858
926
|
elif isinstance(dataset, pd.DataFrame):
|
859
|
-
transform_kwargs = dict(
|
860
|
-
snowpark_input_cols = self._snowpark_cols,
|
861
|
-
drop_input_cols = self._drop_input_cols
|
862
|
-
)
|
927
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
863
928
|
|
864
929
|
transform_handlers = ModelTransformerBuilder.build(
|
865
930
|
dataset=dataset,
|
@@ -872,7 +937,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
872
937
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
873
938
|
inference_method=inference_method,
|
874
939
|
input_cols=self.input_cols,
|
875
|
-
expected_output_cols=
|
940
|
+
expected_output_cols=expected_output_cols,
|
876
941
|
**transform_kwargs
|
877
942
|
)
|
878
943
|
return output_df
|
@@ -901,12 +966,14 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
901
966
|
Output dataset with probability of the sample for each class in the model.
|
902
967
|
"""
|
903
968
|
super()._check_dataset_type(dataset)
|
904
|
-
inference_method="score_samples"
|
969
|
+
inference_method = "score_samples"
|
905
970
|
|
906
971
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
907
972
|
# are specific to the type of dataset used.
|
908
973
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
909
974
|
|
975
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
976
|
+
|
910
977
|
if isinstance(dataset, DataFrame):
|
911
978
|
self._deps = self._batch_inference_validate_snowpark(
|
912
979
|
dataset=dataset,
|
@@ -919,6 +986,9 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
919
986
|
drop_input_cols = self._drop_input_cols,
|
920
987
|
expected_output_cols_type="float",
|
921
988
|
)
|
989
|
+
expected_output_cols = self._align_expected_output_names(
|
990
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
991
|
+
)
|
922
992
|
|
923
993
|
elif isinstance(dataset, pd.DataFrame):
|
924
994
|
transform_kwargs = dict(
|
@@ -937,7 +1007,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
937
1007
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
938
1008
|
inference_method=inference_method,
|
939
1009
|
input_cols=self.input_cols,
|
940
|
-
expected_output_cols=
|
1010
|
+
expected_output_cols=expected_output_cols,
|
941
1011
|
**transform_kwargs
|
942
1012
|
)
|
943
1013
|
return output_df
|
@@ -1084,50 +1154,84 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
1084
1154
|
)
|
1085
1155
|
return output_df
|
1086
1156
|
|
1157
|
+
|
1158
|
+
|
1159
|
+
def to_sklearn(self) -> Any:
|
1160
|
+
"""Get sklearn.ensemble.HistGradientBoostingClassifier object.
|
1161
|
+
"""
|
1162
|
+
if self._sklearn_object is None:
|
1163
|
+
self._sklearn_object = self._create_sklearn_object()
|
1164
|
+
return self._sklearn_object
|
1165
|
+
|
1166
|
+
def to_xgboost(self) -> Any:
|
1167
|
+
raise exceptions.SnowflakeMLException(
|
1168
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1169
|
+
original_exception=AttributeError(
|
1170
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1171
|
+
"to_xgboost()",
|
1172
|
+
"to_sklearn()"
|
1173
|
+
)
|
1174
|
+
),
|
1175
|
+
)
|
1176
|
+
|
1177
|
+
def to_lightgbm(self) -> Any:
|
1178
|
+
raise exceptions.SnowflakeMLException(
|
1179
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1180
|
+
original_exception=AttributeError(
|
1181
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1182
|
+
"to_lightgbm()",
|
1183
|
+
"to_sklearn()"
|
1184
|
+
)
|
1185
|
+
),
|
1186
|
+
)
|
1087
1187
|
|
1088
|
-
def
|
1188
|
+
def _get_dependencies(self) -> List[str]:
|
1189
|
+
return self._deps
|
1190
|
+
|
1191
|
+
|
1192
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
1089
1193
|
self._model_signature_dict = dict()
|
1090
1194
|
|
1091
1195
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1092
1196
|
|
1093
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1197
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1094
1198
|
outputs: List[BaseFeatureSpec] = []
|
1095
1199
|
if hasattr(self, "predict"):
|
1096
1200
|
# keep mypy happy
|
1097
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1201
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1098
1202
|
# For classifier, the type of predict is the same as the type of label
|
1099
|
-
if self._sklearn_object._estimator_type ==
|
1100
|
-
|
1203
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1204
|
+
# label columns is the desired type for output
|
1101
1205
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1102
1206
|
# rename the output columns
|
1103
1207
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1104
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1105
|
-
|
1106
|
-
|
1208
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1209
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1210
|
+
)
|
1107
1211
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1108
1212
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1109
|
-
# Clusterer returns int64 cluster labels.
|
1213
|
+
# Clusterer returns int64 cluster labels.
|
1110
1214
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1111
1215
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1112
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1113
|
-
|
1114
|
-
|
1115
|
-
|
1216
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1217
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1218
|
+
)
|
1219
|
+
|
1116
1220
|
# For regressor, the type of predict is float64
|
1117
|
-
elif self._sklearn_object._estimator_type ==
|
1221
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1118
1222
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1119
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1223
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1224
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1225
|
+
)
|
1226
|
+
|
1123
1227
|
for prob_func in PROB_FUNCTIONS:
|
1124
1228
|
if hasattr(self, prob_func):
|
1125
1229
|
output_cols_prefix: str = f"{prob_func}_"
|
1126
1230
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1127
1231
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1128
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1129
|
-
|
1130
|
-
|
1232
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1233
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1234
|
+
)
|
1131
1235
|
|
1132
1236
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1133
1237
|
items = list(self._model_signature_dict.items())
|
@@ -1140,10 +1244,10 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
1140
1244
|
"""Returns model signature of current class.
|
1141
1245
|
|
1142
1246
|
Raises:
|
1143
|
-
|
1247
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1144
1248
|
|
1145
1249
|
Returns:
|
1146
|
-
Dict
|
1250
|
+
Dict with each method and its input output signature
|
1147
1251
|
"""
|
1148
1252
|
if self._model_signature_dict is None:
|
1149
1253
|
raise exceptions.SnowflakeMLException(
|
@@ -1151,35 +1255,3 @@ class HistGradientBoostingClassifier(BaseTransformer):
|
|
1151
1255
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1152
1256
|
)
|
1153
1257
|
return self._model_signature_dict
|
1154
|
-
|
1155
|
-
def to_sklearn(self) -> Any:
|
1156
|
-
"""Get sklearn.ensemble.HistGradientBoostingClassifier object.
|
1157
|
-
"""
|
1158
|
-
if self._sklearn_object is None:
|
1159
|
-
self._sklearn_object = self._create_sklearn_object()
|
1160
|
-
return self._sklearn_object
|
1161
|
-
|
1162
|
-
def to_xgboost(self) -> Any:
|
1163
|
-
raise exceptions.SnowflakeMLException(
|
1164
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1165
|
-
original_exception=AttributeError(
|
1166
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1167
|
-
"to_xgboost()",
|
1168
|
-
"to_sklearn()"
|
1169
|
-
)
|
1170
|
-
),
|
1171
|
-
)
|
1172
|
-
|
1173
|
-
def to_lightgbm(self) -> Any:
|
1174
|
-
raise exceptions.SnowflakeMLException(
|
1175
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1176
|
-
original_exception=AttributeError(
|
1177
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1178
|
-
"to_lightgbm()",
|
1179
|
-
"to_sklearn()"
|
1180
|
-
)
|
1181
|
-
),
|
1182
|
-
)
|
1183
|
-
|
1184
|
-
def _get_dependencies(self) -> List[str]:
|
1185
|
-
return self._deps
|