snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -312,12 +311,7 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
312
311
|
)
|
313
312
|
return selected_cols
|
314
313
|
|
315
|
-
|
316
|
-
project=_PROJECT,
|
317
|
-
subproject=_SUBPROJECT,
|
318
|
-
custom_tags=dict([("autogen", True)]),
|
319
|
-
)
|
320
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "DecisionTreeRegressor":
|
314
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "DecisionTreeRegressor":
|
321
315
|
"""Build a decision tree regressor from the training set (X, y)
|
322
316
|
For more details on this function, see [sklearn.tree.DecisionTreeRegressor.fit]
|
323
317
|
(https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor.fit)
|
@@ -344,12 +338,14 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
344
338
|
|
345
339
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
346
340
|
|
347
|
-
|
341
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
348
342
|
if SNOWML_SPROC_ENV in os.environ:
|
349
343
|
statement_params = telemetry.get_function_usage_statement_params(
|
350
344
|
project=_PROJECT,
|
351
345
|
subproject=_SUBPROJECT,
|
352
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
346
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
347
|
+
inspect.currentframe(), DecisionTreeRegressor.__class__.__name__
|
348
|
+
),
|
353
349
|
api_calls=[Session.call],
|
354
350
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
355
351
|
)
|
@@ -370,7 +366,7 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
370
366
|
)
|
371
367
|
self._sklearn_object = model_trainer.train()
|
372
368
|
self._is_fitted = True
|
373
|
-
self.
|
369
|
+
self._generate_model_signatures(dataset)
|
374
370
|
return self
|
375
371
|
|
376
372
|
def _batch_inference_validate_snowpark(
|
@@ -446,7 +442,9 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
446
442
|
# when it is classifier, infer the datatype from label columns
|
447
443
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
448
444
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
449
|
-
label_cols_signatures = [
|
445
|
+
label_cols_signatures = [
|
446
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
447
|
+
]
|
450
448
|
if len(label_cols_signatures) == 0:
|
451
449
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
452
450
|
raise exceptions.SnowflakeMLException(
|
@@ -454,25 +452,22 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
454
452
|
original_exception=ValueError(error_str),
|
455
453
|
)
|
456
454
|
|
457
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
458
|
-
label_cols_signatures[0].as_snowpark_type()
|
459
|
-
)
|
455
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
460
456
|
|
461
457
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
462
|
-
assert isinstance(
|
458
|
+
assert isinstance(
|
459
|
+
dataset._session, Session
|
460
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
463
461
|
|
464
462
|
transform_kwargs = dict(
|
465
|
-
session
|
466
|
-
dependencies
|
467
|
-
drop_input_cols
|
468
|
-
expected_output_cols_type
|
463
|
+
session=dataset._session,
|
464
|
+
dependencies=self._deps,
|
465
|
+
drop_input_cols=self._drop_input_cols,
|
466
|
+
expected_output_cols_type=expected_type_inferred,
|
469
467
|
)
|
470
468
|
|
471
469
|
elif isinstance(dataset, pd.DataFrame):
|
472
|
-
transform_kwargs = dict(
|
473
|
-
snowpark_input_cols = self._snowpark_cols,
|
474
|
-
drop_input_cols = self._drop_input_cols
|
475
|
-
)
|
470
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
476
471
|
|
477
472
|
transform_handlers = ModelTransformerBuilder.build(
|
478
473
|
dataset=dataset,
|
@@ -512,7 +507,7 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
512
507
|
Transformed dataset.
|
513
508
|
"""
|
514
509
|
super()._check_dataset_type(dataset)
|
515
|
-
inference_method="transform"
|
510
|
+
inference_method = "transform"
|
516
511
|
|
517
512
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
518
513
|
# are specific to the type of dataset used.
|
@@ -549,17 +544,14 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
549
544
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
550
545
|
|
551
546
|
transform_kwargs = dict(
|
552
|
-
session
|
553
|
-
dependencies
|
554
|
-
drop_input_cols
|
555
|
-
expected_output_cols_type
|
547
|
+
session=dataset._session,
|
548
|
+
dependencies=self._deps,
|
549
|
+
drop_input_cols=self._drop_input_cols,
|
550
|
+
expected_output_cols_type=expected_dtype,
|
556
551
|
)
|
557
552
|
|
558
553
|
elif isinstance(dataset, pd.DataFrame):
|
559
|
-
transform_kwargs = dict(
|
560
|
-
snowpark_input_cols = self._snowpark_cols,
|
561
|
-
drop_input_cols = self._drop_input_cols
|
562
|
-
)
|
554
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
563
555
|
|
564
556
|
transform_handlers = ModelTransformerBuilder.build(
|
565
557
|
dataset=dataset,
|
@@ -578,7 +570,11 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
578
570
|
return output_df
|
579
571
|
|
580
572
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
581
|
-
def fit_predict(
|
573
|
+
def fit_predict(
|
574
|
+
self,
|
575
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
576
|
+
output_cols_prefix: str = "fit_predict_",
|
577
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
582
578
|
""" Method not supported for this class.
|
583
579
|
|
584
580
|
|
@@ -603,7 +599,9 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
603
599
|
)
|
604
600
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
605
601
|
drop_input_cols=self._drop_input_cols,
|
606
|
-
expected_output_cols_list=
|
602
|
+
expected_output_cols_list=(
|
603
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
604
|
+
),
|
607
605
|
)
|
608
606
|
self._sklearn_object = fitted_estimator
|
609
607
|
self._is_fitted = True
|
@@ -620,6 +618,62 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
620
618
|
assert self._sklearn_object is not None
|
621
619
|
return self._sklearn_object.embedding_
|
622
620
|
|
621
|
+
|
622
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
623
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
624
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
625
|
+
"""
|
626
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
627
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
628
|
+
if output_cols:
|
629
|
+
output_cols = [
|
630
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
631
|
+
for c in output_cols
|
632
|
+
]
|
633
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
634
|
+
output_cols = [output_cols_prefix]
|
635
|
+
elif self._sklearn_object is not None:
|
636
|
+
classes = self._sklearn_object.classes_
|
637
|
+
if isinstance(classes, numpy.ndarray):
|
638
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
639
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
640
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
641
|
+
output_cols = []
|
642
|
+
for i, cl in enumerate(classes):
|
643
|
+
# For binary classification, there is only one output column for each class
|
644
|
+
# ndarray as the two classes are complementary.
|
645
|
+
if len(cl) == 2:
|
646
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
647
|
+
else:
|
648
|
+
output_cols.extend([
|
649
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
650
|
+
])
|
651
|
+
else:
|
652
|
+
output_cols = []
|
653
|
+
|
654
|
+
# Make sure column names are valid snowflake identifiers.
|
655
|
+
assert output_cols is not None # Make MyPy happy
|
656
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
657
|
+
|
658
|
+
return rv
|
659
|
+
|
660
|
+
def _align_expected_output_names(
|
661
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
662
|
+
) -> List[str]:
|
663
|
+
# in case the inferred output column names dimension is different
|
664
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
665
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
666
|
+
output_df_columns = list(output_df_pd.columns)
|
667
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
668
|
+
if self.sample_weight_col:
|
669
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
670
|
+
# if the dimension of inferred output column names is correct; use it
|
671
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
672
|
+
return expected_output_cols_list
|
673
|
+
# otherwise, use the sklearn estimator's output
|
674
|
+
else:
|
675
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
676
|
+
|
623
677
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
624
678
|
@telemetry.send_api_usage_telemetry(
|
625
679
|
project=_PROJECT,
|
@@ -650,24 +704,28 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
650
704
|
# are specific to the type of dataset used.
|
651
705
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
652
706
|
|
707
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
708
|
+
|
653
709
|
if isinstance(dataset, DataFrame):
|
654
710
|
self._deps = self._batch_inference_validate_snowpark(
|
655
711
|
dataset=dataset,
|
656
712
|
inference_method=inference_method,
|
657
713
|
)
|
658
|
-
assert isinstance(
|
714
|
+
assert isinstance(
|
715
|
+
dataset._session, Session
|
716
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
659
717
|
transform_kwargs = dict(
|
660
718
|
session=dataset._session,
|
661
719
|
dependencies=self._deps,
|
662
|
-
drop_input_cols
|
720
|
+
drop_input_cols=self._drop_input_cols,
|
663
721
|
expected_output_cols_type="float",
|
664
722
|
)
|
723
|
+
expected_output_cols = self._align_expected_output_names(
|
724
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
725
|
+
)
|
665
726
|
|
666
727
|
elif isinstance(dataset, pd.DataFrame):
|
667
|
-
transform_kwargs = dict(
|
668
|
-
snowpark_input_cols = self._snowpark_cols,
|
669
|
-
drop_input_cols = self._drop_input_cols
|
670
|
-
)
|
728
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
671
729
|
|
672
730
|
transform_handlers = ModelTransformerBuilder.build(
|
673
731
|
dataset=dataset,
|
@@ -679,7 +737,7 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
679
737
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
680
738
|
inference_method=inference_method,
|
681
739
|
input_cols=self.input_cols,
|
682
|
-
expected_output_cols=
|
740
|
+
expected_output_cols=expected_output_cols,
|
683
741
|
**transform_kwargs
|
684
742
|
)
|
685
743
|
return output_df
|
@@ -709,7 +767,8 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
709
767
|
Output dataset with log probability of the sample for each class in the model.
|
710
768
|
"""
|
711
769
|
super()._check_dataset_type(dataset)
|
712
|
-
inference_method="predict_log_proba"
|
770
|
+
inference_method = "predict_log_proba"
|
771
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
713
772
|
|
714
773
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
715
774
|
# are specific to the type of dataset used.
|
@@ -720,18 +779,20 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
720
779
|
dataset=dataset,
|
721
780
|
inference_method=inference_method,
|
722
781
|
)
|
723
|
-
assert isinstance(
|
782
|
+
assert isinstance(
|
783
|
+
dataset._session, Session
|
784
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
724
785
|
transform_kwargs = dict(
|
725
786
|
session=dataset._session,
|
726
787
|
dependencies=self._deps,
|
727
|
-
drop_input_cols
|
788
|
+
drop_input_cols=self._drop_input_cols,
|
728
789
|
expected_output_cols_type="float",
|
729
790
|
)
|
791
|
+
expected_output_cols = self._align_expected_output_names(
|
792
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
793
|
+
)
|
730
794
|
elif isinstance(dataset, pd.DataFrame):
|
731
|
-
transform_kwargs = dict(
|
732
|
-
snowpark_input_cols = self._snowpark_cols,
|
733
|
-
drop_input_cols = self._drop_input_cols
|
734
|
-
)
|
795
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
735
796
|
|
736
797
|
transform_handlers = ModelTransformerBuilder.build(
|
737
798
|
dataset=dataset,
|
@@ -744,7 +805,7 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
744
805
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
745
806
|
inference_method=inference_method,
|
746
807
|
input_cols=self.input_cols,
|
747
|
-
expected_output_cols=
|
808
|
+
expected_output_cols=expected_output_cols,
|
748
809
|
**transform_kwargs
|
749
810
|
)
|
750
811
|
return output_df
|
@@ -770,30 +831,34 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
770
831
|
Output dataset with results of the decision function for the samples in input dataset.
|
771
832
|
"""
|
772
833
|
super()._check_dataset_type(dataset)
|
773
|
-
inference_method="decision_function"
|
834
|
+
inference_method = "decision_function"
|
774
835
|
|
775
836
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
776
837
|
# are specific to the type of dataset used.
|
777
838
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
778
839
|
|
840
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
841
|
+
|
779
842
|
if isinstance(dataset, DataFrame):
|
780
843
|
self._deps = self._batch_inference_validate_snowpark(
|
781
844
|
dataset=dataset,
|
782
845
|
inference_method=inference_method,
|
783
846
|
)
|
784
|
-
assert isinstance(
|
847
|
+
assert isinstance(
|
848
|
+
dataset._session, Session
|
849
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
785
850
|
transform_kwargs = dict(
|
786
851
|
session=dataset._session,
|
787
852
|
dependencies=self._deps,
|
788
|
-
drop_input_cols
|
853
|
+
drop_input_cols=self._drop_input_cols,
|
789
854
|
expected_output_cols_type="float",
|
790
855
|
)
|
856
|
+
expected_output_cols = self._align_expected_output_names(
|
857
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
858
|
+
)
|
791
859
|
|
792
860
|
elif isinstance(dataset, pd.DataFrame):
|
793
|
-
transform_kwargs = dict(
|
794
|
-
snowpark_input_cols = self._snowpark_cols,
|
795
|
-
drop_input_cols = self._drop_input_cols
|
796
|
-
)
|
861
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
797
862
|
|
798
863
|
transform_handlers = ModelTransformerBuilder.build(
|
799
864
|
dataset=dataset,
|
@@ -806,7 +871,7 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
806
871
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
807
872
|
inference_method=inference_method,
|
808
873
|
input_cols=self.input_cols,
|
809
|
-
expected_output_cols=
|
874
|
+
expected_output_cols=expected_output_cols,
|
810
875
|
**transform_kwargs
|
811
876
|
)
|
812
877
|
return output_df
|
@@ -835,12 +900,14 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
835
900
|
Output dataset with probability of the sample for each class in the model.
|
836
901
|
"""
|
837
902
|
super()._check_dataset_type(dataset)
|
838
|
-
inference_method="score_samples"
|
903
|
+
inference_method = "score_samples"
|
839
904
|
|
840
905
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
841
906
|
# are specific to the type of dataset used.
|
842
907
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
843
908
|
|
909
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
910
|
+
|
844
911
|
if isinstance(dataset, DataFrame):
|
845
912
|
self._deps = self._batch_inference_validate_snowpark(
|
846
913
|
dataset=dataset,
|
@@ -853,6 +920,9 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
853
920
|
drop_input_cols = self._drop_input_cols,
|
854
921
|
expected_output_cols_type="float",
|
855
922
|
)
|
923
|
+
expected_output_cols = self._align_expected_output_names(
|
924
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
925
|
+
)
|
856
926
|
|
857
927
|
elif isinstance(dataset, pd.DataFrame):
|
858
928
|
transform_kwargs = dict(
|
@@ -871,7 +941,7 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
871
941
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
872
942
|
inference_method=inference_method,
|
873
943
|
input_cols=self.input_cols,
|
874
|
-
expected_output_cols=
|
944
|
+
expected_output_cols=expected_output_cols,
|
875
945
|
**transform_kwargs
|
876
946
|
)
|
877
947
|
return output_df
|
@@ -1018,50 +1088,84 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
1018
1088
|
)
|
1019
1089
|
return output_df
|
1020
1090
|
|
1091
|
+
|
1092
|
+
|
1093
|
+
def to_sklearn(self) -> Any:
|
1094
|
+
"""Get sklearn.tree.DecisionTreeRegressor object.
|
1095
|
+
"""
|
1096
|
+
if self._sklearn_object is None:
|
1097
|
+
self._sklearn_object = self._create_sklearn_object()
|
1098
|
+
return self._sklearn_object
|
1099
|
+
|
1100
|
+
def to_xgboost(self) -> Any:
|
1101
|
+
raise exceptions.SnowflakeMLException(
|
1102
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1103
|
+
original_exception=AttributeError(
|
1104
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1105
|
+
"to_xgboost()",
|
1106
|
+
"to_sklearn()"
|
1107
|
+
)
|
1108
|
+
),
|
1109
|
+
)
|
1110
|
+
|
1111
|
+
def to_lightgbm(self) -> Any:
|
1112
|
+
raise exceptions.SnowflakeMLException(
|
1113
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1114
|
+
original_exception=AttributeError(
|
1115
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1116
|
+
"to_lightgbm()",
|
1117
|
+
"to_sklearn()"
|
1118
|
+
)
|
1119
|
+
),
|
1120
|
+
)
|
1021
1121
|
|
1022
|
-
def
|
1122
|
+
def _get_dependencies(self) -> List[str]:
|
1123
|
+
return self._deps
|
1124
|
+
|
1125
|
+
|
1126
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
1023
1127
|
self._model_signature_dict = dict()
|
1024
1128
|
|
1025
1129
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1026
1130
|
|
1027
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1131
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1028
1132
|
outputs: List[BaseFeatureSpec] = []
|
1029
1133
|
if hasattr(self, "predict"):
|
1030
1134
|
# keep mypy happy
|
1031
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1135
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1032
1136
|
# For classifier, the type of predict is the same as the type of label
|
1033
|
-
if self._sklearn_object._estimator_type ==
|
1034
|
-
|
1137
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1138
|
+
# label columns is the desired type for output
|
1035
1139
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1036
1140
|
# rename the output columns
|
1037
1141
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1038
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1039
|
-
|
1040
|
-
|
1142
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1143
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1144
|
+
)
|
1041
1145
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1042
1146
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1043
|
-
# Clusterer returns int64 cluster labels.
|
1147
|
+
# Clusterer returns int64 cluster labels.
|
1044
1148
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1045
1149
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1046
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1047
|
-
|
1048
|
-
|
1049
|
-
|
1150
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1151
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1152
|
+
)
|
1153
|
+
|
1050
1154
|
# For regressor, the type of predict is float64
|
1051
|
-
elif self._sklearn_object._estimator_type ==
|
1155
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1052
1156
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1053
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1054
|
-
|
1055
|
-
|
1056
|
-
|
1157
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1158
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1159
|
+
)
|
1160
|
+
|
1057
1161
|
for prob_func in PROB_FUNCTIONS:
|
1058
1162
|
if hasattr(self, prob_func):
|
1059
1163
|
output_cols_prefix: str = f"{prob_func}_"
|
1060
1164
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1061
1165
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1062
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1063
|
-
|
1064
|
-
|
1166
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1167
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1168
|
+
)
|
1065
1169
|
|
1066
1170
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1067
1171
|
items = list(self._model_signature_dict.items())
|
@@ -1074,10 +1178,10 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
1074
1178
|
"""Returns model signature of current class.
|
1075
1179
|
|
1076
1180
|
Raises:
|
1077
|
-
|
1181
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1078
1182
|
|
1079
1183
|
Returns:
|
1080
|
-
Dict
|
1184
|
+
Dict with each method and its input output signature
|
1081
1185
|
"""
|
1082
1186
|
if self._model_signature_dict is None:
|
1083
1187
|
raise exceptions.SnowflakeMLException(
|
@@ -1085,35 +1189,3 @@ class DecisionTreeRegressor(BaseTransformer):
|
|
1085
1189
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1086
1190
|
)
|
1087
1191
|
return self._model_signature_dict
|
1088
|
-
|
1089
|
-
def to_sklearn(self) -> Any:
|
1090
|
-
"""Get sklearn.tree.DecisionTreeRegressor object.
|
1091
|
-
"""
|
1092
|
-
if self._sklearn_object is None:
|
1093
|
-
self._sklearn_object = self._create_sklearn_object()
|
1094
|
-
return self._sklearn_object
|
1095
|
-
|
1096
|
-
def to_xgboost(self) -> Any:
|
1097
|
-
raise exceptions.SnowflakeMLException(
|
1098
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1099
|
-
original_exception=AttributeError(
|
1100
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1101
|
-
"to_xgboost()",
|
1102
|
-
"to_sklearn()"
|
1103
|
-
)
|
1104
|
-
),
|
1105
|
-
)
|
1106
|
-
|
1107
|
-
def to_lightgbm(self) -> Any:
|
1108
|
-
raise exceptions.SnowflakeMLException(
|
1109
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1110
|
-
original_exception=AttributeError(
|
1111
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1112
|
-
"to_lightgbm()",
|
1113
|
-
"to_sklearn()"
|
1114
|
-
)
|
1115
|
-
),
|
1116
|
-
)
|
1117
|
-
|
1118
|
-
def _get_dependencies(self) -> List[str]:
|
1119
|
-
return self._deps
|