snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -34,6 +34,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
34
34
|
BatchInferenceKwargsTypedDict,
|
35
35
|
ScoreKwargsTypedDict
|
36
36
|
)
|
37
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
38
|
+
from snowflake.ml.model.model_signature import (
|
39
|
+
BaseFeatureSpec,
|
40
|
+
DataType,
|
41
|
+
FeatureSpec,
|
42
|
+
ModelSignature,
|
43
|
+
_infer_signature,
|
44
|
+
_rename_signature_with_snowflake_identifiers,
|
45
|
+
)
|
37
46
|
|
38
47
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
39
48
|
|
@@ -44,16 +53,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
44
53
|
validate_sklearn_args,
|
45
54
|
)
|
46
55
|
|
47
|
-
from snowflake.ml.model.model_signature import (
|
48
|
-
DataType,
|
49
|
-
FeatureSpec,
|
50
|
-
ModelSignature,
|
51
|
-
_infer_signature,
|
52
|
-
_rename_signature_with_snowflake_identifiers,
|
53
|
-
BaseFeatureSpec,
|
54
|
-
)
|
55
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
56
|
-
|
57
56
|
_PROJECT = "ModelDevelopment"
|
58
57
|
# Derive subproject from module name by removing "sklearn"
|
59
58
|
# and converting module name from underscore to CamelCase
|
@@ -324,12 +323,7 @@ class IterativeImputer(BaseTransformer):
|
|
324
323
|
)
|
325
324
|
return selected_cols
|
326
325
|
|
327
|
-
|
328
|
-
project=_PROJECT,
|
329
|
-
subproject=_SUBPROJECT,
|
330
|
-
custom_tags=dict([("autogen", True)]),
|
331
|
-
)
|
332
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "IterativeImputer":
|
326
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "IterativeImputer":
|
333
327
|
"""Fit the imputer on `X` and return self
|
334
328
|
For more details on this function, see [sklearn.impute.IterativeImputer.fit]
|
335
329
|
(https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html#sklearn.impute.IterativeImputer.fit)
|
@@ -356,12 +350,14 @@ class IterativeImputer(BaseTransformer):
|
|
356
350
|
|
357
351
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
358
352
|
|
359
|
-
|
353
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
360
354
|
if SNOWML_SPROC_ENV in os.environ:
|
361
355
|
statement_params = telemetry.get_function_usage_statement_params(
|
362
356
|
project=_PROJECT,
|
363
357
|
subproject=_SUBPROJECT,
|
364
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
358
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
359
|
+
inspect.currentframe(), IterativeImputer.__class__.__name__
|
360
|
+
),
|
365
361
|
api_calls=[Session.call],
|
366
362
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
367
363
|
)
|
@@ -382,7 +378,7 @@ class IterativeImputer(BaseTransformer):
|
|
382
378
|
)
|
383
379
|
self._sklearn_object = model_trainer.train()
|
384
380
|
self._is_fitted = True
|
385
|
-
self.
|
381
|
+
self._generate_model_signatures(dataset)
|
386
382
|
return self
|
387
383
|
|
388
384
|
def _batch_inference_validate_snowpark(
|
@@ -456,7 +452,9 @@ class IterativeImputer(BaseTransformer):
|
|
456
452
|
# when it is classifier, infer the datatype from label columns
|
457
453
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
458
454
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
459
|
-
label_cols_signatures = [
|
455
|
+
label_cols_signatures = [
|
456
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
457
|
+
]
|
460
458
|
if len(label_cols_signatures) == 0:
|
461
459
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
462
460
|
raise exceptions.SnowflakeMLException(
|
@@ -464,25 +462,22 @@ class IterativeImputer(BaseTransformer):
|
|
464
462
|
original_exception=ValueError(error_str),
|
465
463
|
)
|
466
464
|
|
467
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
468
|
-
label_cols_signatures[0].as_snowpark_type()
|
469
|
-
)
|
465
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
470
466
|
|
471
467
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
472
|
-
assert isinstance(
|
468
|
+
assert isinstance(
|
469
|
+
dataset._session, Session
|
470
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
473
471
|
|
474
472
|
transform_kwargs = dict(
|
475
|
-
session
|
476
|
-
dependencies
|
477
|
-
drop_input_cols
|
478
|
-
expected_output_cols_type
|
473
|
+
session=dataset._session,
|
474
|
+
dependencies=self._deps,
|
475
|
+
drop_input_cols=self._drop_input_cols,
|
476
|
+
expected_output_cols_type=expected_type_inferred,
|
479
477
|
)
|
480
478
|
|
481
479
|
elif isinstance(dataset, pd.DataFrame):
|
482
|
-
transform_kwargs = dict(
|
483
|
-
snowpark_input_cols = self._snowpark_cols,
|
484
|
-
drop_input_cols = self._drop_input_cols
|
485
|
-
)
|
480
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
486
481
|
|
487
482
|
transform_handlers = ModelTransformerBuilder.build(
|
488
483
|
dataset=dataset,
|
@@ -524,7 +519,7 @@ class IterativeImputer(BaseTransformer):
|
|
524
519
|
Transformed dataset.
|
525
520
|
"""
|
526
521
|
super()._check_dataset_type(dataset)
|
527
|
-
inference_method="transform"
|
522
|
+
inference_method = "transform"
|
528
523
|
|
529
524
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
530
525
|
# are specific to the type of dataset used.
|
@@ -561,17 +556,14 @@ class IterativeImputer(BaseTransformer):
|
|
561
556
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
562
557
|
|
563
558
|
transform_kwargs = dict(
|
564
|
-
session
|
565
|
-
dependencies
|
566
|
-
drop_input_cols
|
567
|
-
expected_output_cols_type
|
559
|
+
session=dataset._session,
|
560
|
+
dependencies=self._deps,
|
561
|
+
drop_input_cols=self._drop_input_cols,
|
562
|
+
expected_output_cols_type=expected_dtype,
|
568
563
|
)
|
569
564
|
|
570
565
|
elif isinstance(dataset, pd.DataFrame):
|
571
|
-
transform_kwargs = dict(
|
572
|
-
snowpark_input_cols = self._snowpark_cols,
|
573
|
-
drop_input_cols = self._drop_input_cols
|
574
|
-
)
|
566
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
575
567
|
|
576
568
|
transform_handlers = ModelTransformerBuilder.build(
|
577
569
|
dataset=dataset,
|
@@ -590,7 +582,11 @@ class IterativeImputer(BaseTransformer):
|
|
590
582
|
return output_df
|
591
583
|
|
592
584
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
593
|
-
def fit_predict(
|
585
|
+
def fit_predict(
|
586
|
+
self,
|
587
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
588
|
+
output_cols_prefix: str = "fit_predict_",
|
589
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
594
590
|
""" Method not supported for this class.
|
595
591
|
|
596
592
|
|
@@ -615,7 +611,9 @@ class IterativeImputer(BaseTransformer):
|
|
615
611
|
)
|
616
612
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
617
613
|
drop_input_cols=self._drop_input_cols,
|
618
|
-
expected_output_cols_list=
|
614
|
+
expected_output_cols_list=(
|
615
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
616
|
+
),
|
619
617
|
)
|
620
618
|
self._sklearn_object = fitted_estimator
|
621
619
|
self._is_fitted = True
|
@@ -632,6 +630,62 @@ class IterativeImputer(BaseTransformer):
|
|
632
630
|
assert self._sklearn_object is not None
|
633
631
|
return self._sklearn_object.embedding_
|
634
632
|
|
633
|
+
|
634
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
635
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
636
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
637
|
+
"""
|
638
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
639
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
640
|
+
if output_cols:
|
641
|
+
output_cols = [
|
642
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
643
|
+
for c in output_cols
|
644
|
+
]
|
645
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
646
|
+
output_cols = [output_cols_prefix]
|
647
|
+
elif self._sklearn_object is not None:
|
648
|
+
classes = self._sklearn_object.classes_
|
649
|
+
if isinstance(classes, numpy.ndarray):
|
650
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
651
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
652
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
653
|
+
output_cols = []
|
654
|
+
for i, cl in enumerate(classes):
|
655
|
+
# For binary classification, there is only one output column for each class
|
656
|
+
# ndarray as the two classes are complementary.
|
657
|
+
if len(cl) == 2:
|
658
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
659
|
+
else:
|
660
|
+
output_cols.extend([
|
661
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
662
|
+
])
|
663
|
+
else:
|
664
|
+
output_cols = []
|
665
|
+
|
666
|
+
# Make sure column names are valid snowflake identifiers.
|
667
|
+
assert output_cols is not None # Make MyPy happy
|
668
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
669
|
+
|
670
|
+
return rv
|
671
|
+
|
672
|
+
def _align_expected_output_names(
|
673
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
674
|
+
) -> List[str]:
|
675
|
+
# in case the inferred output column names dimension is different
|
676
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
677
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
678
|
+
output_df_columns = list(output_df_pd.columns)
|
679
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
680
|
+
if self.sample_weight_col:
|
681
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
682
|
+
# if the dimension of inferred output column names is correct; use it
|
683
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
684
|
+
return expected_output_cols_list
|
685
|
+
# otherwise, use the sklearn estimator's output
|
686
|
+
else:
|
687
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
688
|
+
|
635
689
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
636
690
|
@telemetry.send_api_usage_telemetry(
|
637
691
|
project=_PROJECT,
|
@@ -662,24 +716,28 @@ class IterativeImputer(BaseTransformer):
|
|
662
716
|
# are specific to the type of dataset used.
|
663
717
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
664
718
|
|
719
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
720
|
+
|
665
721
|
if isinstance(dataset, DataFrame):
|
666
722
|
self._deps = self._batch_inference_validate_snowpark(
|
667
723
|
dataset=dataset,
|
668
724
|
inference_method=inference_method,
|
669
725
|
)
|
670
|
-
assert isinstance(
|
726
|
+
assert isinstance(
|
727
|
+
dataset._session, Session
|
728
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
671
729
|
transform_kwargs = dict(
|
672
730
|
session=dataset._session,
|
673
731
|
dependencies=self._deps,
|
674
|
-
drop_input_cols
|
732
|
+
drop_input_cols=self._drop_input_cols,
|
675
733
|
expected_output_cols_type="float",
|
676
734
|
)
|
735
|
+
expected_output_cols = self._align_expected_output_names(
|
736
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
737
|
+
)
|
677
738
|
|
678
739
|
elif isinstance(dataset, pd.DataFrame):
|
679
|
-
transform_kwargs = dict(
|
680
|
-
snowpark_input_cols = self._snowpark_cols,
|
681
|
-
drop_input_cols = self._drop_input_cols
|
682
|
-
)
|
740
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
683
741
|
|
684
742
|
transform_handlers = ModelTransformerBuilder.build(
|
685
743
|
dataset=dataset,
|
@@ -691,7 +749,7 @@ class IterativeImputer(BaseTransformer):
|
|
691
749
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
692
750
|
inference_method=inference_method,
|
693
751
|
input_cols=self.input_cols,
|
694
|
-
expected_output_cols=
|
752
|
+
expected_output_cols=expected_output_cols,
|
695
753
|
**transform_kwargs
|
696
754
|
)
|
697
755
|
return output_df
|
@@ -721,7 +779,8 @@ class IterativeImputer(BaseTransformer):
|
|
721
779
|
Output dataset with log probability of the sample for each class in the model.
|
722
780
|
"""
|
723
781
|
super()._check_dataset_type(dataset)
|
724
|
-
inference_method="predict_log_proba"
|
782
|
+
inference_method = "predict_log_proba"
|
783
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
725
784
|
|
726
785
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
727
786
|
# are specific to the type of dataset used.
|
@@ -732,18 +791,20 @@ class IterativeImputer(BaseTransformer):
|
|
732
791
|
dataset=dataset,
|
733
792
|
inference_method=inference_method,
|
734
793
|
)
|
735
|
-
assert isinstance(
|
794
|
+
assert isinstance(
|
795
|
+
dataset._session, Session
|
796
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
736
797
|
transform_kwargs = dict(
|
737
798
|
session=dataset._session,
|
738
799
|
dependencies=self._deps,
|
739
|
-
drop_input_cols
|
800
|
+
drop_input_cols=self._drop_input_cols,
|
740
801
|
expected_output_cols_type="float",
|
741
802
|
)
|
803
|
+
expected_output_cols = self._align_expected_output_names(
|
804
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
805
|
+
)
|
742
806
|
elif isinstance(dataset, pd.DataFrame):
|
743
|
-
transform_kwargs = dict(
|
744
|
-
snowpark_input_cols = self._snowpark_cols,
|
745
|
-
drop_input_cols = self._drop_input_cols
|
746
|
-
)
|
807
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
747
808
|
|
748
809
|
transform_handlers = ModelTransformerBuilder.build(
|
749
810
|
dataset=dataset,
|
@@ -756,7 +817,7 @@ class IterativeImputer(BaseTransformer):
|
|
756
817
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
757
818
|
inference_method=inference_method,
|
758
819
|
input_cols=self.input_cols,
|
759
|
-
expected_output_cols=
|
820
|
+
expected_output_cols=expected_output_cols,
|
760
821
|
**transform_kwargs
|
761
822
|
)
|
762
823
|
return output_df
|
@@ -782,30 +843,34 @@ class IterativeImputer(BaseTransformer):
|
|
782
843
|
Output dataset with results of the decision function for the samples in input dataset.
|
783
844
|
"""
|
784
845
|
super()._check_dataset_type(dataset)
|
785
|
-
inference_method="decision_function"
|
846
|
+
inference_method = "decision_function"
|
786
847
|
|
787
848
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
788
849
|
# are specific to the type of dataset used.
|
789
850
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
790
851
|
|
852
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
853
|
+
|
791
854
|
if isinstance(dataset, DataFrame):
|
792
855
|
self._deps = self._batch_inference_validate_snowpark(
|
793
856
|
dataset=dataset,
|
794
857
|
inference_method=inference_method,
|
795
858
|
)
|
796
|
-
assert isinstance(
|
859
|
+
assert isinstance(
|
860
|
+
dataset._session, Session
|
861
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
797
862
|
transform_kwargs = dict(
|
798
863
|
session=dataset._session,
|
799
864
|
dependencies=self._deps,
|
800
|
-
drop_input_cols
|
865
|
+
drop_input_cols=self._drop_input_cols,
|
801
866
|
expected_output_cols_type="float",
|
802
867
|
)
|
868
|
+
expected_output_cols = self._align_expected_output_names(
|
869
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
870
|
+
)
|
803
871
|
|
804
872
|
elif isinstance(dataset, pd.DataFrame):
|
805
|
-
transform_kwargs = dict(
|
806
|
-
snowpark_input_cols = self._snowpark_cols,
|
807
|
-
drop_input_cols = self._drop_input_cols
|
808
|
-
)
|
873
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
809
874
|
|
810
875
|
transform_handlers = ModelTransformerBuilder.build(
|
811
876
|
dataset=dataset,
|
@@ -818,7 +883,7 @@ class IterativeImputer(BaseTransformer):
|
|
818
883
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
819
884
|
inference_method=inference_method,
|
820
885
|
input_cols=self.input_cols,
|
821
|
-
expected_output_cols=
|
886
|
+
expected_output_cols=expected_output_cols,
|
822
887
|
**transform_kwargs
|
823
888
|
)
|
824
889
|
return output_df
|
@@ -847,12 +912,14 @@ class IterativeImputer(BaseTransformer):
|
|
847
912
|
Output dataset with probability of the sample for each class in the model.
|
848
913
|
"""
|
849
914
|
super()._check_dataset_type(dataset)
|
850
|
-
inference_method="score_samples"
|
915
|
+
inference_method = "score_samples"
|
851
916
|
|
852
917
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
853
918
|
# are specific to the type of dataset used.
|
854
919
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
855
920
|
|
921
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
922
|
+
|
856
923
|
if isinstance(dataset, DataFrame):
|
857
924
|
self._deps = self._batch_inference_validate_snowpark(
|
858
925
|
dataset=dataset,
|
@@ -865,6 +932,9 @@ class IterativeImputer(BaseTransformer):
|
|
865
932
|
drop_input_cols = self._drop_input_cols,
|
866
933
|
expected_output_cols_type="float",
|
867
934
|
)
|
935
|
+
expected_output_cols = self._align_expected_output_names(
|
936
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
937
|
+
)
|
868
938
|
|
869
939
|
elif isinstance(dataset, pd.DataFrame):
|
870
940
|
transform_kwargs = dict(
|
@@ -883,7 +953,7 @@ class IterativeImputer(BaseTransformer):
|
|
883
953
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
884
954
|
inference_method=inference_method,
|
885
955
|
input_cols=self.input_cols,
|
886
|
-
expected_output_cols=
|
956
|
+
expected_output_cols=expected_output_cols,
|
887
957
|
**transform_kwargs
|
888
958
|
)
|
889
959
|
return output_df
|
@@ -1028,50 +1098,84 @@ class IterativeImputer(BaseTransformer):
|
|
1028
1098
|
)
|
1029
1099
|
return output_df
|
1030
1100
|
|
1101
|
+
|
1102
|
+
|
1103
|
+
def to_sklearn(self) -> Any:
|
1104
|
+
"""Get sklearn.impute.IterativeImputer object.
|
1105
|
+
"""
|
1106
|
+
if self._sklearn_object is None:
|
1107
|
+
self._sklearn_object = self._create_sklearn_object()
|
1108
|
+
return self._sklearn_object
|
1109
|
+
|
1110
|
+
def to_xgboost(self) -> Any:
|
1111
|
+
raise exceptions.SnowflakeMLException(
|
1112
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1113
|
+
original_exception=AttributeError(
|
1114
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1115
|
+
"to_xgboost()",
|
1116
|
+
"to_sklearn()"
|
1117
|
+
)
|
1118
|
+
),
|
1119
|
+
)
|
1120
|
+
|
1121
|
+
def to_lightgbm(self) -> Any:
|
1122
|
+
raise exceptions.SnowflakeMLException(
|
1123
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1124
|
+
original_exception=AttributeError(
|
1125
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1126
|
+
"to_lightgbm()",
|
1127
|
+
"to_sklearn()"
|
1128
|
+
)
|
1129
|
+
),
|
1130
|
+
)
|
1031
1131
|
|
1032
|
-
def
|
1132
|
+
def _get_dependencies(self) -> List[str]:
|
1133
|
+
return self._deps
|
1134
|
+
|
1135
|
+
|
1136
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
1033
1137
|
self._model_signature_dict = dict()
|
1034
1138
|
|
1035
1139
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1036
1140
|
|
1037
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1141
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1038
1142
|
outputs: List[BaseFeatureSpec] = []
|
1039
1143
|
if hasattr(self, "predict"):
|
1040
1144
|
# keep mypy happy
|
1041
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1145
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1042
1146
|
# For classifier, the type of predict is the same as the type of label
|
1043
|
-
if self._sklearn_object._estimator_type ==
|
1044
|
-
|
1147
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1148
|
+
# label columns is the desired type for output
|
1045
1149
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1046
1150
|
# rename the output columns
|
1047
1151
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1048
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1049
|
-
|
1050
|
-
|
1152
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1153
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1154
|
+
)
|
1051
1155
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1052
1156
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1053
|
-
# Clusterer returns int64 cluster labels.
|
1157
|
+
# Clusterer returns int64 cluster labels.
|
1054
1158
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1055
1159
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1056
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1057
|
-
|
1058
|
-
|
1059
|
-
|
1160
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1161
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1162
|
+
)
|
1163
|
+
|
1060
1164
|
# For regressor, the type of predict is float64
|
1061
|
-
elif self._sklearn_object._estimator_type ==
|
1165
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1062
1166
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1063
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1064
|
-
|
1065
|
-
|
1066
|
-
|
1167
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1168
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1169
|
+
)
|
1170
|
+
|
1067
1171
|
for prob_func in PROB_FUNCTIONS:
|
1068
1172
|
if hasattr(self, prob_func):
|
1069
1173
|
output_cols_prefix: str = f"{prob_func}_"
|
1070
1174
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1071
1175
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1072
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1073
|
-
|
1074
|
-
|
1176
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1177
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1178
|
+
)
|
1075
1179
|
|
1076
1180
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1077
1181
|
items = list(self._model_signature_dict.items())
|
@@ -1084,10 +1188,10 @@ class IterativeImputer(BaseTransformer):
|
|
1084
1188
|
"""Returns model signature of current class.
|
1085
1189
|
|
1086
1190
|
Raises:
|
1087
|
-
|
1191
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1088
1192
|
|
1089
1193
|
Returns:
|
1090
|
-
Dict
|
1194
|
+
Dict with each method and its input output signature
|
1091
1195
|
"""
|
1092
1196
|
if self._model_signature_dict is None:
|
1093
1197
|
raise exceptions.SnowflakeMLException(
|
@@ -1095,35 +1199,3 @@ class IterativeImputer(BaseTransformer):
|
|
1095
1199
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1096
1200
|
)
|
1097
1201
|
return self._model_signature_dict
|
1098
|
-
|
1099
|
-
def to_sklearn(self) -> Any:
|
1100
|
-
"""Get sklearn.impute.IterativeImputer object.
|
1101
|
-
"""
|
1102
|
-
if self._sklearn_object is None:
|
1103
|
-
self._sklearn_object = self._create_sklearn_object()
|
1104
|
-
return self._sklearn_object
|
1105
|
-
|
1106
|
-
def to_xgboost(self) -> Any:
|
1107
|
-
raise exceptions.SnowflakeMLException(
|
1108
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1109
|
-
original_exception=AttributeError(
|
1110
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1111
|
-
"to_xgboost()",
|
1112
|
-
"to_sklearn()"
|
1113
|
-
)
|
1114
|
-
),
|
1115
|
-
)
|
1116
|
-
|
1117
|
-
def to_lightgbm(self) -> Any:
|
1118
|
-
raise exceptions.SnowflakeMLException(
|
1119
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1120
|
-
original_exception=AttributeError(
|
1121
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1122
|
-
"to_lightgbm()",
|
1123
|
-
"to_sklearn()"
|
1124
|
-
)
|
1125
|
-
),
|
1126
|
-
)
|
1127
|
-
|
1128
|
-
def _get_dependencies(self) -> List[str]:
|
1129
|
-
return self._deps
|