snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -270,12 +269,7 @@ class ColumnTransformer(BaseTransformer):
|
|
270
269
|
)
|
271
270
|
return selected_cols
|
272
271
|
|
273
|
-
|
274
|
-
project=_PROJECT,
|
275
|
-
subproject=_SUBPROJECT,
|
276
|
-
custom_tags=dict([("autogen", True)]),
|
277
|
-
)
|
278
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "ColumnTransformer":
|
272
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "ColumnTransformer":
|
279
273
|
"""Fit all transformers using X
|
280
274
|
For more details on this function, see [sklearn.compose.ColumnTransformer.fit]
|
281
275
|
(https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer.fit)
|
@@ -302,12 +296,14 @@ class ColumnTransformer(BaseTransformer):
|
|
302
296
|
|
303
297
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
304
298
|
|
305
|
-
|
299
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
306
300
|
if SNOWML_SPROC_ENV in os.environ:
|
307
301
|
statement_params = telemetry.get_function_usage_statement_params(
|
308
302
|
project=_PROJECT,
|
309
303
|
subproject=_SUBPROJECT,
|
310
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
304
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
305
|
+
inspect.currentframe(), ColumnTransformer.__class__.__name__
|
306
|
+
),
|
311
307
|
api_calls=[Session.call],
|
312
308
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
313
309
|
)
|
@@ -328,7 +324,7 @@ class ColumnTransformer(BaseTransformer):
|
|
328
324
|
)
|
329
325
|
self._sklearn_object = model_trainer.train()
|
330
326
|
self._is_fitted = True
|
331
|
-
self.
|
327
|
+
self._generate_model_signatures(dataset)
|
332
328
|
return self
|
333
329
|
|
334
330
|
def _batch_inference_validate_snowpark(
|
@@ -402,7 +398,9 @@ class ColumnTransformer(BaseTransformer):
|
|
402
398
|
# when it is classifier, infer the datatype from label columns
|
403
399
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
404
400
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
405
|
-
label_cols_signatures = [
|
401
|
+
label_cols_signatures = [
|
402
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
403
|
+
]
|
406
404
|
if len(label_cols_signatures) == 0:
|
407
405
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
408
406
|
raise exceptions.SnowflakeMLException(
|
@@ -410,25 +408,22 @@ class ColumnTransformer(BaseTransformer):
|
|
410
408
|
original_exception=ValueError(error_str),
|
411
409
|
)
|
412
410
|
|
413
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
414
|
-
label_cols_signatures[0].as_snowpark_type()
|
415
|
-
)
|
411
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
416
412
|
|
417
413
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
418
|
-
assert isinstance(
|
414
|
+
assert isinstance(
|
415
|
+
dataset._session, Session
|
416
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
419
417
|
|
420
418
|
transform_kwargs = dict(
|
421
|
-
session
|
422
|
-
dependencies
|
423
|
-
drop_input_cols
|
424
|
-
expected_output_cols_type
|
419
|
+
session=dataset._session,
|
420
|
+
dependencies=self._deps,
|
421
|
+
drop_input_cols=self._drop_input_cols,
|
422
|
+
expected_output_cols_type=expected_type_inferred,
|
425
423
|
)
|
426
424
|
|
427
425
|
elif isinstance(dataset, pd.DataFrame):
|
428
|
-
transform_kwargs = dict(
|
429
|
-
snowpark_input_cols = self._snowpark_cols,
|
430
|
-
drop_input_cols = self._drop_input_cols
|
431
|
-
)
|
426
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
432
427
|
|
433
428
|
transform_handlers = ModelTransformerBuilder.build(
|
434
429
|
dataset=dataset,
|
@@ -470,7 +465,7 @@ class ColumnTransformer(BaseTransformer):
|
|
470
465
|
Transformed dataset.
|
471
466
|
"""
|
472
467
|
super()._check_dataset_type(dataset)
|
473
|
-
inference_method="transform"
|
468
|
+
inference_method = "transform"
|
474
469
|
|
475
470
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
476
471
|
# are specific to the type of dataset used.
|
@@ -507,17 +502,14 @@ class ColumnTransformer(BaseTransformer):
|
|
507
502
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
508
503
|
|
509
504
|
transform_kwargs = dict(
|
510
|
-
session
|
511
|
-
dependencies
|
512
|
-
drop_input_cols
|
513
|
-
expected_output_cols_type
|
505
|
+
session=dataset._session,
|
506
|
+
dependencies=self._deps,
|
507
|
+
drop_input_cols=self._drop_input_cols,
|
508
|
+
expected_output_cols_type=expected_dtype,
|
514
509
|
)
|
515
510
|
|
516
511
|
elif isinstance(dataset, pd.DataFrame):
|
517
|
-
transform_kwargs = dict(
|
518
|
-
snowpark_input_cols = self._snowpark_cols,
|
519
|
-
drop_input_cols = self._drop_input_cols
|
520
|
-
)
|
512
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
521
513
|
|
522
514
|
transform_handlers = ModelTransformerBuilder.build(
|
523
515
|
dataset=dataset,
|
@@ -536,7 +528,11 @@ class ColumnTransformer(BaseTransformer):
|
|
536
528
|
return output_df
|
537
529
|
|
538
530
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
539
|
-
def fit_predict(
|
531
|
+
def fit_predict(
|
532
|
+
self,
|
533
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
534
|
+
output_cols_prefix: str = "fit_predict_",
|
535
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
540
536
|
""" Method not supported for this class.
|
541
537
|
|
542
538
|
|
@@ -561,7 +557,9 @@ class ColumnTransformer(BaseTransformer):
|
|
561
557
|
)
|
562
558
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
563
559
|
drop_input_cols=self._drop_input_cols,
|
564
|
-
expected_output_cols_list=
|
560
|
+
expected_output_cols_list=(
|
561
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
562
|
+
),
|
565
563
|
)
|
566
564
|
self._sklearn_object = fitted_estimator
|
567
565
|
self._is_fitted = True
|
@@ -578,6 +576,62 @@ class ColumnTransformer(BaseTransformer):
|
|
578
576
|
assert self._sklearn_object is not None
|
579
577
|
return self._sklearn_object.embedding_
|
580
578
|
|
579
|
+
|
580
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
581
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
582
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
583
|
+
"""
|
584
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
585
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
586
|
+
if output_cols:
|
587
|
+
output_cols = [
|
588
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
589
|
+
for c in output_cols
|
590
|
+
]
|
591
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
592
|
+
output_cols = [output_cols_prefix]
|
593
|
+
elif self._sklearn_object is not None:
|
594
|
+
classes = self._sklearn_object.classes_
|
595
|
+
if isinstance(classes, numpy.ndarray):
|
596
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
597
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
598
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
599
|
+
output_cols = []
|
600
|
+
for i, cl in enumerate(classes):
|
601
|
+
# For binary classification, there is only one output column for each class
|
602
|
+
# ndarray as the two classes are complementary.
|
603
|
+
if len(cl) == 2:
|
604
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
605
|
+
else:
|
606
|
+
output_cols.extend([
|
607
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
608
|
+
])
|
609
|
+
else:
|
610
|
+
output_cols = []
|
611
|
+
|
612
|
+
# Make sure column names are valid snowflake identifiers.
|
613
|
+
assert output_cols is not None # Make MyPy happy
|
614
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
615
|
+
|
616
|
+
return rv
|
617
|
+
|
618
|
+
def _align_expected_output_names(
|
619
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
620
|
+
) -> List[str]:
|
621
|
+
# in case the inferred output column names dimension is different
|
622
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
623
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
624
|
+
output_df_columns = list(output_df_pd.columns)
|
625
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
626
|
+
if self.sample_weight_col:
|
627
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
628
|
+
# if the dimension of inferred output column names is correct; use it
|
629
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
630
|
+
return expected_output_cols_list
|
631
|
+
# otherwise, use the sklearn estimator's output
|
632
|
+
else:
|
633
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
634
|
+
|
581
635
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
582
636
|
@telemetry.send_api_usage_telemetry(
|
583
637
|
project=_PROJECT,
|
@@ -608,24 +662,28 @@ class ColumnTransformer(BaseTransformer):
|
|
608
662
|
# are specific to the type of dataset used.
|
609
663
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
610
664
|
|
665
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
666
|
+
|
611
667
|
if isinstance(dataset, DataFrame):
|
612
668
|
self._deps = self._batch_inference_validate_snowpark(
|
613
669
|
dataset=dataset,
|
614
670
|
inference_method=inference_method,
|
615
671
|
)
|
616
|
-
assert isinstance(
|
672
|
+
assert isinstance(
|
673
|
+
dataset._session, Session
|
674
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
617
675
|
transform_kwargs = dict(
|
618
676
|
session=dataset._session,
|
619
677
|
dependencies=self._deps,
|
620
|
-
drop_input_cols
|
678
|
+
drop_input_cols=self._drop_input_cols,
|
621
679
|
expected_output_cols_type="float",
|
622
680
|
)
|
681
|
+
expected_output_cols = self._align_expected_output_names(
|
682
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
683
|
+
)
|
623
684
|
|
624
685
|
elif isinstance(dataset, pd.DataFrame):
|
625
|
-
transform_kwargs = dict(
|
626
|
-
snowpark_input_cols = self._snowpark_cols,
|
627
|
-
drop_input_cols = self._drop_input_cols
|
628
|
-
)
|
686
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
629
687
|
|
630
688
|
transform_handlers = ModelTransformerBuilder.build(
|
631
689
|
dataset=dataset,
|
@@ -637,7 +695,7 @@ class ColumnTransformer(BaseTransformer):
|
|
637
695
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
638
696
|
inference_method=inference_method,
|
639
697
|
input_cols=self.input_cols,
|
640
|
-
expected_output_cols=
|
698
|
+
expected_output_cols=expected_output_cols,
|
641
699
|
**transform_kwargs
|
642
700
|
)
|
643
701
|
return output_df
|
@@ -667,7 +725,8 @@ class ColumnTransformer(BaseTransformer):
|
|
667
725
|
Output dataset with log probability of the sample for each class in the model.
|
668
726
|
"""
|
669
727
|
super()._check_dataset_type(dataset)
|
670
|
-
inference_method="predict_log_proba"
|
728
|
+
inference_method = "predict_log_proba"
|
729
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
671
730
|
|
672
731
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
673
732
|
# are specific to the type of dataset used.
|
@@ -678,18 +737,20 @@ class ColumnTransformer(BaseTransformer):
|
|
678
737
|
dataset=dataset,
|
679
738
|
inference_method=inference_method,
|
680
739
|
)
|
681
|
-
assert isinstance(
|
740
|
+
assert isinstance(
|
741
|
+
dataset._session, Session
|
742
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
682
743
|
transform_kwargs = dict(
|
683
744
|
session=dataset._session,
|
684
745
|
dependencies=self._deps,
|
685
|
-
drop_input_cols
|
746
|
+
drop_input_cols=self._drop_input_cols,
|
686
747
|
expected_output_cols_type="float",
|
687
748
|
)
|
749
|
+
expected_output_cols = self._align_expected_output_names(
|
750
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
751
|
+
)
|
688
752
|
elif isinstance(dataset, pd.DataFrame):
|
689
|
-
transform_kwargs = dict(
|
690
|
-
snowpark_input_cols = self._snowpark_cols,
|
691
|
-
drop_input_cols = self._drop_input_cols
|
692
|
-
)
|
753
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
693
754
|
|
694
755
|
transform_handlers = ModelTransformerBuilder.build(
|
695
756
|
dataset=dataset,
|
@@ -702,7 +763,7 @@ class ColumnTransformer(BaseTransformer):
|
|
702
763
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
703
764
|
inference_method=inference_method,
|
704
765
|
input_cols=self.input_cols,
|
705
|
-
expected_output_cols=
|
766
|
+
expected_output_cols=expected_output_cols,
|
706
767
|
**transform_kwargs
|
707
768
|
)
|
708
769
|
return output_df
|
@@ -728,30 +789,34 @@ class ColumnTransformer(BaseTransformer):
|
|
728
789
|
Output dataset with results of the decision function for the samples in input dataset.
|
729
790
|
"""
|
730
791
|
super()._check_dataset_type(dataset)
|
731
|
-
inference_method="decision_function"
|
792
|
+
inference_method = "decision_function"
|
732
793
|
|
733
794
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
734
795
|
# are specific to the type of dataset used.
|
735
796
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
736
797
|
|
798
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
799
|
+
|
737
800
|
if isinstance(dataset, DataFrame):
|
738
801
|
self._deps = self._batch_inference_validate_snowpark(
|
739
802
|
dataset=dataset,
|
740
803
|
inference_method=inference_method,
|
741
804
|
)
|
742
|
-
assert isinstance(
|
805
|
+
assert isinstance(
|
806
|
+
dataset._session, Session
|
807
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
743
808
|
transform_kwargs = dict(
|
744
809
|
session=dataset._session,
|
745
810
|
dependencies=self._deps,
|
746
|
-
drop_input_cols
|
811
|
+
drop_input_cols=self._drop_input_cols,
|
747
812
|
expected_output_cols_type="float",
|
748
813
|
)
|
814
|
+
expected_output_cols = self._align_expected_output_names(
|
815
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
816
|
+
)
|
749
817
|
|
750
818
|
elif isinstance(dataset, pd.DataFrame):
|
751
|
-
transform_kwargs = dict(
|
752
|
-
snowpark_input_cols = self._snowpark_cols,
|
753
|
-
drop_input_cols = self._drop_input_cols
|
754
|
-
)
|
819
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
755
820
|
|
756
821
|
transform_handlers = ModelTransformerBuilder.build(
|
757
822
|
dataset=dataset,
|
@@ -764,7 +829,7 @@ class ColumnTransformer(BaseTransformer):
|
|
764
829
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
765
830
|
inference_method=inference_method,
|
766
831
|
input_cols=self.input_cols,
|
767
|
-
expected_output_cols=
|
832
|
+
expected_output_cols=expected_output_cols,
|
768
833
|
**transform_kwargs
|
769
834
|
)
|
770
835
|
return output_df
|
@@ -793,12 +858,14 @@ class ColumnTransformer(BaseTransformer):
|
|
793
858
|
Output dataset with probability of the sample for each class in the model.
|
794
859
|
"""
|
795
860
|
super()._check_dataset_type(dataset)
|
796
|
-
inference_method="score_samples"
|
861
|
+
inference_method = "score_samples"
|
797
862
|
|
798
863
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
799
864
|
# are specific to the type of dataset used.
|
800
865
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
801
866
|
|
867
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
868
|
+
|
802
869
|
if isinstance(dataset, DataFrame):
|
803
870
|
self._deps = self._batch_inference_validate_snowpark(
|
804
871
|
dataset=dataset,
|
@@ -811,6 +878,9 @@ class ColumnTransformer(BaseTransformer):
|
|
811
878
|
drop_input_cols = self._drop_input_cols,
|
812
879
|
expected_output_cols_type="float",
|
813
880
|
)
|
881
|
+
expected_output_cols = self._align_expected_output_names(
|
882
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
883
|
+
)
|
814
884
|
|
815
885
|
elif isinstance(dataset, pd.DataFrame):
|
816
886
|
transform_kwargs = dict(
|
@@ -829,7 +899,7 @@ class ColumnTransformer(BaseTransformer):
|
|
829
899
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
830
900
|
inference_method=inference_method,
|
831
901
|
input_cols=self.input_cols,
|
832
|
-
expected_output_cols=
|
902
|
+
expected_output_cols=expected_output_cols,
|
833
903
|
**transform_kwargs
|
834
904
|
)
|
835
905
|
return output_df
|
@@ -974,50 +1044,84 @@ class ColumnTransformer(BaseTransformer):
|
|
974
1044
|
)
|
975
1045
|
return output_df
|
976
1046
|
|
1047
|
+
|
1048
|
+
|
1049
|
+
def to_sklearn(self) -> Any:
|
1050
|
+
"""Get sklearn.compose.ColumnTransformer object.
|
1051
|
+
"""
|
1052
|
+
if self._sklearn_object is None:
|
1053
|
+
self._sklearn_object = self._create_sklearn_object()
|
1054
|
+
return self._sklearn_object
|
1055
|
+
|
1056
|
+
def to_xgboost(self) -> Any:
|
1057
|
+
raise exceptions.SnowflakeMLException(
|
1058
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1059
|
+
original_exception=AttributeError(
|
1060
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1061
|
+
"to_xgboost()",
|
1062
|
+
"to_sklearn()"
|
1063
|
+
)
|
1064
|
+
),
|
1065
|
+
)
|
1066
|
+
|
1067
|
+
def to_lightgbm(self) -> Any:
|
1068
|
+
raise exceptions.SnowflakeMLException(
|
1069
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1070
|
+
original_exception=AttributeError(
|
1071
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1072
|
+
"to_lightgbm()",
|
1073
|
+
"to_sklearn()"
|
1074
|
+
)
|
1075
|
+
),
|
1076
|
+
)
|
977
1077
|
|
978
|
-
def
|
1078
|
+
def _get_dependencies(self) -> List[str]:
|
1079
|
+
return self._deps
|
1080
|
+
|
1081
|
+
|
1082
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
979
1083
|
self._model_signature_dict = dict()
|
980
1084
|
|
981
1085
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
982
1086
|
|
983
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1087
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
984
1088
|
outputs: List[BaseFeatureSpec] = []
|
985
1089
|
if hasattr(self, "predict"):
|
986
1090
|
# keep mypy happy
|
987
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1091
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
988
1092
|
# For classifier, the type of predict is the same as the type of label
|
989
|
-
if self._sklearn_object._estimator_type ==
|
990
|
-
|
1093
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1094
|
+
# label columns is the desired type for output
|
991
1095
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
992
1096
|
# rename the output columns
|
993
1097
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
994
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
995
|
-
|
996
|
-
|
1098
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1099
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1100
|
+
)
|
997
1101
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
998
1102
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
999
|
-
# Clusterer returns int64 cluster labels.
|
1103
|
+
# Clusterer returns int64 cluster labels.
|
1000
1104
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1001
1105
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1002
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1003
|
-
|
1004
|
-
|
1005
|
-
|
1106
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1107
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1108
|
+
)
|
1109
|
+
|
1006
1110
|
# For regressor, the type of predict is float64
|
1007
|
-
elif self._sklearn_object._estimator_type ==
|
1111
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1008
1112
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1009
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1010
|
-
|
1011
|
-
|
1012
|
-
|
1113
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1114
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1115
|
+
)
|
1116
|
+
|
1013
1117
|
for prob_func in PROB_FUNCTIONS:
|
1014
1118
|
if hasattr(self, prob_func):
|
1015
1119
|
output_cols_prefix: str = f"{prob_func}_"
|
1016
1120
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1017
1121
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1018
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1019
|
-
|
1020
|
-
|
1122
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1123
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1124
|
+
)
|
1021
1125
|
|
1022
1126
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1023
1127
|
items = list(self._model_signature_dict.items())
|
@@ -1030,10 +1134,10 @@ class ColumnTransformer(BaseTransformer):
|
|
1030
1134
|
"""Returns model signature of current class.
|
1031
1135
|
|
1032
1136
|
Raises:
|
1033
|
-
|
1137
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1034
1138
|
|
1035
1139
|
Returns:
|
1036
|
-
Dict
|
1140
|
+
Dict with each method and its input output signature
|
1037
1141
|
"""
|
1038
1142
|
if self._model_signature_dict is None:
|
1039
1143
|
raise exceptions.SnowflakeMLException(
|
@@ -1041,35 +1145,3 @@ class ColumnTransformer(BaseTransformer):
|
|
1041
1145
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1042
1146
|
)
|
1043
1147
|
return self._model_signature_dict
|
1044
|
-
|
1045
|
-
def to_sklearn(self) -> Any:
|
1046
|
-
"""Get sklearn.compose.ColumnTransformer object.
|
1047
|
-
"""
|
1048
|
-
if self._sklearn_object is None:
|
1049
|
-
self._sklearn_object = self._create_sklearn_object()
|
1050
|
-
return self._sklearn_object
|
1051
|
-
|
1052
|
-
def to_xgboost(self) -> Any:
|
1053
|
-
raise exceptions.SnowflakeMLException(
|
1054
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1055
|
-
original_exception=AttributeError(
|
1056
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1057
|
-
"to_xgboost()",
|
1058
|
-
"to_sklearn()"
|
1059
|
-
)
|
1060
|
-
),
|
1061
|
-
)
|
1062
|
-
|
1063
|
-
def to_lightgbm(self) -> Any:
|
1064
|
-
raise exceptions.SnowflakeMLException(
|
1065
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1066
|
-
original_exception=AttributeError(
|
1067
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1068
|
-
"to_lightgbm()",
|
1069
|
-
"to_sklearn()"
|
1070
|
-
)
|
1071
|
-
),
|
1072
|
-
)
|
1073
|
-
|
1074
|
-
def _get_dependencies(self) -> List[str]:
|
1075
|
-
return self._deps
|