snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -34,6 +34,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
34
34
|
BatchInferenceKwargsTypedDict,
|
35
35
|
ScoreKwargsTypedDict
|
36
36
|
)
|
37
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
38
|
+
from snowflake.ml.model.model_signature import (
|
39
|
+
BaseFeatureSpec,
|
40
|
+
DataType,
|
41
|
+
FeatureSpec,
|
42
|
+
ModelSignature,
|
43
|
+
_infer_signature,
|
44
|
+
_rename_signature_with_snowflake_identifiers,
|
45
|
+
)
|
37
46
|
|
38
47
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
39
48
|
|
@@ -44,16 +53,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
44
53
|
validate_sklearn_args,
|
45
54
|
)
|
46
55
|
|
47
|
-
from snowflake.ml.model.model_signature import (
|
48
|
-
DataType,
|
49
|
-
FeatureSpec,
|
50
|
-
ModelSignature,
|
51
|
-
_infer_signature,
|
52
|
-
_rename_signature_with_snowflake_identifiers,
|
53
|
-
BaseFeatureSpec,
|
54
|
-
)
|
55
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
56
|
-
|
57
56
|
_PROJECT = "ModelDevelopment"
|
58
57
|
# Derive subproject from module name by removing "sklearn"
|
59
58
|
# and converting module name from underscore to CamelCase
|
@@ -209,12 +208,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
209
208
|
)
|
210
209
|
return selected_cols
|
211
210
|
|
212
|
-
|
213
|
-
project=_PROJECT,
|
214
|
-
subproject=_SUBPROJECT,
|
215
|
-
custom_tags=dict([("autogen", True)]),
|
216
|
-
)
|
217
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "GenericUnivariateSelect":
|
211
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "GenericUnivariateSelect":
|
218
212
|
"""Run score function on (X, y) and get the appropriate features
|
219
213
|
For more details on this function, see [sklearn.feature_selection.GenericUnivariateSelect.fit]
|
220
214
|
(https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.GenericUnivariateSelect.html#sklearn.feature_selection.GenericUnivariateSelect.fit)
|
@@ -241,12 +235,14 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
241
235
|
|
242
236
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
243
237
|
|
244
|
-
|
238
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
245
239
|
if SNOWML_SPROC_ENV in os.environ:
|
246
240
|
statement_params = telemetry.get_function_usage_statement_params(
|
247
241
|
project=_PROJECT,
|
248
242
|
subproject=_SUBPROJECT,
|
249
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
243
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
244
|
+
inspect.currentframe(), GenericUnivariateSelect.__class__.__name__
|
245
|
+
),
|
250
246
|
api_calls=[Session.call],
|
251
247
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
252
248
|
)
|
@@ -267,7 +263,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
267
263
|
)
|
268
264
|
self._sklearn_object = model_trainer.train()
|
269
265
|
self._is_fitted = True
|
270
|
-
self.
|
266
|
+
self._generate_model_signatures(dataset)
|
271
267
|
return self
|
272
268
|
|
273
269
|
def _batch_inference_validate_snowpark(
|
@@ -341,7 +337,9 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
341
337
|
# when it is classifier, infer the datatype from label columns
|
342
338
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
343
339
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
344
|
-
label_cols_signatures = [
|
340
|
+
label_cols_signatures = [
|
341
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
342
|
+
]
|
345
343
|
if len(label_cols_signatures) == 0:
|
346
344
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
347
345
|
raise exceptions.SnowflakeMLException(
|
@@ -349,25 +347,22 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
349
347
|
original_exception=ValueError(error_str),
|
350
348
|
)
|
351
349
|
|
352
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
353
|
-
label_cols_signatures[0].as_snowpark_type()
|
354
|
-
)
|
350
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
355
351
|
|
356
352
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
357
|
-
assert isinstance(
|
353
|
+
assert isinstance(
|
354
|
+
dataset._session, Session
|
355
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
358
356
|
|
359
357
|
transform_kwargs = dict(
|
360
|
-
session
|
361
|
-
dependencies
|
362
|
-
drop_input_cols
|
363
|
-
expected_output_cols_type
|
358
|
+
session=dataset._session,
|
359
|
+
dependencies=self._deps,
|
360
|
+
drop_input_cols=self._drop_input_cols,
|
361
|
+
expected_output_cols_type=expected_type_inferred,
|
364
362
|
)
|
365
363
|
|
366
364
|
elif isinstance(dataset, pd.DataFrame):
|
367
|
-
transform_kwargs = dict(
|
368
|
-
snowpark_input_cols = self._snowpark_cols,
|
369
|
-
drop_input_cols = self._drop_input_cols
|
370
|
-
)
|
365
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
371
366
|
|
372
367
|
transform_handlers = ModelTransformerBuilder.build(
|
373
368
|
dataset=dataset,
|
@@ -409,7 +404,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
409
404
|
Transformed dataset.
|
410
405
|
"""
|
411
406
|
super()._check_dataset_type(dataset)
|
412
|
-
inference_method="transform"
|
407
|
+
inference_method = "transform"
|
413
408
|
|
414
409
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
415
410
|
# are specific to the type of dataset used.
|
@@ -446,17 +441,14 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
446
441
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
447
442
|
|
448
443
|
transform_kwargs = dict(
|
449
|
-
session
|
450
|
-
dependencies
|
451
|
-
drop_input_cols
|
452
|
-
expected_output_cols_type
|
444
|
+
session=dataset._session,
|
445
|
+
dependencies=self._deps,
|
446
|
+
drop_input_cols=self._drop_input_cols,
|
447
|
+
expected_output_cols_type=expected_dtype,
|
453
448
|
)
|
454
449
|
|
455
450
|
elif isinstance(dataset, pd.DataFrame):
|
456
|
-
transform_kwargs = dict(
|
457
|
-
snowpark_input_cols = self._snowpark_cols,
|
458
|
-
drop_input_cols = self._drop_input_cols
|
459
|
-
)
|
451
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
460
452
|
|
461
453
|
transform_handlers = ModelTransformerBuilder.build(
|
462
454
|
dataset=dataset,
|
@@ -475,7 +467,11 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
475
467
|
return output_df
|
476
468
|
|
477
469
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
478
|
-
def fit_predict(
|
470
|
+
def fit_predict(
|
471
|
+
self,
|
472
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
473
|
+
output_cols_prefix: str = "fit_predict_",
|
474
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
479
475
|
""" Method not supported for this class.
|
480
476
|
|
481
477
|
|
@@ -500,7 +496,9 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
500
496
|
)
|
501
497
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
502
498
|
drop_input_cols=self._drop_input_cols,
|
503
|
-
expected_output_cols_list=
|
499
|
+
expected_output_cols_list=(
|
500
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
501
|
+
),
|
504
502
|
)
|
505
503
|
self._sklearn_object = fitted_estimator
|
506
504
|
self._is_fitted = True
|
@@ -517,6 +515,62 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
517
515
|
assert self._sklearn_object is not None
|
518
516
|
return self._sklearn_object.embedding_
|
519
517
|
|
518
|
+
|
519
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
520
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
521
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
522
|
+
"""
|
523
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
524
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
525
|
+
if output_cols:
|
526
|
+
output_cols = [
|
527
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
528
|
+
for c in output_cols
|
529
|
+
]
|
530
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
531
|
+
output_cols = [output_cols_prefix]
|
532
|
+
elif self._sklearn_object is not None:
|
533
|
+
classes = self._sklearn_object.classes_
|
534
|
+
if isinstance(classes, numpy.ndarray):
|
535
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
536
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
537
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
538
|
+
output_cols = []
|
539
|
+
for i, cl in enumerate(classes):
|
540
|
+
# For binary classification, there is only one output column for each class
|
541
|
+
# ndarray as the two classes are complementary.
|
542
|
+
if len(cl) == 2:
|
543
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
544
|
+
else:
|
545
|
+
output_cols.extend([
|
546
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
547
|
+
])
|
548
|
+
else:
|
549
|
+
output_cols = []
|
550
|
+
|
551
|
+
# Make sure column names are valid snowflake identifiers.
|
552
|
+
assert output_cols is not None # Make MyPy happy
|
553
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
554
|
+
|
555
|
+
return rv
|
556
|
+
|
557
|
+
def _align_expected_output_names(
|
558
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
559
|
+
) -> List[str]:
|
560
|
+
# in case the inferred output column names dimension is different
|
561
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
562
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
563
|
+
output_df_columns = list(output_df_pd.columns)
|
564
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
565
|
+
if self.sample_weight_col:
|
566
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
567
|
+
# if the dimension of inferred output column names is correct; use it
|
568
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
569
|
+
return expected_output_cols_list
|
570
|
+
# otherwise, use the sklearn estimator's output
|
571
|
+
else:
|
572
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
573
|
+
|
520
574
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
521
575
|
@telemetry.send_api_usage_telemetry(
|
522
576
|
project=_PROJECT,
|
@@ -547,24 +601,28 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
547
601
|
# are specific to the type of dataset used.
|
548
602
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
549
603
|
|
604
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
605
|
+
|
550
606
|
if isinstance(dataset, DataFrame):
|
551
607
|
self._deps = self._batch_inference_validate_snowpark(
|
552
608
|
dataset=dataset,
|
553
609
|
inference_method=inference_method,
|
554
610
|
)
|
555
|
-
assert isinstance(
|
611
|
+
assert isinstance(
|
612
|
+
dataset._session, Session
|
613
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
556
614
|
transform_kwargs = dict(
|
557
615
|
session=dataset._session,
|
558
616
|
dependencies=self._deps,
|
559
|
-
drop_input_cols
|
617
|
+
drop_input_cols=self._drop_input_cols,
|
560
618
|
expected_output_cols_type="float",
|
561
619
|
)
|
620
|
+
expected_output_cols = self._align_expected_output_names(
|
621
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
622
|
+
)
|
562
623
|
|
563
624
|
elif isinstance(dataset, pd.DataFrame):
|
564
|
-
transform_kwargs = dict(
|
565
|
-
snowpark_input_cols = self._snowpark_cols,
|
566
|
-
drop_input_cols = self._drop_input_cols
|
567
|
-
)
|
625
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
568
626
|
|
569
627
|
transform_handlers = ModelTransformerBuilder.build(
|
570
628
|
dataset=dataset,
|
@@ -576,7 +634,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
576
634
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
577
635
|
inference_method=inference_method,
|
578
636
|
input_cols=self.input_cols,
|
579
|
-
expected_output_cols=
|
637
|
+
expected_output_cols=expected_output_cols,
|
580
638
|
**transform_kwargs
|
581
639
|
)
|
582
640
|
return output_df
|
@@ -606,7 +664,8 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
606
664
|
Output dataset with log probability of the sample for each class in the model.
|
607
665
|
"""
|
608
666
|
super()._check_dataset_type(dataset)
|
609
|
-
inference_method="predict_log_proba"
|
667
|
+
inference_method = "predict_log_proba"
|
668
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
610
669
|
|
611
670
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
612
671
|
# are specific to the type of dataset used.
|
@@ -617,18 +676,20 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
617
676
|
dataset=dataset,
|
618
677
|
inference_method=inference_method,
|
619
678
|
)
|
620
|
-
assert isinstance(
|
679
|
+
assert isinstance(
|
680
|
+
dataset._session, Session
|
681
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
621
682
|
transform_kwargs = dict(
|
622
683
|
session=dataset._session,
|
623
684
|
dependencies=self._deps,
|
624
|
-
drop_input_cols
|
685
|
+
drop_input_cols=self._drop_input_cols,
|
625
686
|
expected_output_cols_type="float",
|
626
687
|
)
|
688
|
+
expected_output_cols = self._align_expected_output_names(
|
689
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
690
|
+
)
|
627
691
|
elif isinstance(dataset, pd.DataFrame):
|
628
|
-
transform_kwargs = dict(
|
629
|
-
snowpark_input_cols = self._snowpark_cols,
|
630
|
-
drop_input_cols = self._drop_input_cols
|
631
|
-
)
|
692
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
632
693
|
|
633
694
|
transform_handlers = ModelTransformerBuilder.build(
|
634
695
|
dataset=dataset,
|
@@ -641,7 +702,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
641
702
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
642
703
|
inference_method=inference_method,
|
643
704
|
input_cols=self.input_cols,
|
644
|
-
expected_output_cols=
|
705
|
+
expected_output_cols=expected_output_cols,
|
645
706
|
**transform_kwargs
|
646
707
|
)
|
647
708
|
return output_df
|
@@ -667,30 +728,34 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
667
728
|
Output dataset with results of the decision function for the samples in input dataset.
|
668
729
|
"""
|
669
730
|
super()._check_dataset_type(dataset)
|
670
|
-
inference_method="decision_function"
|
731
|
+
inference_method = "decision_function"
|
671
732
|
|
672
733
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
673
734
|
# are specific to the type of dataset used.
|
674
735
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
675
736
|
|
737
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
738
|
+
|
676
739
|
if isinstance(dataset, DataFrame):
|
677
740
|
self._deps = self._batch_inference_validate_snowpark(
|
678
741
|
dataset=dataset,
|
679
742
|
inference_method=inference_method,
|
680
743
|
)
|
681
|
-
assert isinstance(
|
744
|
+
assert isinstance(
|
745
|
+
dataset._session, Session
|
746
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
682
747
|
transform_kwargs = dict(
|
683
748
|
session=dataset._session,
|
684
749
|
dependencies=self._deps,
|
685
|
-
drop_input_cols
|
750
|
+
drop_input_cols=self._drop_input_cols,
|
686
751
|
expected_output_cols_type="float",
|
687
752
|
)
|
753
|
+
expected_output_cols = self._align_expected_output_names(
|
754
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
755
|
+
)
|
688
756
|
|
689
757
|
elif isinstance(dataset, pd.DataFrame):
|
690
|
-
transform_kwargs = dict(
|
691
|
-
snowpark_input_cols = self._snowpark_cols,
|
692
|
-
drop_input_cols = self._drop_input_cols
|
693
|
-
)
|
758
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
694
759
|
|
695
760
|
transform_handlers = ModelTransformerBuilder.build(
|
696
761
|
dataset=dataset,
|
@@ -703,7 +768,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
703
768
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
704
769
|
inference_method=inference_method,
|
705
770
|
input_cols=self.input_cols,
|
706
|
-
expected_output_cols=
|
771
|
+
expected_output_cols=expected_output_cols,
|
707
772
|
**transform_kwargs
|
708
773
|
)
|
709
774
|
return output_df
|
@@ -732,12 +797,14 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
732
797
|
Output dataset with probability of the sample for each class in the model.
|
733
798
|
"""
|
734
799
|
super()._check_dataset_type(dataset)
|
735
|
-
inference_method="score_samples"
|
800
|
+
inference_method = "score_samples"
|
736
801
|
|
737
802
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
738
803
|
# are specific to the type of dataset used.
|
739
804
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
740
805
|
|
806
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
807
|
+
|
741
808
|
if isinstance(dataset, DataFrame):
|
742
809
|
self._deps = self._batch_inference_validate_snowpark(
|
743
810
|
dataset=dataset,
|
@@ -750,6 +817,9 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
750
817
|
drop_input_cols = self._drop_input_cols,
|
751
818
|
expected_output_cols_type="float",
|
752
819
|
)
|
820
|
+
expected_output_cols = self._align_expected_output_names(
|
821
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
822
|
+
)
|
753
823
|
|
754
824
|
elif isinstance(dataset, pd.DataFrame):
|
755
825
|
transform_kwargs = dict(
|
@@ -768,7 +838,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
768
838
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
769
839
|
inference_method=inference_method,
|
770
840
|
input_cols=self.input_cols,
|
771
|
-
expected_output_cols=
|
841
|
+
expected_output_cols=expected_output_cols,
|
772
842
|
**transform_kwargs
|
773
843
|
)
|
774
844
|
return output_df
|
@@ -913,50 +983,84 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
913
983
|
)
|
914
984
|
return output_df
|
915
985
|
|
986
|
+
|
987
|
+
|
988
|
+
def to_sklearn(self) -> Any:
|
989
|
+
"""Get sklearn.feature_selection.GenericUnivariateSelect object.
|
990
|
+
"""
|
991
|
+
if self._sklearn_object is None:
|
992
|
+
self._sklearn_object = self._create_sklearn_object()
|
993
|
+
return self._sklearn_object
|
994
|
+
|
995
|
+
def to_xgboost(self) -> Any:
|
996
|
+
raise exceptions.SnowflakeMLException(
|
997
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
998
|
+
original_exception=AttributeError(
|
999
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1000
|
+
"to_xgboost()",
|
1001
|
+
"to_sklearn()"
|
1002
|
+
)
|
1003
|
+
),
|
1004
|
+
)
|
1005
|
+
|
1006
|
+
def to_lightgbm(self) -> Any:
|
1007
|
+
raise exceptions.SnowflakeMLException(
|
1008
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1009
|
+
original_exception=AttributeError(
|
1010
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1011
|
+
"to_lightgbm()",
|
1012
|
+
"to_sklearn()"
|
1013
|
+
)
|
1014
|
+
),
|
1015
|
+
)
|
916
1016
|
|
917
|
-
def
|
1017
|
+
def _get_dependencies(self) -> List[str]:
|
1018
|
+
return self._deps
|
1019
|
+
|
1020
|
+
|
1021
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
918
1022
|
self._model_signature_dict = dict()
|
919
1023
|
|
920
1024
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
921
1025
|
|
922
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1026
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
923
1027
|
outputs: List[BaseFeatureSpec] = []
|
924
1028
|
if hasattr(self, "predict"):
|
925
1029
|
# keep mypy happy
|
926
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1030
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
927
1031
|
# For classifier, the type of predict is the same as the type of label
|
928
|
-
if self._sklearn_object._estimator_type ==
|
929
|
-
|
1032
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1033
|
+
# label columns is the desired type for output
|
930
1034
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
931
1035
|
# rename the output columns
|
932
1036
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
933
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
934
|
-
|
935
|
-
|
1037
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1038
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1039
|
+
)
|
936
1040
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
937
1041
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
938
|
-
# Clusterer returns int64 cluster labels.
|
1042
|
+
# Clusterer returns int64 cluster labels.
|
939
1043
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
940
1044
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
941
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
942
|
-
|
943
|
-
|
944
|
-
|
1045
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1046
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1047
|
+
)
|
1048
|
+
|
945
1049
|
# For regressor, the type of predict is float64
|
946
|
-
elif self._sklearn_object._estimator_type ==
|
1050
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
947
1051
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
948
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
949
|
-
|
950
|
-
|
951
|
-
|
1052
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1053
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1054
|
+
)
|
1055
|
+
|
952
1056
|
for prob_func in PROB_FUNCTIONS:
|
953
1057
|
if hasattr(self, prob_func):
|
954
1058
|
output_cols_prefix: str = f"{prob_func}_"
|
955
1059
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
956
1060
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
957
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
958
|
-
|
959
|
-
|
1061
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1062
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1063
|
+
)
|
960
1064
|
|
961
1065
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
962
1066
|
items = list(self._model_signature_dict.items())
|
@@ -969,10 +1073,10 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
969
1073
|
"""Returns model signature of current class.
|
970
1074
|
|
971
1075
|
Raises:
|
972
|
-
|
1076
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
973
1077
|
|
974
1078
|
Returns:
|
975
|
-
Dict
|
1079
|
+
Dict with each method and its input output signature
|
976
1080
|
"""
|
977
1081
|
if self._model_signature_dict is None:
|
978
1082
|
raise exceptions.SnowflakeMLException(
|
@@ -980,35 +1084,3 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
980
1084
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
981
1085
|
)
|
982
1086
|
return self._model_signature_dict
|
983
|
-
|
984
|
-
def to_sklearn(self) -> Any:
|
985
|
-
"""Get sklearn.feature_selection.GenericUnivariateSelect object.
|
986
|
-
"""
|
987
|
-
if self._sklearn_object is None:
|
988
|
-
self._sklearn_object = self._create_sklearn_object()
|
989
|
-
return self._sklearn_object
|
990
|
-
|
991
|
-
def to_xgboost(self) -> Any:
|
992
|
-
raise exceptions.SnowflakeMLException(
|
993
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
994
|
-
original_exception=AttributeError(
|
995
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
996
|
-
"to_xgboost()",
|
997
|
-
"to_sklearn()"
|
998
|
-
)
|
999
|
-
),
|
1000
|
-
)
|
1001
|
-
|
1002
|
-
def to_lightgbm(self) -> Any:
|
1003
|
-
raise exceptions.SnowflakeMLException(
|
1004
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1005
|
-
original_exception=AttributeError(
|
1006
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1007
|
-
"to_lightgbm()",
|
1008
|
-
"to_sklearn()"
|
1009
|
-
)
|
1010
|
-
),
|
1011
|
-
)
|
1012
|
-
|
1013
|
-
def _get_dependencies(self) -> List[str]:
|
1014
|
-
return self._deps
|