snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -221,12 +220,7 @@ class MinCovDet(BaseTransformer):
|
|
221
220
|
)
|
222
221
|
return selected_cols
|
223
222
|
|
224
|
-
|
225
|
-
project=_PROJECT,
|
226
|
-
subproject=_SUBPROJECT,
|
227
|
-
custom_tags=dict([("autogen", True)]),
|
228
|
-
)
|
229
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "MinCovDet":
|
223
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "MinCovDet":
|
230
224
|
"""Fit a Minimum Covariance Determinant with the FastMCD algorithm
|
231
225
|
For more details on this function, see [sklearn.covariance.MinCovDet.fit]
|
232
226
|
(https://scikit-learn.org/stable/modules/generated/sklearn.covariance.MinCovDet.html#sklearn.covariance.MinCovDet.fit)
|
@@ -253,12 +247,14 @@ class MinCovDet(BaseTransformer):
|
|
253
247
|
|
254
248
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
255
249
|
|
256
|
-
|
250
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
257
251
|
if SNOWML_SPROC_ENV in os.environ:
|
258
252
|
statement_params = telemetry.get_function_usage_statement_params(
|
259
253
|
project=_PROJECT,
|
260
254
|
subproject=_SUBPROJECT,
|
261
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
255
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
256
|
+
inspect.currentframe(), MinCovDet.__class__.__name__
|
257
|
+
),
|
262
258
|
api_calls=[Session.call],
|
263
259
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
264
260
|
)
|
@@ -279,7 +275,7 @@ class MinCovDet(BaseTransformer):
|
|
279
275
|
)
|
280
276
|
self._sklearn_object = model_trainer.train()
|
281
277
|
self._is_fitted = True
|
282
|
-
self.
|
278
|
+
self._generate_model_signatures(dataset)
|
283
279
|
return self
|
284
280
|
|
285
281
|
def _batch_inference_validate_snowpark(
|
@@ -353,7 +349,9 @@ class MinCovDet(BaseTransformer):
|
|
353
349
|
# when it is classifier, infer the datatype from label columns
|
354
350
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
355
351
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
356
|
-
label_cols_signatures = [
|
352
|
+
label_cols_signatures = [
|
353
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
354
|
+
]
|
357
355
|
if len(label_cols_signatures) == 0:
|
358
356
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
359
357
|
raise exceptions.SnowflakeMLException(
|
@@ -361,25 +359,22 @@ class MinCovDet(BaseTransformer):
|
|
361
359
|
original_exception=ValueError(error_str),
|
362
360
|
)
|
363
361
|
|
364
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
365
|
-
label_cols_signatures[0].as_snowpark_type()
|
366
|
-
)
|
362
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
367
363
|
|
368
364
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
369
|
-
assert isinstance(
|
365
|
+
assert isinstance(
|
366
|
+
dataset._session, Session
|
367
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
370
368
|
|
371
369
|
transform_kwargs = dict(
|
372
|
-
session
|
373
|
-
dependencies
|
374
|
-
drop_input_cols
|
375
|
-
expected_output_cols_type
|
370
|
+
session=dataset._session,
|
371
|
+
dependencies=self._deps,
|
372
|
+
drop_input_cols=self._drop_input_cols,
|
373
|
+
expected_output_cols_type=expected_type_inferred,
|
376
374
|
)
|
377
375
|
|
378
376
|
elif isinstance(dataset, pd.DataFrame):
|
379
|
-
transform_kwargs = dict(
|
380
|
-
snowpark_input_cols = self._snowpark_cols,
|
381
|
-
drop_input_cols = self._drop_input_cols
|
382
|
-
)
|
377
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
383
378
|
|
384
379
|
transform_handlers = ModelTransformerBuilder.build(
|
385
380
|
dataset=dataset,
|
@@ -419,7 +414,7 @@ class MinCovDet(BaseTransformer):
|
|
419
414
|
Transformed dataset.
|
420
415
|
"""
|
421
416
|
super()._check_dataset_type(dataset)
|
422
|
-
inference_method="transform"
|
417
|
+
inference_method = "transform"
|
423
418
|
|
424
419
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
425
420
|
# are specific to the type of dataset used.
|
@@ -456,17 +451,14 @@ class MinCovDet(BaseTransformer):
|
|
456
451
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
457
452
|
|
458
453
|
transform_kwargs = dict(
|
459
|
-
session
|
460
|
-
dependencies
|
461
|
-
drop_input_cols
|
462
|
-
expected_output_cols_type
|
454
|
+
session=dataset._session,
|
455
|
+
dependencies=self._deps,
|
456
|
+
drop_input_cols=self._drop_input_cols,
|
457
|
+
expected_output_cols_type=expected_dtype,
|
463
458
|
)
|
464
459
|
|
465
460
|
elif isinstance(dataset, pd.DataFrame):
|
466
|
-
transform_kwargs = dict(
|
467
|
-
snowpark_input_cols = self._snowpark_cols,
|
468
|
-
drop_input_cols = self._drop_input_cols
|
469
|
-
)
|
461
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
470
462
|
|
471
463
|
transform_handlers = ModelTransformerBuilder.build(
|
472
464
|
dataset=dataset,
|
@@ -485,7 +477,11 @@ class MinCovDet(BaseTransformer):
|
|
485
477
|
return output_df
|
486
478
|
|
487
479
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
488
|
-
def fit_predict(
|
480
|
+
def fit_predict(
|
481
|
+
self,
|
482
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
483
|
+
output_cols_prefix: str = "fit_predict_",
|
484
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
489
485
|
""" Method not supported for this class.
|
490
486
|
|
491
487
|
|
@@ -510,7 +506,9 @@ class MinCovDet(BaseTransformer):
|
|
510
506
|
)
|
511
507
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
512
508
|
drop_input_cols=self._drop_input_cols,
|
513
|
-
expected_output_cols_list=
|
509
|
+
expected_output_cols_list=(
|
510
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
511
|
+
),
|
514
512
|
)
|
515
513
|
self._sklearn_object = fitted_estimator
|
516
514
|
self._is_fitted = True
|
@@ -527,6 +525,62 @@ class MinCovDet(BaseTransformer):
|
|
527
525
|
assert self._sklearn_object is not None
|
528
526
|
return self._sklearn_object.embedding_
|
529
527
|
|
528
|
+
|
529
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
530
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
531
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
532
|
+
"""
|
533
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
534
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
535
|
+
if output_cols:
|
536
|
+
output_cols = [
|
537
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
538
|
+
for c in output_cols
|
539
|
+
]
|
540
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
541
|
+
output_cols = [output_cols_prefix]
|
542
|
+
elif self._sklearn_object is not None:
|
543
|
+
classes = self._sklearn_object.classes_
|
544
|
+
if isinstance(classes, numpy.ndarray):
|
545
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
546
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
547
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
548
|
+
output_cols = []
|
549
|
+
for i, cl in enumerate(classes):
|
550
|
+
# For binary classification, there is only one output column for each class
|
551
|
+
# ndarray as the two classes are complementary.
|
552
|
+
if len(cl) == 2:
|
553
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
554
|
+
else:
|
555
|
+
output_cols.extend([
|
556
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
557
|
+
])
|
558
|
+
else:
|
559
|
+
output_cols = []
|
560
|
+
|
561
|
+
# Make sure column names are valid snowflake identifiers.
|
562
|
+
assert output_cols is not None # Make MyPy happy
|
563
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
564
|
+
|
565
|
+
return rv
|
566
|
+
|
567
|
+
def _align_expected_output_names(
|
568
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
569
|
+
) -> List[str]:
|
570
|
+
# in case the inferred output column names dimension is different
|
571
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
572
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
573
|
+
output_df_columns = list(output_df_pd.columns)
|
574
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
575
|
+
if self.sample_weight_col:
|
576
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
577
|
+
# if the dimension of inferred output column names is correct; use it
|
578
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
579
|
+
return expected_output_cols_list
|
580
|
+
# otherwise, use the sklearn estimator's output
|
581
|
+
else:
|
582
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
583
|
+
|
530
584
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
531
585
|
@telemetry.send_api_usage_telemetry(
|
532
586
|
project=_PROJECT,
|
@@ -557,24 +611,28 @@ class MinCovDet(BaseTransformer):
|
|
557
611
|
# are specific to the type of dataset used.
|
558
612
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
559
613
|
|
614
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
615
|
+
|
560
616
|
if isinstance(dataset, DataFrame):
|
561
617
|
self._deps = self._batch_inference_validate_snowpark(
|
562
618
|
dataset=dataset,
|
563
619
|
inference_method=inference_method,
|
564
620
|
)
|
565
|
-
assert isinstance(
|
621
|
+
assert isinstance(
|
622
|
+
dataset._session, Session
|
623
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
566
624
|
transform_kwargs = dict(
|
567
625
|
session=dataset._session,
|
568
626
|
dependencies=self._deps,
|
569
|
-
drop_input_cols
|
627
|
+
drop_input_cols=self._drop_input_cols,
|
570
628
|
expected_output_cols_type="float",
|
571
629
|
)
|
630
|
+
expected_output_cols = self._align_expected_output_names(
|
631
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
632
|
+
)
|
572
633
|
|
573
634
|
elif isinstance(dataset, pd.DataFrame):
|
574
|
-
transform_kwargs = dict(
|
575
|
-
snowpark_input_cols = self._snowpark_cols,
|
576
|
-
drop_input_cols = self._drop_input_cols
|
577
|
-
)
|
635
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
578
636
|
|
579
637
|
transform_handlers = ModelTransformerBuilder.build(
|
580
638
|
dataset=dataset,
|
@@ -586,7 +644,7 @@ class MinCovDet(BaseTransformer):
|
|
586
644
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
587
645
|
inference_method=inference_method,
|
588
646
|
input_cols=self.input_cols,
|
589
|
-
expected_output_cols=
|
647
|
+
expected_output_cols=expected_output_cols,
|
590
648
|
**transform_kwargs
|
591
649
|
)
|
592
650
|
return output_df
|
@@ -616,7 +674,8 @@ class MinCovDet(BaseTransformer):
|
|
616
674
|
Output dataset with log probability of the sample for each class in the model.
|
617
675
|
"""
|
618
676
|
super()._check_dataset_type(dataset)
|
619
|
-
inference_method="predict_log_proba"
|
677
|
+
inference_method = "predict_log_proba"
|
678
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
620
679
|
|
621
680
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
622
681
|
# are specific to the type of dataset used.
|
@@ -627,18 +686,20 @@ class MinCovDet(BaseTransformer):
|
|
627
686
|
dataset=dataset,
|
628
687
|
inference_method=inference_method,
|
629
688
|
)
|
630
|
-
assert isinstance(
|
689
|
+
assert isinstance(
|
690
|
+
dataset._session, Session
|
691
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
631
692
|
transform_kwargs = dict(
|
632
693
|
session=dataset._session,
|
633
694
|
dependencies=self._deps,
|
634
|
-
drop_input_cols
|
695
|
+
drop_input_cols=self._drop_input_cols,
|
635
696
|
expected_output_cols_type="float",
|
636
697
|
)
|
698
|
+
expected_output_cols = self._align_expected_output_names(
|
699
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
700
|
+
)
|
637
701
|
elif isinstance(dataset, pd.DataFrame):
|
638
|
-
transform_kwargs = dict(
|
639
|
-
snowpark_input_cols = self._snowpark_cols,
|
640
|
-
drop_input_cols = self._drop_input_cols
|
641
|
-
)
|
702
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
642
703
|
|
643
704
|
transform_handlers = ModelTransformerBuilder.build(
|
644
705
|
dataset=dataset,
|
@@ -651,7 +712,7 @@ class MinCovDet(BaseTransformer):
|
|
651
712
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
652
713
|
inference_method=inference_method,
|
653
714
|
input_cols=self.input_cols,
|
654
|
-
expected_output_cols=
|
715
|
+
expected_output_cols=expected_output_cols,
|
655
716
|
**transform_kwargs
|
656
717
|
)
|
657
718
|
return output_df
|
@@ -677,30 +738,34 @@ class MinCovDet(BaseTransformer):
|
|
677
738
|
Output dataset with results of the decision function for the samples in input dataset.
|
678
739
|
"""
|
679
740
|
super()._check_dataset_type(dataset)
|
680
|
-
inference_method="decision_function"
|
741
|
+
inference_method = "decision_function"
|
681
742
|
|
682
743
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
683
744
|
# are specific to the type of dataset used.
|
684
745
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
685
746
|
|
747
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
748
|
+
|
686
749
|
if isinstance(dataset, DataFrame):
|
687
750
|
self._deps = self._batch_inference_validate_snowpark(
|
688
751
|
dataset=dataset,
|
689
752
|
inference_method=inference_method,
|
690
753
|
)
|
691
|
-
assert isinstance(
|
754
|
+
assert isinstance(
|
755
|
+
dataset._session, Session
|
756
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
692
757
|
transform_kwargs = dict(
|
693
758
|
session=dataset._session,
|
694
759
|
dependencies=self._deps,
|
695
|
-
drop_input_cols
|
760
|
+
drop_input_cols=self._drop_input_cols,
|
696
761
|
expected_output_cols_type="float",
|
697
762
|
)
|
763
|
+
expected_output_cols = self._align_expected_output_names(
|
764
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
765
|
+
)
|
698
766
|
|
699
767
|
elif isinstance(dataset, pd.DataFrame):
|
700
|
-
transform_kwargs = dict(
|
701
|
-
snowpark_input_cols = self._snowpark_cols,
|
702
|
-
drop_input_cols = self._drop_input_cols
|
703
|
-
)
|
768
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
704
769
|
|
705
770
|
transform_handlers = ModelTransformerBuilder.build(
|
706
771
|
dataset=dataset,
|
@@ -713,7 +778,7 @@ class MinCovDet(BaseTransformer):
|
|
713
778
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
714
779
|
inference_method=inference_method,
|
715
780
|
input_cols=self.input_cols,
|
716
|
-
expected_output_cols=
|
781
|
+
expected_output_cols=expected_output_cols,
|
717
782
|
**transform_kwargs
|
718
783
|
)
|
719
784
|
return output_df
|
@@ -742,12 +807,14 @@ class MinCovDet(BaseTransformer):
|
|
742
807
|
Output dataset with probability of the sample for each class in the model.
|
743
808
|
"""
|
744
809
|
super()._check_dataset_type(dataset)
|
745
|
-
inference_method="score_samples"
|
810
|
+
inference_method = "score_samples"
|
746
811
|
|
747
812
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
748
813
|
# are specific to the type of dataset used.
|
749
814
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
750
815
|
|
816
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
817
|
+
|
751
818
|
if isinstance(dataset, DataFrame):
|
752
819
|
self._deps = self._batch_inference_validate_snowpark(
|
753
820
|
dataset=dataset,
|
@@ -760,6 +827,9 @@ class MinCovDet(BaseTransformer):
|
|
760
827
|
drop_input_cols = self._drop_input_cols,
|
761
828
|
expected_output_cols_type="float",
|
762
829
|
)
|
830
|
+
expected_output_cols = self._align_expected_output_names(
|
831
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
832
|
+
)
|
763
833
|
|
764
834
|
elif isinstance(dataset, pd.DataFrame):
|
765
835
|
transform_kwargs = dict(
|
@@ -778,7 +848,7 @@ class MinCovDet(BaseTransformer):
|
|
778
848
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
779
849
|
inference_method=inference_method,
|
780
850
|
input_cols=self.input_cols,
|
781
|
-
expected_output_cols=
|
851
|
+
expected_output_cols=expected_output_cols,
|
782
852
|
**transform_kwargs
|
783
853
|
)
|
784
854
|
return output_df
|
@@ -925,50 +995,84 @@ class MinCovDet(BaseTransformer):
|
|
925
995
|
)
|
926
996
|
return output_df
|
927
997
|
|
998
|
+
|
999
|
+
|
1000
|
+
def to_sklearn(self) -> Any:
|
1001
|
+
"""Get sklearn.covariance.MinCovDet object.
|
1002
|
+
"""
|
1003
|
+
if self._sklearn_object is None:
|
1004
|
+
self._sklearn_object = self._create_sklearn_object()
|
1005
|
+
return self._sklearn_object
|
1006
|
+
|
1007
|
+
def to_xgboost(self) -> Any:
|
1008
|
+
raise exceptions.SnowflakeMLException(
|
1009
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1010
|
+
original_exception=AttributeError(
|
1011
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1012
|
+
"to_xgboost()",
|
1013
|
+
"to_sklearn()"
|
1014
|
+
)
|
1015
|
+
),
|
1016
|
+
)
|
1017
|
+
|
1018
|
+
def to_lightgbm(self) -> Any:
|
1019
|
+
raise exceptions.SnowflakeMLException(
|
1020
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1021
|
+
original_exception=AttributeError(
|
1022
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1023
|
+
"to_lightgbm()",
|
1024
|
+
"to_sklearn()"
|
1025
|
+
)
|
1026
|
+
),
|
1027
|
+
)
|
928
1028
|
|
929
|
-
def
|
1029
|
+
def _get_dependencies(self) -> List[str]:
|
1030
|
+
return self._deps
|
1031
|
+
|
1032
|
+
|
1033
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
930
1034
|
self._model_signature_dict = dict()
|
931
1035
|
|
932
1036
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
933
1037
|
|
934
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1038
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
935
1039
|
outputs: List[BaseFeatureSpec] = []
|
936
1040
|
if hasattr(self, "predict"):
|
937
1041
|
# keep mypy happy
|
938
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1042
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
939
1043
|
# For classifier, the type of predict is the same as the type of label
|
940
|
-
if self._sklearn_object._estimator_type ==
|
941
|
-
|
1044
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1045
|
+
# label columns is the desired type for output
|
942
1046
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
943
1047
|
# rename the output columns
|
944
1048
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
945
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
946
|
-
|
947
|
-
|
1049
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1050
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1051
|
+
)
|
948
1052
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
949
1053
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
950
|
-
# Clusterer returns int64 cluster labels.
|
1054
|
+
# Clusterer returns int64 cluster labels.
|
951
1055
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
952
1056
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
953
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
954
|
-
|
955
|
-
|
956
|
-
|
1057
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1058
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1059
|
+
)
|
1060
|
+
|
957
1061
|
# For regressor, the type of predict is float64
|
958
|
-
elif self._sklearn_object._estimator_type ==
|
1062
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
959
1063
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
960
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
961
|
-
|
962
|
-
|
963
|
-
|
1064
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1065
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1066
|
+
)
|
1067
|
+
|
964
1068
|
for prob_func in PROB_FUNCTIONS:
|
965
1069
|
if hasattr(self, prob_func):
|
966
1070
|
output_cols_prefix: str = f"{prob_func}_"
|
967
1071
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
968
1072
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
969
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
970
|
-
|
971
|
-
|
1073
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1074
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1075
|
+
)
|
972
1076
|
|
973
1077
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
974
1078
|
items = list(self._model_signature_dict.items())
|
@@ -981,10 +1085,10 @@ class MinCovDet(BaseTransformer):
|
|
981
1085
|
"""Returns model signature of current class.
|
982
1086
|
|
983
1087
|
Raises:
|
984
|
-
|
1088
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
985
1089
|
|
986
1090
|
Returns:
|
987
|
-
Dict
|
1091
|
+
Dict with each method and its input output signature
|
988
1092
|
"""
|
989
1093
|
if self._model_signature_dict is None:
|
990
1094
|
raise exceptions.SnowflakeMLException(
|
@@ -992,35 +1096,3 @@ class MinCovDet(BaseTransformer):
|
|
992
1096
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
993
1097
|
)
|
994
1098
|
return self._model_signature_dict
|
995
|
-
|
996
|
-
def to_sklearn(self) -> Any:
|
997
|
-
"""Get sklearn.covariance.MinCovDet object.
|
998
|
-
"""
|
999
|
-
if self._sklearn_object is None:
|
1000
|
-
self._sklearn_object = self._create_sklearn_object()
|
1001
|
-
return self._sklearn_object
|
1002
|
-
|
1003
|
-
def to_xgboost(self) -> Any:
|
1004
|
-
raise exceptions.SnowflakeMLException(
|
1005
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1006
|
-
original_exception=AttributeError(
|
1007
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1008
|
-
"to_xgboost()",
|
1009
|
-
"to_sklearn()"
|
1010
|
-
)
|
1011
|
-
),
|
1012
|
-
)
|
1013
|
-
|
1014
|
-
def to_lightgbm(self) -> Any:
|
1015
|
-
raise exceptions.SnowflakeMLException(
|
1016
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1017
|
-
original_exception=AttributeError(
|
1018
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1019
|
-
"to_lightgbm()",
|
1020
|
-
"to_sklearn()"
|
1021
|
-
)
|
1022
|
-
),
|
1023
|
-
)
|
1024
|
-
|
1025
|
-
def _get_dependencies(self) -> List[str]:
|
1026
|
-
return self._deps
|