snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
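A minimal sketch of how a per-file summary like the listing above can be produced locally with only the Python standard library. The wheel filenames and the +/- counting logic here are illustrative assumptions, not the tooling used by the registry diff service:

```python
import difflib
import zipfile

# Assumed local paths (canonical PyPI wheel filenames); adjust as needed.
OLD_WHEEL = "snowflake_ml_python-1.4.0-py3-none-any.whl"
NEW_WHEEL = "snowflake_ml_python-1.4.1-py3-none-any.whl"


def wheel_texts(path: str) -> dict[str, list[str]]:
    """Read every member of a wheel (a zip archive) as a list of text lines."""
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace").splitlines(keepends=True)
            for name in zf.namelist()
        }


old_files = wheel_texts(OLD_WHEEL)
new_files = wheel_texts(NEW_WHEEL)

for name in sorted(set(old_files) | set(new_files)):
    diff = list(
        difflib.unified_diff(
            old_files.get(name, []), new_files.get(name, []), fromfile=name, tofile=name
        )
    )
    if diff:
        # Count changed lines, skipping the "+++"/"---" file headers.
        added = sum(1 for line in diff if line.startswith("+") and not line.startswith("+++"))
        removed = sum(1 for line in diff if line.startswith("-") and not line.startswith("---"))
        print(f"{name} +{added} -{removed}")
```

The diff excerpt that follows shows the changes to one of these files.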
snowflake/ml/modeling/covariance/empirical_covariance.py

@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
     BatchInferenceKwargsTypedDict,
     ScoreKwargsTypedDict
 )
+from snowflake.ml.model._signatures import utils as model_signature_utils
+from snowflake.ml.model.model_signature import (
+    BaseFeatureSpec,
+    DataType,
+    FeatureSpec,
+    ModelSignature,
+    _infer_signature,
+    _rename_signature_with_snowflake_identifiers,
+)
 
 from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
 
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     validate_sklearn_args,
 )
 
-from snowflake.ml.model.model_signature import (
-    DataType,
-    FeatureSpec,
-    ModelSignature,
-    _infer_signature,
-    _rename_signature_with_snowflake_identifiers,
-    BaseFeatureSpec,
-)
-from snowflake.ml.model._signatures import utils as model_signature_utils
-
 _PROJECT = "ModelDevelopment"
 # Derive subproject from module name by removing "sklearn"
 # and converting module name from underscore to CamelCase
@@ -202,12 +201,7 @@ class EmpiricalCovariance(BaseTransformer):
         )
         return selected_cols
 
-    @telemetry.send_api_usage_telemetry(
-        project=_PROJECT,
-        subproject=_SUBPROJECT,
-        custom_tags=dict([("autogen", True)]),
-    )
-    def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "EmpiricalCovariance":
+    def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "EmpiricalCovariance":
         """Fit the maximum likelihood covariance estimator to X
         For more details on this function, see [sklearn.covariance.EmpiricalCovariance.fit]
         (https://scikit-learn.org/stable/modules/generated/sklearn.covariance.EmpiricalCovariance.html#sklearn.covariance.EmpiricalCovariance.fit)
@@ -234,12 +228,14 @@ class EmpiricalCovariance(BaseTransformer):
 
         self._snowpark_cols = dataset.select(self.input_cols).columns
 
-
+        # If we are already in a stored procedure, no need to kick off another one.
         if SNOWML_SPROC_ENV in os.environ:
             statement_params = telemetry.get_function_usage_statement_params(
                 project=_PROJECT,
                 subproject=_SUBPROJECT,
-                function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), EmpiricalCovariance.__class__.__name__),
+                function_name=telemetry.get_statement_params_full_func_name(
+                    inspect.currentframe(), EmpiricalCovariance.__class__.__name__
+                ),
                 api_calls=[Session.call],
                 custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
             )
@@ -260,7 +256,7 @@ class EmpiricalCovariance(BaseTransformer):
             )
         self._sklearn_object = model_trainer.train()
         self._is_fitted = True
-        self.
+        self._generate_model_signatures(dataset)
         return self
 
     def _batch_inference_validate_snowpark(
@@ -334,7 +330,9 @@ class EmpiricalCovariance(BaseTransformer):
             # when it is classifier, infer the datatype from label columns
             if expected_type_inferred == "" and 'predict' in self.model_signatures:
                 # Batch inference takes a single expected output column type. Use the first columns type for now.
-                label_cols_signatures = [row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols]
+                label_cols_signatures = [
+                    row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
+                ]
                 if len(label_cols_signatures) == 0:
                     error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
                     raise exceptions.SnowflakeMLException(
@@ -342,25 +340,22 @@ class EmpiricalCovariance(BaseTransformer):
                         original_exception=ValueError(error_str),
                     )
 
-                expected_type_inferred = convert_sp_to_sf_type(
-                    label_cols_signatures[0].as_snowpark_type()
-                )
+                expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
 
             self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
 
             transform_kwargs = dict(
-                session = dataset._session,
-                dependencies = self._deps,
-                drop_input_cols = self._drop_input_cols,
-                expected_output_cols_type = expected_type_inferred,
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_type_inferred,
             )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -400,7 +395,7 @@ class EmpiricalCovariance(BaseTransformer):
             Transformed dataset.
         """
         super()._check_dataset_type(dataset)
-        inference_method="transform"
+        inference_method = "transform"
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
@@ -437,17 +432,14 @@ class EmpiricalCovariance(BaseTransformer):
             assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
 
             transform_kwargs = dict(
-                session = dataset._session,
-                dependencies = self._deps,
-                drop_input_cols = self._drop_input_cols,
-                expected_output_cols_type = expected_dtype,
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_dtype,
             )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -466,7 +458,11 @@ class EmpiricalCovariance(BaseTransformer):
         return output_df
 
     @available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
-    def fit_predict(
+    def fit_predict(
+        self,
+        dataset: Union[DataFrame, pd.DataFrame],
+        output_cols_prefix: str = "fit_predict_",
+    ) -> Union[DataFrame, pd.DataFrame]:
         """ Method not supported for this class.
 
 
@@ -491,7 +487,9 @@ class EmpiricalCovariance(BaseTransformer):
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
             drop_input_cols=self._drop_input_cols,
-            expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
+            expected_output_cols_list=(
+                self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
+            ),
         )
         self._sklearn_object = fitted_estimator
         self._is_fitted = True
@@ -508,6 +506,62 @@ class EmpiricalCovariance(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
+
+    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
+        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
+        """
+        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
+        # The following condition is introduced for kneighbors methods, and not used in other methods
+        if output_cols:
+            output_cols = [
+                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
+                for c in output_cols
+            ]
+        elif getattr(self._sklearn_object, "classes_", None) is None:
+            output_cols = [output_cols_prefix]
+        elif self._sklearn_object is not None:
+            classes = self._sklearn_object.classes_
+            if isinstance(classes, numpy.ndarray):
+                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
+            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
+                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
+                output_cols = []
+                for i, cl in enumerate(classes):
+                    # For binary classification, there is only one output column for each class
+                    # ndarray as the two classes are complementary.
+                    if len(cl) == 2:
+                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
+                    else:
+                        output_cols.extend([
+                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
+                        ])
+            else:
+                output_cols = []
+
+        # Make sure column names are valid snowflake identifiers.
+        assert output_cols is not None # Make MyPy happy
+        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
+
+        return rv
+
+    def _align_expected_output_names(
+        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
+    ) -> List[str]:
+        # in case the inferred output column names dimension is different
+        # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
+        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        output_df_columns = list(output_df_pd.columns)
+        output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
+        if self.sample_weight_col:
+            output_df_columns_set -= set(self.sample_weight_col)
+        # if the dimension of inferred output column names is correct; use it
+        if len(expected_output_cols_list) == len(output_df_columns_set):
+            return expected_output_cols_list
+        # otherwise, use the sklearn estimator's output
+        else:
+            return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
+
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -538,24 +592,28 @@ class EmpiricalCovariance(BaseTransformer):
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -567,7 +625,7 @@ class EmpiricalCovariance(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -597,7 +655,8 @@ class EmpiricalCovariance(BaseTransformer):
             Output dataset with log probability of the sample for each class in the model.
         """
         super()._check_dataset_type(dataset)
-        inference_method="predict_log_proba"
+        inference_method = "predict_log_proba"
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
@@ -608,18 +667,20 @@ class EmpiricalCovariance(BaseTransformer):
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -632,7 +693,7 @@ class EmpiricalCovariance(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -658,30 +719,34 @@ class EmpiricalCovariance(BaseTransformer):
             Output dataset with results of the decision function for the samples in input dataset.
         """
         super()._check_dataset_type(dataset)
-        inference_method="decision_function"
+        inference_method = "decision_function"
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -694,7 +759,7 @@ class EmpiricalCovariance(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -723,12 +788,14 @@ class EmpiricalCovariance(BaseTransformer):
             Output dataset with probability of the sample for each class in the model.
         """
         super()._check_dataset_type(dataset)
-        inference_method="score_samples"
+        inference_method = "score_samples"
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
@@ -741,6 +808,9 @@ class EmpiricalCovariance(BaseTransformer):
                 drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
             transform_kwargs = dict(
@@ -759,7 +829,7 @@ class EmpiricalCovariance(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -906,50 +976,84 @@ class EmpiricalCovariance(BaseTransformer):
         )
         return output_df
 
+
+
+    def to_sklearn(self) -> Any:
+        """Get sklearn.covariance.EmpiricalCovariance object.
+        """
+        if self._sklearn_object is None:
+            self._sklearn_object = self._create_sklearn_object()
+        return self._sklearn_object
+
+    def to_xgboost(self) -> Any:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.METHOD_NOT_ALLOWED,
+            original_exception=AttributeError(
+                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
+                    "to_xgboost()",
+                    "to_sklearn()"
+                )
+            ),
+        )
+
+    def to_lightgbm(self) -> Any:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.METHOD_NOT_ALLOWED,
+            original_exception=AttributeError(
+                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
+                    "to_lightgbm()",
+                    "to_sklearn()"
+                )
+            ),
+        )
 
-    def
+    def _get_dependencies(self) -> List[str]:
+        return self._deps
+
+
+    def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         self._model_signature_dict = dict()
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input"))
+        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
-            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
+            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
             # For classifier, the type of predict is the same as the type of label
-            if self._sklearn_object._estimator_type ==
-
+            if self._sklearn_object._estimator_type == "classifier":
+                # label columns is the desired type for output
                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
             # For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
             # For outlier models, returns -1 for outliers and 1 for inliers.
-            # Clusterer returns int64 cluster labels.
+            # Clusterer returns int64 cluster labels.
             elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
                 outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
             # For regressor, the type of predict is float64
-            elif self._sklearn_object._estimator_type ==
+            elif self._sklearn_object._estimator_type == "regressor":
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(
-
-
+                self._model_signature_dict[prob_func] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
 
         # Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
         items = list(self._model_signature_dict.items())
@@ -962,10 +1066,10 @@ class EmpiricalCovariance(BaseTransformer):
         """Returns model signature of current class.
 
         Raises:
-
+            SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
 
         Returns:
-            Dict
+            Dict with each method and its input output signature
         """
         if self._model_signature_dict is None:
             raise exceptions.SnowflakeMLException(
@@ -973,35 +1077,3 @@ class EmpiricalCovariance(BaseTransformer):
                 original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
             )
         return self._model_signature_dict
-
-    def to_sklearn(self) -> Any:
-        """Get sklearn.covariance.EmpiricalCovariance object.
-        """
-        if self._sklearn_object is None:
-            self._sklearn_object = self._create_sklearn_object()
-        return self._sklearn_object
-
-    def to_xgboost(self) -> Any:
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.METHOD_NOT_ALLOWED,
-            original_exception=AttributeError(
-                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
-                    "to_xgboost()",
-                    "to_sklearn()"
-                )
-            ),
-        )
-
-    def to_lightgbm(self) -> Any:
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.METHOD_NOT_ALLOWED,
-            original_exception=AttributeError(
-                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
-                    "to_lightgbm()",
-                    "to_sklearn()"
-                )
-            ),
-        )
-
-    def _get_dependencies(self) -> List[str]:
-        return self._deps