snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -280,12 +279,7 @@ class LocalOutlierFactor(BaseTransformer):
|
|
280
279
|
)
|
281
280
|
return selected_cols
|
282
281
|
|
283
|
-
|
284
|
-
project=_PROJECT,
|
285
|
-
subproject=_SUBPROJECT,
|
286
|
-
custom_tags=dict([("autogen", True)]),
|
287
|
-
)
|
288
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "LocalOutlierFactor":
|
282
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "LocalOutlierFactor":
|
289
283
|
"""Fit the local outlier factor detector from the training dataset
|
290
284
|
For more details on this function, see [sklearn.neighbors.LocalOutlierFactor.fit]
|
291
285
|
(https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor.fit)
|
@@ -312,12 +306,14 @@ class LocalOutlierFactor(BaseTransformer):
|
|
312
306
|
|
313
307
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
314
308
|
|
315
|
-
|
309
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
316
310
|
if SNOWML_SPROC_ENV in os.environ:
|
317
311
|
statement_params = telemetry.get_function_usage_statement_params(
|
318
312
|
project=_PROJECT,
|
319
313
|
subproject=_SUBPROJECT,
|
320
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
314
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
315
|
+
inspect.currentframe(), LocalOutlierFactor.__class__.__name__
|
316
|
+
),
|
321
317
|
api_calls=[Session.call],
|
322
318
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
323
319
|
)
|
@@ -338,7 +334,7 @@ class LocalOutlierFactor(BaseTransformer):
|
|
338
334
|
)
|
339
335
|
self._sklearn_object = model_trainer.train()
|
340
336
|
self._is_fitted = True
|
341
|
-
self.
|
337
|
+
self._generate_model_signatures(dataset)
|
342
338
|
return self
|
343
339
|
|
344
340
|
def _batch_inference_validate_snowpark(
|
@@ -414,7 +410,9 @@ class LocalOutlierFactor(BaseTransformer):
|
|
414
410
|
# when it is classifier, infer the datatype from label columns
|
415
411
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
416
412
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
417
|
-
label_cols_signatures = [
|
413
|
+
label_cols_signatures = [
|
414
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
415
|
+
]
|
418
416
|
if len(label_cols_signatures) == 0:
|
419
417
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
420
418
|
raise exceptions.SnowflakeMLException(
|
@@ -422,25 +420,22 @@ class LocalOutlierFactor(BaseTransformer):
|
|
422
420
|
original_exception=ValueError(error_str),
|
423
421
|
)
|
424
422
|
|
425
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
426
|
-
label_cols_signatures[0].as_snowpark_type()
|
427
|
-
)
|
423
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
428
424
|
|
429
425
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
430
|
-
assert isinstance(
|
426
|
+
assert isinstance(
|
427
|
+
dataset._session, Session
|
428
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
431
429
|
|
432
430
|
transform_kwargs = dict(
|
433
|
-
session
|
434
|
-
dependencies
|
435
|
-
drop_input_cols
|
436
|
-
expected_output_cols_type
|
431
|
+
session=dataset._session,
|
432
|
+
dependencies=self._deps,
|
433
|
+
drop_input_cols=self._drop_input_cols,
|
434
|
+
expected_output_cols_type=expected_type_inferred,
|
437
435
|
)
|
438
436
|
|
439
437
|
elif isinstance(dataset, pd.DataFrame):
|
440
|
-
transform_kwargs = dict(
|
441
|
-
snowpark_input_cols = self._snowpark_cols,
|
442
|
-
drop_input_cols = self._drop_input_cols
|
443
|
-
)
|
438
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
444
439
|
|
445
440
|
transform_handlers = ModelTransformerBuilder.build(
|
446
441
|
dataset=dataset,
|
@@ -480,7 +475,7 @@ class LocalOutlierFactor(BaseTransformer):
|
|
480
475
|
Transformed dataset.
|
481
476
|
"""
|
482
477
|
super()._check_dataset_type(dataset)
|
483
|
-
inference_method="transform"
|
478
|
+
inference_method = "transform"
|
484
479
|
|
485
480
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
486
481
|
# are specific to the type of dataset used.
|
@@ -517,17 +512,14 @@ class LocalOutlierFactor(BaseTransformer):
|
|
517
512
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
518
513
|
|
519
514
|
transform_kwargs = dict(
|
520
|
-
session
|
521
|
-
dependencies
|
522
|
-
drop_input_cols
|
523
|
-
expected_output_cols_type
|
515
|
+
session=dataset._session,
|
516
|
+
dependencies=self._deps,
|
517
|
+
drop_input_cols=self._drop_input_cols,
|
518
|
+
expected_output_cols_type=expected_dtype,
|
524
519
|
)
|
525
520
|
|
526
521
|
elif isinstance(dataset, pd.DataFrame):
|
527
|
-
transform_kwargs = dict(
|
528
|
-
snowpark_input_cols = self._snowpark_cols,
|
529
|
-
drop_input_cols = self._drop_input_cols
|
530
|
-
)
|
522
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
531
523
|
|
532
524
|
transform_handlers = ModelTransformerBuilder.build(
|
533
525
|
dataset=dataset,
|
@@ -546,7 +538,11 @@ class LocalOutlierFactor(BaseTransformer):
|
|
546
538
|
return output_df
|
547
539
|
|
548
540
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
549
|
-
def fit_predict(
|
541
|
+
def fit_predict(
|
542
|
+
self,
|
543
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
544
|
+
output_cols_prefix: str = "fit_predict_",
|
545
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
550
546
|
""" Fit the model to the training set X and return the labels
|
551
547
|
For more details on this function, see [sklearn.neighbors.LocalOutlierFactor.fit_predict]
|
552
548
|
(https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor.fit_predict)
|
@@ -573,7 +569,9 @@ class LocalOutlierFactor(BaseTransformer):
|
|
573
569
|
)
|
574
570
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
575
571
|
drop_input_cols=self._drop_input_cols,
|
576
|
-
expected_output_cols_list=
|
572
|
+
expected_output_cols_list=(
|
573
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
574
|
+
),
|
577
575
|
)
|
578
576
|
self._sklearn_object = fitted_estimator
|
579
577
|
self._is_fitted = True
|
@@ -590,6 +588,62 @@ class LocalOutlierFactor(BaseTransformer):
|
|
590
588
|
assert self._sklearn_object is not None
|
591
589
|
return self._sklearn_object.embedding_
|
592
590
|
|
591
|
+
|
592
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
593
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
594
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
595
|
+
"""
|
596
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
597
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
598
|
+
if output_cols:
|
599
|
+
output_cols = [
|
600
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
601
|
+
for c in output_cols
|
602
|
+
]
|
603
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
604
|
+
output_cols = [output_cols_prefix]
|
605
|
+
elif self._sklearn_object is not None:
|
606
|
+
classes = self._sklearn_object.classes_
|
607
|
+
if isinstance(classes, numpy.ndarray):
|
608
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
609
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
610
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
611
|
+
output_cols = []
|
612
|
+
for i, cl in enumerate(classes):
|
613
|
+
# For binary classification, there is only one output column for each class
|
614
|
+
# ndarray as the two classes are complementary.
|
615
|
+
if len(cl) == 2:
|
616
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
617
|
+
else:
|
618
|
+
output_cols.extend([
|
619
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
620
|
+
])
|
621
|
+
else:
|
622
|
+
output_cols = []
|
623
|
+
|
624
|
+
# Make sure column names are valid snowflake identifiers.
|
625
|
+
assert output_cols is not None # Make MyPy happy
|
626
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
627
|
+
|
628
|
+
return rv
|
629
|
+
|
630
|
+
def _align_expected_output_names(
|
631
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
632
|
+
) -> List[str]:
|
633
|
+
# in case the inferred output column names dimension is different
|
634
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
635
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
636
|
+
output_df_columns = list(output_df_pd.columns)
|
637
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
638
|
+
if self.sample_weight_col:
|
639
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
640
|
+
# if the dimension of inferred output column names is correct; use it
|
641
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
642
|
+
return expected_output_cols_list
|
643
|
+
# otherwise, use the sklearn estimator's output
|
644
|
+
else:
|
645
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
646
|
+
|
593
647
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
594
648
|
@telemetry.send_api_usage_telemetry(
|
595
649
|
project=_PROJECT,
|
@@ -620,24 +674,28 @@ class LocalOutlierFactor(BaseTransformer):
|
|
620
674
|
# are specific to the type of dataset used.
|
621
675
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
622
676
|
|
677
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
678
|
+
|
623
679
|
if isinstance(dataset, DataFrame):
|
624
680
|
self._deps = self._batch_inference_validate_snowpark(
|
625
681
|
dataset=dataset,
|
626
682
|
inference_method=inference_method,
|
627
683
|
)
|
628
|
-
assert isinstance(
|
684
|
+
assert isinstance(
|
685
|
+
dataset._session, Session
|
686
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
629
687
|
transform_kwargs = dict(
|
630
688
|
session=dataset._session,
|
631
689
|
dependencies=self._deps,
|
632
|
-
drop_input_cols
|
690
|
+
drop_input_cols=self._drop_input_cols,
|
633
691
|
expected_output_cols_type="float",
|
634
692
|
)
|
693
|
+
expected_output_cols = self._align_expected_output_names(
|
694
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
695
|
+
)
|
635
696
|
|
636
697
|
elif isinstance(dataset, pd.DataFrame):
|
637
|
-
transform_kwargs = dict(
|
638
|
-
snowpark_input_cols = self._snowpark_cols,
|
639
|
-
drop_input_cols = self._drop_input_cols
|
640
|
-
)
|
698
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
641
699
|
|
642
700
|
transform_handlers = ModelTransformerBuilder.build(
|
643
701
|
dataset=dataset,
|
@@ -649,7 +707,7 @@ class LocalOutlierFactor(BaseTransformer):
|
|
649
707
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
650
708
|
inference_method=inference_method,
|
651
709
|
input_cols=self.input_cols,
|
652
|
-
expected_output_cols=
|
710
|
+
expected_output_cols=expected_output_cols,
|
653
711
|
**transform_kwargs
|
654
712
|
)
|
655
713
|
return output_df
|
@@ -679,7 +737,8 @@ class LocalOutlierFactor(BaseTransformer):
|
|
679
737
|
Output dataset with log probability of the sample for each class in the model.
|
680
738
|
"""
|
681
739
|
super()._check_dataset_type(dataset)
|
682
|
-
inference_method="predict_log_proba"
|
740
|
+
inference_method = "predict_log_proba"
|
741
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
683
742
|
|
684
743
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
685
744
|
# are specific to the type of dataset used.
|
@@ -690,18 +749,20 @@ class LocalOutlierFactor(BaseTransformer):
|
|
690
749
|
dataset=dataset,
|
691
750
|
inference_method=inference_method,
|
692
751
|
)
|
693
|
-
assert isinstance(
|
752
|
+
assert isinstance(
|
753
|
+
dataset._session, Session
|
754
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
694
755
|
transform_kwargs = dict(
|
695
756
|
session=dataset._session,
|
696
757
|
dependencies=self._deps,
|
697
|
-
drop_input_cols
|
758
|
+
drop_input_cols=self._drop_input_cols,
|
698
759
|
expected_output_cols_type="float",
|
699
760
|
)
|
761
|
+
expected_output_cols = self._align_expected_output_names(
|
762
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
763
|
+
)
|
700
764
|
elif isinstance(dataset, pd.DataFrame):
|
701
|
-
transform_kwargs = dict(
|
702
|
-
snowpark_input_cols = self._snowpark_cols,
|
703
|
-
drop_input_cols = self._drop_input_cols
|
704
|
-
)
|
765
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
705
766
|
|
706
767
|
transform_handlers = ModelTransformerBuilder.build(
|
707
768
|
dataset=dataset,
|
@@ -714,7 +775,7 @@ class LocalOutlierFactor(BaseTransformer):
|
|
714
775
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
715
776
|
inference_method=inference_method,
|
716
777
|
input_cols=self.input_cols,
|
717
|
-
expected_output_cols=
|
778
|
+
expected_output_cols=expected_output_cols,
|
718
779
|
**transform_kwargs
|
719
780
|
)
|
720
781
|
return output_df
|
@@ -742,30 +803,34 @@ class LocalOutlierFactor(BaseTransformer):
|
|
742
803
|
Output dataset with results of the decision function for the samples in input dataset.
|
743
804
|
"""
|
744
805
|
super()._check_dataset_type(dataset)
|
745
|
-
inference_method="decision_function"
|
806
|
+
inference_method = "decision_function"
|
746
807
|
|
747
808
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
748
809
|
# are specific to the type of dataset used.
|
749
810
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
750
811
|
|
812
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
813
|
+
|
751
814
|
if isinstance(dataset, DataFrame):
|
752
815
|
self._deps = self._batch_inference_validate_snowpark(
|
753
816
|
dataset=dataset,
|
754
817
|
inference_method=inference_method,
|
755
818
|
)
|
756
|
-
assert isinstance(
|
819
|
+
assert isinstance(
|
820
|
+
dataset._session, Session
|
821
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
757
822
|
transform_kwargs = dict(
|
758
823
|
session=dataset._session,
|
759
824
|
dependencies=self._deps,
|
760
|
-
drop_input_cols
|
825
|
+
drop_input_cols=self._drop_input_cols,
|
761
826
|
expected_output_cols_type="float",
|
762
827
|
)
|
828
|
+
expected_output_cols = self._align_expected_output_names(
|
829
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
830
|
+
)
|
763
831
|
|
764
832
|
elif isinstance(dataset, pd.DataFrame):
|
765
|
-
transform_kwargs = dict(
|
766
|
-
snowpark_input_cols = self._snowpark_cols,
|
767
|
-
drop_input_cols = self._drop_input_cols
|
768
|
-
)
|
833
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
769
834
|
|
770
835
|
transform_handlers = ModelTransformerBuilder.build(
|
771
836
|
dataset=dataset,
|
@@ -778,7 +843,7 @@ class LocalOutlierFactor(BaseTransformer):
|
|
778
843
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
779
844
|
inference_method=inference_method,
|
780
845
|
input_cols=self.input_cols,
|
781
|
-
expected_output_cols=
|
846
|
+
expected_output_cols=expected_output_cols,
|
782
847
|
**transform_kwargs
|
783
848
|
)
|
784
849
|
return output_df
|
@@ -809,12 +874,14 @@ class LocalOutlierFactor(BaseTransformer):
|
|
809
874
|
Output dataset with probability of the sample for each class in the model.
|
810
875
|
"""
|
811
876
|
super()._check_dataset_type(dataset)
|
812
|
-
inference_method="score_samples"
|
877
|
+
inference_method = "score_samples"
|
813
878
|
|
814
879
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
815
880
|
# are specific to the type of dataset used.
|
816
881
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
817
882
|
|
883
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
884
|
+
|
818
885
|
if isinstance(dataset, DataFrame):
|
819
886
|
self._deps = self._batch_inference_validate_snowpark(
|
820
887
|
dataset=dataset,
|
@@ -827,6 +894,9 @@ class LocalOutlierFactor(BaseTransformer):
|
|
827
894
|
drop_input_cols = self._drop_input_cols,
|
828
895
|
expected_output_cols_type="float",
|
829
896
|
)
|
897
|
+
expected_output_cols = self._align_expected_output_names(
|
898
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
899
|
+
)
|
830
900
|
|
831
901
|
elif isinstance(dataset, pd.DataFrame):
|
832
902
|
transform_kwargs = dict(
|
@@ -845,7 +915,7 @@ class LocalOutlierFactor(BaseTransformer):
|
|
845
915
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
846
916
|
inference_method=inference_method,
|
847
917
|
input_cols=self.input_cols,
|
848
|
-
expected_output_cols=
|
918
|
+
expected_output_cols=expected_output_cols,
|
849
919
|
**transform_kwargs
|
850
920
|
)
|
851
921
|
return output_df
|
@@ -992,50 +1062,84 @@ class LocalOutlierFactor(BaseTransformer):
|
|
992
1062
|
)
|
993
1063
|
return output_df
|
994
1064
|
|
1065
|
+
|
1066
|
+
|
1067
|
+
def to_sklearn(self) -> Any:
|
1068
|
+
"""Get sklearn.neighbors.LocalOutlierFactor object.
|
1069
|
+
"""
|
1070
|
+
if self._sklearn_object is None:
|
1071
|
+
self._sklearn_object = self._create_sklearn_object()
|
1072
|
+
return self._sklearn_object
|
1073
|
+
|
1074
|
+
def to_xgboost(self) -> Any:
|
1075
|
+
raise exceptions.SnowflakeMLException(
|
1076
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1077
|
+
original_exception=AttributeError(
|
1078
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1079
|
+
"to_xgboost()",
|
1080
|
+
"to_sklearn()"
|
1081
|
+
)
|
1082
|
+
),
|
1083
|
+
)
|
1084
|
+
|
1085
|
+
def to_lightgbm(self) -> Any:
|
1086
|
+
raise exceptions.SnowflakeMLException(
|
1087
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1088
|
+
original_exception=AttributeError(
|
1089
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1090
|
+
"to_lightgbm()",
|
1091
|
+
"to_sklearn()"
|
1092
|
+
)
|
1093
|
+
),
|
1094
|
+
)
|
995
1095
|
|
996
|
-
def
|
1096
|
+
def _get_dependencies(self) -> List[str]:
|
1097
|
+
return self._deps
|
1098
|
+
|
1099
|
+
|
1100
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
997
1101
|
self._model_signature_dict = dict()
|
998
1102
|
|
999
1103
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1000
1104
|
|
1001
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1105
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1002
1106
|
outputs: List[BaseFeatureSpec] = []
|
1003
1107
|
if hasattr(self, "predict"):
|
1004
1108
|
# keep mypy happy
|
1005
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1109
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1006
1110
|
# For classifier, the type of predict is the same as the type of label
|
1007
|
-
if self._sklearn_object._estimator_type ==
|
1008
|
-
|
1111
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1112
|
+
# label columns is the desired type for output
|
1009
1113
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1010
1114
|
# rename the output columns
|
1011
1115
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1012
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1013
|
-
|
1014
|
-
|
1116
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1117
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1118
|
+
)
|
1015
1119
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1016
1120
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1017
|
-
# Clusterer returns int64 cluster labels.
|
1121
|
+
# Clusterer returns int64 cluster labels.
|
1018
1122
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1019
1123
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1020
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1021
|
-
|
1022
|
-
|
1023
|
-
|
1124
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1125
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1126
|
+
)
|
1127
|
+
|
1024
1128
|
# For regressor, the type of predict is float64
|
1025
|
-
elif self._sklearn_object._estimator_type ==
|
1129
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1026
1130
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1027
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1131
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1132
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1133
|
+
)
|
1134
|
+
|
1031
1135
|
for prob_func in PROB_FUNCTIONS:
|
1032
1136
|
if hasattr(self, prob_func):
|
1033
1137
|
output_cols_prefix: str = f"{prob_func}_"
|
1034
1138
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1035
1139
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1036
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1037
|
-
|
1038
|
-
|
1140
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1141
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1142
|
+
)
|
1039
1143
|
|
1040
1144
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1041
1145
|
items = list(self._model_signature_dict.items())
|
@@ -1048,10 +1152,10 @@ class LocalOutlierFactor(BaseTransformer):
|
|
1048
1152
|
"""Returns model signature of current class.
|
1049
1153
|
|
1050
1154
|
Raises:
|
1051
|
-
|
1155
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1052
1156
|
|
1053
1157
|
Returns:
|
1054
|
-
Dict
|
1158
|
+
Dict with each method and its input output signature
|
1055
1159
|
"""
|
1056
1160
|
if self._model_signature_dict is None:
|
1057
1161
|
raise exceptions.SnowflakeMLException(
|
@@ -1059,35 +1163,3 @@ class LocalOutlierFactor(BaseTransformer):
|
|
1059
1163
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1060
1164
|
)
|
1061
1165
|
return self._model_signature_dict
|
1062
|
-
|
1063
|
-
def to_sklearn(self) -> Any:
|
1064
|
-
"""Get sklearn.neighbors.LocalOutlierFactor object.
|
1065
|
-
"""
|
1066
|
-
if self._sklearn_object is None:
|
1067
|
-
self._sklearn_object = self._create_sklearn_object()
|
1068
|
-
return self._sklearn_object
|
1069
|
-
|
1070
|
-
def to_xgboost(self) -> Any:
|
1071
|
-
raise exceptions.SnowflakeMLException(
|
1072
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1073
|
-
original_exception=AttributeError(
|
1074
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1075
|
-
"to_xgboost()",
|
1076
|
-
"to_sklearn()"
|
1077
|
-
)
|
1078
|
-
),
|
1079
|
-
)
|
1080
|
-
|
1081
|
-
def to_lightgbm(self) -> Any:
|
1082
|
-
raise exceptions.SnowflakeMLException(
|
1083
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1084
|
-
original_exception=AttributeError(
|
1085
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1086
|
-
"to_lightgbm()",
|
1087
|
-
"to_sklearn()"
|
1088
|
-
)
|
1089
|
-
),
|
1090
|
-
)
|
1091
|
-
|
1092
|
-
def _get_dependencies(self) -> List[str]:
|
1093
|
-
return self._deps
|