snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -263,12 +262,7 @@ class NearestNeighbors(BaseTransformer):
|
|
263
262
|
)
|
264
263
|
return selected_cols
|
265
264
|
|
266
|
-
|
267
|
-
project=_PROJECT,
|
268
|
-
subproject=_SUBPROJECT,
|
269
|
-
custom_tags=dict([("autogen", True)]),
|
270
|
-
)
|
271
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "NearestNeighbors":
|
265
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "NearestNeighbors":
|
272
266
|
"""Fit the nearest neighbors estimator from the training dataset
|
273
267
|
For more details on this function, see [sklearn.neighbors.NearestNeighbors.fit]
|
274
268
|
(https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html#sklearn.neighbors.NearestNeighbors.fit)
|
@@ -295,12 +289,14 @@ class NearestNeighbors(BaseTransformer):
|
|
295
289
|
|
296
290
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
297
291
|
|
298
|
-
|
292
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
299
293
|
if SNOWML_SPROC_ENV in os.environ:
|
300
294
|
statement_params = telemetry.get_function_usage_statement_params(
|
301
295
|
project=_PROJECT,
|
302
296
|
subproject=_SUBPROJECT,
|
303
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
297
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
298
|
+
inspect.currentframe(), NearestNeighbors.__class__.__name__
|
299
|
+
),
|
304
300
|
api_calls=[Session.call],
|
305
301
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
306
302
|
)
|
@@ -321,7 +317,7 @@ class NearestNeighbors(BaseTransformer):
|
|
321
317
|
)
|
322
318
|
self._sklearn_object = model_trainer.train()
|
323
319
|
self._is_fitted = True
|
324
|
-
self.
|
320
|
+
self._generate_model_signatures(dataset)
|
325
321
|
return self
|
326
322
|
|
327
323
|
def _batch_inference_validate_snowpark(
|
@@ -395,7 +391,9 @@ class NearestNeighbors(BaseTransformer):
|
|
395
391
|
# when it is classifier, infer the datatype from label columns
|
396
392
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
397
393
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
398
|
-
label_cols_signatures = [
|
394
|
+
label_cols_signatures = [
|
395
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
396
|
+
]
|
399
397
|
if len(label_cols_signatures) == 0:
|
400
398
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
401
399
|
raise exceptions.SnowflakeMLException(
|
@@ -403,25 +401,22 @@ class NearestNeighbors(BaseTransformer):
|
|
403
401
|
original_exception=ValueError(error_str),
|
404
402
|
)
|
405
403
|
|
406
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
407
|
-
label_cols_signatures[0].as_snowpark_type()
|
408
|
-
)
|
404
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
409
405
|
|
410
406
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
411
|
-
assert isinstance(
|
407
|
+
assert isinstance(
|
408
|
+
dataset._session, Session
|
409
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
412
410
|
|
413
411
|
transform_kwargs = dict(
|
414
|
-
session
|
415
|
-
dependencies
|
416
|
-
drop_input_cols
|
417
|
-
expected_output_cols_type
|
412
|
+
session=dataset._session,
|
413
|
+
dependencies=self._deps,
|
414
|
+
drop_input_cols=self._drop_input_cols,
|
415
|
+
expected_output_cols_type=expected_type_inferred,
|
418
416
|
)
|
419
417
|
|
420
418
|
elif isinstance(dataset, pd.DataFrame):
|
421
|
-
transform_kwargs = dict(
|
422
|
-
snowpark_input_cols = self._snowpark_cols,
|
423
|
-
drop_input_cols = self._drop_input_cols
|
424
|
-
)
|
419
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
425
420
|
|
426
421
|
transform_handlers = ModelTransformerBuilder.build(
|
427
422
|
dataset=dataset,
|
@@ -461,7 +456,7 @@ class NearestNeighbors(BaseTransformer):
|
|
461
456
|
Transformed dataset.
|
462
457
|
"""
|
463
458
|
super()._check_dataset_type(dataset)
|
464
|
-
inference_method="transform"
|
459
|
+
inference_method = "transform"
|
465
460
|
|
466
461
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
467
462
|
# are specific to the type of dataset used.
|
@@ -498,17 +493,14 @@ class NearestNeighbors(BaseTransformer):
|
|
498
493
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
499
494
|
|
500
495
|
transform_kwargs = dict(
|
501
|
-
session
|
502
|
-
dependencies
|
503
|
-
drop_input_cols
|
504
|
-
expected_output_cols_type
|
496
|
+
session=dataset._session,
|
497
|
+
dependencies=self._deps,
|
498
|
+
drop_input_cols=self._drop_input_cols,
|
499
|
+
expected_output_cols_type=expected_dtype,
|
505
500
|
)
|
506
501
|
|
507
502
|
elif isinstance(dataset, pd.DataFrame):
|
508
|
-
transform_kwargs = dict(
|
509
|
-
snowpark_input_cols = self._snowpark_cols,
|
510
|
-
drop_input_cols = self._drop_input_cols
|
511
|
-
)
|
503
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
512
504
|
|
513
505
|
transform_handlers = ModelTransformerBuilder.build(
|
514
506
|
dataset=dataset,
|
@@ -527,7 +519,11 @@ class NearestNeighbors(BaseTransformer):
|
|
527
519
|
return output_df
|
528
520
|
|
529
521
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
530
|
-
def fit_predict(
|
522
|
+
def fit_predict(
|
523
|
+
self,
|
524
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
525
|
+
output_cols_prefix: str = "fit_predict_",
|
526
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
531
527
|
""" Method not supported for this class.
|
532
528
|
|
533
529
|
|
@@ -552,7 +548,9 @@ class NearestNeighbors(BaseTransformer):
|
|
552
548
|
)
|
553
549
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
554
550
|
drop_input_cols=self._drop_input_cols,
|
555
|
-
expected_output_cols_list=
|
551
|
+
expected_output_cols_list=(
|
552
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
553
|
+
),
|
556
554
|
)
|
557
555
|
self._sklearn_object = fitted_estimator
|
558
556
|
self._is_fitted = True
|
@@ -569,6 +567,62 @@ class NearestNeighbors(BaseTransformer):
|
|
569
567
|
assert self._sklearn_object is not None
|
570
568
|
return self._sklearn_object.embedding_
|
571
569
|
|
570
|
+
|
571
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
572
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
573
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
574
|
+
"""
|
575
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
576
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
577
|
+
if output_cols:
|
578
|
+
output_cols = [
|
579
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
580
|
+
for c in output_cols
|
581
|
+
]
|
582
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
583
|
+
output_cols = [output_cols_prefix]
|
584
|
+
elif self._sklearn_object is not None:
|
585
|
+
classes = self._sklearn_object.classes_
|
586
|
+
if isinstance(classes, numpy.ndarray):
|
587
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
588
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
589
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
590
|
+
output_cols = []
|
591
|
+
for i, cl in enumerate(classes):
|
592
|
+
# For binary classification, there is only one output column for each class
|
593
|
+
# ndarray as the two classes are complementary.
|
594
|
+
if len(cl) == 2:
|
595
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
596
|
+
else:
|
597
|
+
output_cols.extend([
|
598
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
599
|
+
])
|
600
|
+
else:
|
601
|
+
output_cols = []
|
602
|
+
|
603
|
+
# Make sure column names are valid snowflake identifiers.
|
604
|
+
assert output_cols is not None # Make MyPy happy
|
605
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
606
|
+
|
607
|
+
return rv
|
608
|
+
|
609
|
+
def _align_expected_output_names(
|
610
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
611
|
+
) -> List[str]:
|
612
|
+
# in case the inferred output column names dimension is different
|
613
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
614
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
615
|
+
output_df_columns = list(output_df_pd.columns)
|
616
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
617
|
+
if self.sample_weight_col:
|
618
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
619
|
+
# if the dimension of inferred output column names is correct; use it
|
620
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
621
|
+
return expected_output_cols_list
|
622
|
+
# otherwise, use the sklearn estimator's output
|
623
|
+
else:
|
624
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
625
|
+
|
572
626
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
573
627
|
@telemetry.send_api_usage_telemetry(
|
574
628
|
project=_PROJECT,
|
@@ -599,24 +653,28 @@ class NearestNeighbors(BaseTransformer):
|
|
599
653
|
# are specific to the type of dataset used.
|
600
654
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
601
655
|
|
656
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
657
|
+
|
602
658
|
if isinstance(dataset, DataFrame):
|
603
659
|
self._deps = self._batch_inference_validate_snowpark(
|
604
660
|
dataset=dataset,
|
605
661
|
inference_method=inference_method,
|
606
662
|
)
|
607
|
-
assert isinstance(
|
663
|
+
assert isinstance(
|
664
|
+
dataset._session, Session
|
665
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
608
666
|
transform_kwargs = dict(
|
609
667
|
session=dataset._session,
|
610
668
|
dependencies=self._deps,
|
611
|
-
drop_input_cols
|
669
|
+
drop_input_cols=self._drop_input_cols,
|
612
670
|
expected_output_cols_type="float",
|
613
671
|
)
|
672
|
+
expected_output_cols = self._align_expected_output_names(
|
673
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
674
|
+
)
|
614
675
|
|
615
676
|
elif isinstance(dataset, pd.DataFrame):
|
616
|
-
transform_kwargs = dict(
|
617
|
-
snowpark_input_cols = self._snowpark_cols,
|
618
|
-
drop_input_cols = self._drop_input_cols
|
619
|
-
)
|
677
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
620
678
|
|
621
679
|
transform_handlers = ModelTransformerBuilder.build(
|
622
680
|
dataset=dataset,
|
@@ -628,7 +686,7 @@ class NearestNeighbors(BaseTransformer):
|
|
628
686
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
629
687
|
inference_method=inference_method,
|
630
688
|
input_cols=self.input_cols,
|
631
|
-
expected_output_cols=
|
689
|
+
expected_output_cols=expected_output_cols,
|
632
690
|
**transform_kwargs
|
633
691
|
)
|
634
692
|
return output_df
|
@@ -658,7 +716,8 @@ class NearestNeighbors(BaseTransformer):
|
|
658
716
|
Output dataset with log probability of the sample for each class in the model.
|
659
717
|
"""
|
660
718
|
super()._check_dataset_type(dataset)
|
661
|
-
inference_method="predict_log_proba"
|
719
|
+
inference_method = "predict_log_proba"
|
720
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
662
721
|
|
663
722
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
664
723
|
# are specific to the type of dataset used.
|
@@ -669,18 +728,20 @@ class NearestNeighbors(BaseTransformer):
|
|
669
728
|
dataset=dataset,
|
670
729
|
inference_method=inference_method,
|
671
730
|
)
|
672
|
-
assert isinstance(
|
731
|
+
assert isinstance(
|
732
|
+
dataset._session, Session
|
733
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
673
734
|
transform_kwargs = dict(
|
674
735
|
session=dataset._session,
|
675
736
|
dependencies=self._deps,
|
676
|
-
drop_input_cols
|
737
|
+
drop_input_cols=self._drop_input_cols,
|
677
738
|
expected_output_cols_type="float",
|
678
739
|
)
|
740
|
+
expected_output_cols = self._align_expected_output_names(
|
741
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
742
|
+
)
|
679
743
|
elif isinstance(dataset, pd.DataFrame):
|
680
|
-
transform_kwargs = dict(
|
681
|
-
snowpark_input_cols = self._snowpark_cols,
|
682
|
-
drop_input_cols = self._drop_input_cols
|
683
|
-
)
|
744
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
684
745
|
|
685
746
|
transform_handlers = ModelTransformerBuilder.build(
|
686
747
|
dataset=dataset,
|
@@ -693,7 +754,7 @@ class NearestNeighbors(BaseTransformer):
|
|
693
754
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
694
755
|
inference_method=inference_method,
|
695
756
|
input_cols=self.input_cols,
|
696
|
-
expected_output_cols=
|
757
|
+
expected_output_cols=expected_output_cols,
|
697
758
|
**transform_kwargs
|
698
759
|
)
|
699
760
|
return output_df
|
@@ -719,30 +780,34 @@ class NearestNeighbors(BaseTransformer):
|
|
719
780
|
Output dataset with results of the decision function for the samples in input dataset.
|
720
781
|
"""
|
721
782
|
super()._check_dataset_type(dataset)
|
722
|
-
inference_method="decision_function"
|
783
|
+
inference_method = "decision_function"
|
723
784
|
|
724
785
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
725
786
|
# are specific to the type of dataset used.
|
726
787
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
727
788
|
|
789
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
790
|
+
|
728
791
|
if isinstance(dataset, DataFrame):
|
729
792
|
self._deps = self._batch_inference_validate_snowpark(
|
730
793
|
dataset=dataset,
|
731
794
|
inference_method=inference_method,
|
732
795
|
)
|
733
|
-
assert isinstance(
|
796
|
+
assert isinstance(
|
797
|
+
dataset._session, Session
|
798
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
734
799
|
transform_kwargs = dict(
|
735
800
|
session=dataset._session,
|
736
801
|
dependencies=self._deps,
|
737
|
-
drop_input_cols
|
802
|
+
drop_input_cols=self._drop_input_cols,
|
738
803
|
expected_output_cols_type="float",
|
739
804
|
)
|
805
|
+
expected_output_cols = self._align_expected_output_names(
|
806
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
807
|
+
)
|
740
808
|
|
741
809
|
elif isinstance(dataset, pd.DataFrame):
|
742
|
-
transform_kwargs = dict(
|
743
|
-
snowpark_input_cols = self._snowpark_cols,
|
744
|
-
drop_input_cols = self._drop_input_cols
|
745
|
-
)
|
810
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
746
811
|
|
747
812
|
transform_handlers = ModelTransformerBuilder.build(
|
748
813
|
dataset=dataset,
|
@@ -755,7 +820,7 @@ class NearestNeighbors(BaseTransformer):
|
|
755
820
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
756
821
|
inference_method=inference_method,
|
757
822
|
input_cols=self.input_cols,
|
758
|
-
expected_output_cols=
|
823
|
+
expected_output_cols=expected_output_cols,
|
759
824
|
**transform_kwargs
|
760
825
|
)
|
761
826
|
return output_df
|
@@ -784,12 +849,14 @@ class NearestNeighbors(BaseTransformer):
|
|
784
849
|
Output dataset with probability of the sample for each class in the model.
|
785
850
|
"""
|
786
851
|
super()._check_dataset_type(dataset)
|
787
|
-
inference_method="score_samples"
|
852
|
+
inference_method = "score_samples"
|
788
853
|
|
789
854
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
790
855
|
# are specific to the type of dataset used.
|
791
856
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
792
857
|
|
858
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
859
|
+
|
793
860
|
if isinstance(dataset, DataFrame):
|
794
861
|
self._deps = self._batch_inference_validate_snowpark(
|
795
862
|
dataset=dataset,
|
@@ -802,6 +869,9 @@ class NearestNeighbors(BaseTransformer):
|
|
802
869
|
drop_input_cols = self._drop_input_cols,
|
803
870
|
expected_output_cols_type="float",
|
804
871
|
)
|
872
|
+
expected_output_cols = self._align_expected_output_names(
|
873
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
874
|
+
)
|
805
875
|
|
806
876
|
elif isinstance(dataset, pd.DataFrame):
|
807
877
|
transform_kwargs = dict(
|
@@ -820,7 +890,7 @@ class NearestNeighbors(BaseTransformer):
|
|
820
890
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
821
891
|
inference_method=inference_method,
|
822
892
|
input_cols=self.input_cols,
|
823
|
-
expected_output_cols=
|
893
|
+
expected_output_cols=expected_output_cols,
|
824
894
|
**transform_kwargs
|
825
895
|
)
|
826
896
|
return output_df
|
@@ -967,50 +1037,84 @@ class NearestNeighbors(BaseTransformer):
|
|
967
1037
|
)
|
968
1038
|
return output_df
|
969
1039
|
|
1040
|
+
|
1041
|
+
|
1042
|
+
def to_sklearn(self) -> Any:
|
1043
|
+
"""Get sklearn.neighbors.NearestNeighbors object.
|
1044
|
+
"""
|
1045
|
+
if self._sklearn_object is None:
|
1046
|
+
self._sklearn_object = self._create_sklearn_object()
|
1047
|
+
return self._sklearn_object
|
1048
|
+
|
1049
|
+
def to_xgboost(self) -> Any:
|
1050
|
+
raise exceptions.SnowflakeMLException(
|
1051
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1052
|
+
original_exception=AttributeError(
|
1053
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1054
|
+
"to_xgboost()",
|
1055
|
+
"to_sklearn()"
|
1056
|
+
)
|
1057
|
+
),
|
1058
|
+
)
|
1059
|
+
|
1060
|
+
def to_lightgbm(self) -> Any:
|
1061
|
+
raise exceptions.SnowflakeMLException(
|
1062
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1063
|
+
original_exception=AttributeError(
|
1064
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1065
|
+
"to_lightgbm()",
|
1066
|
+
"to_sklearn()"
|
1067
|
+
)
|
1068
|
+
),
|
1069
|
+
)
|
970
1070
|
|
971
|
-
def
|
1071
|
+
def _get_dependencies(self) -> List[str]:
|
1072
|
+
return self._deps
|
1073
|
+
|
1074
|
+
|
1075
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
972
1076
|
self._model_signature_dict = dict()
|
973
1077
|
|
974
1078
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
975
1079
|
|
976
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1080
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
977
1081
|
outputs: List[BaseFeatureSpec] = []
|
978
1082
|
if hasattr(self, "predict"):
|
979
1083
|
# keep mypy happy
|
980
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1084
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
981
1085
|
# For classifier, the type of predict is the same as the type of label
|
982
|
-
if self._sklearn_object._estimator_type ==
|
983
|
-
|
1086
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1087
|
+
# label columns is the desired type for output
|
984
1088
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
985
1089
|
# rename the output columns
|
986
1090
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
987
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
988
|
-
|
989
|
-
|
1091
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1092
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1093
|
+
)
|
990
1094
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
991
1095
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
992
|
-
# Clusterer returns int64 cluster labels.
|
1096
|
+
# Clusterer returns int64 cluster labels.
|
993
1097
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
994
1098
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
995
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
996
|
-
|
997
|
-
|
998
|
-
|
1099
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1100
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1101
|
+
)
|
1102
|
+
|
999
1103
|
# For regressor, the type of predict is float64
|
1000
|
-
elif self._sklearn_object._estimator_type ==
|
1104
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1001
1105
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1002
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1003
|
-
|
1004
|
-
|
1005
|
-
|
1106
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1107
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1108
|
+
)
|
1109
|
+
|
1006
1110
|
for prob_func in PROB_FUNCTIONS:
|
1007
1111
|
if hasattr(self, prob_func):
|
1008
1112
|
output_cols_prefix: str = f"{prob_func}_"
|
1009
1113
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1010
1114
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1011
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1012
|
-
|
1013
|
-
|
1115
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1116
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1117
|
+
)
|
1014
1118
|
|
1015
1119
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1016
1120
|
items = list(self._model_signature_dict.items())
|
@@ -1023,10 +1127,10 @@ class NearestNeighbors(BaseTransformer):
|
|
1023
1127
|
"""Returns model signature of current class.
|
1024
1128
|
|
1025
1129
|
Raises:
|
1026
|
-
|
1130
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1027
1131
|
|
1028
1132
|
Returns:
|
1029
|
-
Dict
|
1133
|
+
Dict with each method and its input output signature
|
1030
1134
|
"""
|
1031
1135
|
if self._model_signature_dict is None:
|
1032
1136
|
raise exceptions.SnowflakeMLException(
|
@@ -1034,35 +1138,3 @@ class NearestNeighbors(BaseTransformer):
|
|
1034
1138
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1035
1139
|
)
|
1036
1140
|
return self._model_signature_dict
|
1037
|
-
|
1038
|
-
def to_sklearn(self) -> Any:
|
1039
|
-
"""Get sklearn.neighbors.NearestNeighbors object.
|
1040
|
-
"""
|
1041
|
-
if self._sklearn_object is None:
|
1042
|
-
self._sklearn_object = self._create_sklearn_object()
|
1043
|
-
return self._sklearn_object
|
1044
|
-
|
1045
|
-
def to_xgboost(self) -> Any:
|
1046
|
-
raise exceptions.SnowflakeMLException(
|
1047
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1048
|
-
original_exception=AttributeError(
|
1049
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1050
|
-
"to_xgboost()",
|
1051
|
-
"to_sklearn()"
|
1052
|
-
)
|
1053
|
-
),
|
1054
|
-
)
|
1055
|
-
|
1056
|
-
def to_lightgbm(self) -> Any:
|
1057
|
-
raise exceptions.SnowflakeMLException(
|
1058
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1059
|
-
original_exception=AttributeError(
|
1060
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1061
|
-
"to_lightgbm()",
|
1062
|
-
"to_sklearn()"
|
1063
|
-
)
|
1064
|
-
),
|
1065
|
-
)
|
1066
|
-
|
1067
|
-
def _get_dependencies(self) -> List[str]:
|
1068
|
-
return self._deps
|