snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -284,12 +283,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
284
283
|
)
|
285
284
|
return selected_cols
|
286
285
|
|
287
|
-
|
288
|
-
project=_PROJECT,
|
289
|
-
subproject=_SUBPROJECT,
|
290
|
-
custom_tags=dict([("autogen", True)]),
|
291
|
-
)
|
292
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "NeighborhoodComponentsAnalysis":
|
286
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "NeighborhoodComponentsAnalysis":
|
293
287
|
"""Fit the model according to the given training data
|
294
288
|
For more details on this function, see [sklearn.neighbors.NeighborhoodComponentsAnalysis.fit]
|
295
289
|
(https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NeighborhoodComponentsAnalysis.html#sklearn.neighbors.NeighborhoodComponentsAnalysis.fit)
|
@@ -316,12 +310,14 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
316
310
|
|
317
311
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
318
312
|
|
319
|
-
|
313
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
320
314
|
if SNOWML_SPROC_ENV in os.environ:
|
321
315
|
statement_params = telemetry.get_function_usage_statement_params(
|
322
316
|
project=_PROJECT,
|
323
317
|
subproject=_SUBPROJECT,
|
324
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
318
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
319
|
+
inspect.currentframe(), NeighborhoodComponentsAnalysis.__class__.__name__
|
320
|
+
),
|
325
321
|
api_calls=[Session.call],
|
326
322
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
327
323
|
)
|
@@ -342,7 +338,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
342
338
|
)
|
343
339
|
self._sklearn_object = model_trainer.train()
|
344
340
|
self._is_fitted = True
|
345
|
-
self.
|
341
|
+
self._generate_model_signatures(dataset)
|
346
342
|
return self
|
347
343
|
|
348
344
|
def _batch_inference_validate_snowpark(
|
@@ -416,7 +412,9 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
416
412
|
# when it is classifier, infer the datatype from label columns
|
417
413
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
418
414
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
419
|
-
label_cols_signatures = [
|
415
|
+
label_cols_signatures = [
|
416
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
417
|
+
]
|
420
418
|
if len(label_cols_signatures) == 0:
|
421
419
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
422
420
|
raise exceptions.SnowflakeMLException(
|
@@ -424,25 +422,22 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
424
422
|
original_exception=ValueError(error_str),
|
425
423
|
)
|
426
424
|
|
427
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
428
|
-
label_cols_signatures[0].as_snowpark_type()
|
429
|
-
)
|
425
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
430
426
|
|
431
427
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
432
|
-
assert isinstance(
|
428
|
+
assert isinstance(
|
429
|
+
dataset._session, Session
|
430
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
433
431
|
|
434
432
|
transform_kwargs = dict(
|
435
|
-
session
|
436
|
-
dependencies
|
437
|
-
drop_input_cols
|
438
|
-
expected_output_cols_type
|
433
|
+
session=dataset._session,
|
434
|
+
dependencies=self._deps,
|
435
|
+
drop_input_cols=self._drop_input_cols,
|
436
|
+
expected_output_cols_type=expected_type_inferred,
|
439
437
|
)
|
440
438
|
|
441
439
|
elif isinstance(dataset, pd.DataFrame):
|
442
|
-
transform_kwargs = dict(
|
443
|
-
snowpark_input_cols = self._snowpark_cols,
|
444
|
-
drop_input_cols = self._drop_input_cols
|
445
|
-
)
|
440
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
446
441
|
|
447
442
|
transform_handlers = ModelTransformerBuilder.build(
|
448
443
|
dataset=dataset,
|
@@ -484,7 +479,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
484
479
|
Transformed dataset.
|
485
480
|
"""
|
486
481
|
super()._check_dataset_type(dataset)
|
487
|
-
inference_method="transform"
|
482
|
+
inference_method = "transform"
|
488
483
|
|
489
484
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
490
485
|
# are specific to the type of dataset used.
|
@@ -521,17 +516,14 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
521
516
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
522
517
|
|
523
518
|
transform_kwargs = dict(
|
524
|
-
session
|
525
|
-
dependencies
|
526
|
-
drop_input_cols
|
527
|
-
expected_output_cols_type
|
519
|
+
session=dataset._session,
|
520
|
+
dependencies=self._deps,
|
521
|
+
drop_input_cols=self._drop_input_cols,
|
522
|
+
expected_output_cols_type=expected_dtype,
|
528
523
|
)
|
529
524
|
|
530
525
|
elif isinstance(dataset, pd.DataFrame):
|
531
|
-
transform_kwargs = dict(
|
532
|
-
snowpark_input_cols = self._snowpark_cols,
|
533
|
-
drop_input_cols = self._drop_input_cols
|
534
|
-
)
|
526
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
535
527
|
|
536
528
|
transform_handlers = ModelTransformerBuilder.build(
|
537
529
|
dataset=dataset,
|
@@ -550,7 +542,11 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
550
542
|
return output_df
|
551
543
|
|
552
544
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
553
|
-
def fit_predict(
|
545
|
+
def fit_predict(
|
546
|
+
self,
|
547
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
548
|
+
output_cols_prefix: str = "fit_predict_",
|
549
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
554
550
|
""" Method not supported for this class.
|
555
551
|
|
556
552
|
|
@@ -575,7 +571,9 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
575
571
|
)
|
576
572
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
577
573
|
drop_input_cols=self._drop_input_cols,
|
578
|
-
expected_output_cols_list=
|
574
|
+
expected_output_cols_list=(
|
575
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
576
|
+
),
|
579
577
|
)
|
580
578
|
self._sklearn_object = fitted_estimator
|
581
579
|
self._is_fitted = True
|
@@ -592,6 +590,62 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
592
590
|
assert self._sklearn_object is not None
|
593
591
|
return self._sklearn_object.embedding_
|
594
592
|
|
593
|
+
|
594
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
595
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
596
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
597
|
+
"""
|
598
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
599
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
600
|
+
if output_cols:
|
601
|
+
output_cols = [
|
602
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
603
|
+
for c in output_cols
|
604
|
+
]
|
605
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
606
|
+
output_cols = [output_cols_prefix]
|
607
|
+
elif self._sklearn_object is not None:
|
608
|
+
classes = self._sklearn_object.classes_
|
609
|
+
if isinstance(classes, numpy.ndarray):
|
610
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
611
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
612
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
613
|
+
output_cols = []
|
614
|
+
for i, cl in enumerate(classes):
|
615
|
+
# For binary classification, there is only one output column for each class
|
616
|
+
# ndarray as the two classes are complementary.
|
617
|
+
if len(cl) == 2:
|
618
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
619
|
+
else:
|
620
|
+
output_cols.extend([
|
621
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
622
|
+
])
|
623
|
+
else:
|
624
|
+
output_cols = []
|
625
|
+
|
626
|
+
# Make sure column names are valid snowflake identifiers.
|
627
|
+
assert output_cols is not None # Make MyPy happy
|
628
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
629
|
+
|
630
|
+
return rv
|
631
|
+
|
632
|
+
def _align_expected_output_names(
|
633
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
634
|
+
) -> List[str]:
|
635
|
+
# in case the inferred output column names dimension is different
|
636
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
637
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
638
|
+
output_df_columns = list(output_df_pd.columns)
|
639
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
640
|
+
if self.sample_weight_col:
|
641
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
642
|
+
# if the dimension of inferred output column names is correct; use it
|
643
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
644
|
+
return expected_output_cols_list
|
645
|
+
# otherwise, use the sklearn estimator's output
|
646
|
+
else:
|
647
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
648
|
+
|
595
649
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
596
650
|
@telemetry.send_api_usage_telemetry(
|
597
651
|
project=_PROJECT,
|
@@ -622,24 +676,28 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
622
676
|
# are specific to the type of dataset used.
|
623
677
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
624
678
|
|
679
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
680
|
+
|
625
681
|
if isinstance(dataset, DataFrame):
|
626
682
|
self._deps = self._batch_inference_validate_snowpark(
|
627
683
|
dataset=dataset,
|
628
684
|
inference_method=inference_method,
|
629
685
|
)
|
630
|
-
assert isinstance(
|
686
|
+
assert isinstance(
|
687
|
+
dataset._session, Session
|
688
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
631
689
|
transform_kwargs = dict(
|
632
690
|
session=dataset._session,
|
633
691
|
dependencies=self._deps,
|
634
|
-
drop_input_cols
|
692
|
+
drop_input_cols=self._drop_input_cols,
|
635
693
|
expected_output_cols_type="float",
|
636
694
|
)
|
695
|
+
expected_output_cols = self._align_expected_output_names(
|
696
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
697
|
+
)
|
637
698
|
|
638
699
|
elif isinstance(dataset, pd.DataFrame):
|
639
|
-
transform_kwargs = dict(
|
640
|
-
snowpark_input_cols = self._snowpark_cols,
|
641
|
-
drop_input_cols = self._drop_input_cols
|
642
|
-
)
|
700
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
643
701
|
|
644
702
|
transform_handlers = ModelTransformerBuilder.build(
|
645
703
|
dataset=dataset,
|
@@ -651,7 +709,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
651
709
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
652
710
|
inference_method=inference_method,
|
653
711
|
input_cols=self.input_cols,
|
654
|
-
expected_output_cols=
|
712
|
+
expected_output_cols=expected_output_cols,
|
655
713
|
**transform_kwargs
|
656
714
|
)
|
657
715
|
return output_df
|
@@ -681,7 +739,8 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
681
739
|
Output dataset with log probability of the sample for each class in the model.
|
682
740
|
"""
|
683
741
|
super()._check_dataset_type(dataset)
|
684
|
-
inference_method="predict_log_proba"
|
742
|
+
inference_method = "predict_log_proba"
|
743
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
685
744
|
|
686
745
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
687
746
|
# are specific to the type of dataset used.
|
@@ -692,18 +751,20 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
692
751
|
dataset=dataset,
|
693
752
|
inference_method=inference_method,
|
694
753
|
)
|
695
|
-
assert isinstance(
|
754
|
+
assert isinstance(
|
755
|
+
dataset._session, Session
|
756
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
696
757
|
transform_kwargs = dict(
|
697
758
|
session=dataset._session,
|
698
759
|
dependencies=self._deps,
|
699
|
-
drop_input_cols
|
760
|
+
drop_input_cols=self._drop_input_cols,
|
700
761
|
expected_output_cols_type="float",
|
701
762
|
)
|
763
|
+
expected_output_cols = self._align_expected_output_names(
|
764
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
765
|
+
)
|
702
766
|
elif isinstance(dataset, pd.DataFrame):
|
703
|
-
transform_kwargs = dict(
|
704
|
-
snowpark_input_cols = self._snowpark_cols,
|
705
|
-
drop_input_cols = self._drop_input_cols
|
706
|
-
)
|
767
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
707
768
|
|
708
769
|
transform_handlers = ModelTransformerBuilder.build(
|
709
770
|
dataset=dataset,
|
@@ -716,7 +777,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
716
777
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
717
778
|
inference_method=inference_method,
|
718
779
|
input_cols=self.input_cols,
|
719
|
-
expected_output_cols=
|
780
|
+
expected_output_cols=expected_output_cols,
|
720
781
|
**transform_kwargs
|
721
782
|
)
|
722
783
|
return output_df
|
@@ -742,30 +803,34 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
742
803
|
Output dataset with results of the decision function for the samples in input dataset.
|
743
804
|
"""
|
744
805
|
super()._check_dataset_type(dataset)
|
745
|
-
inference_method="decision_function"
|
806
|
+
inference_method = "decision_function"
|
746
807
|
|
747
808
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
748
809
|
# are specific to the type of dataset used.
|
749
810
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
750
811
|
|
812
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
813
|
+
|
751
814
|
if isinstance(dataset, DataFrame):
|
752
815
|
self._deps = self._batch_inference_validate_snowpark(
|
753
816
|
dataset=dataset,
|
754
817
|
inference_method=inference_method,
|
755
818
|
)
|
756
|
-
assert isinstance(
|
819
|
+
assert isinstance(
|
820
|
+
dataset._session, Session
|
821
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
757
822
|
transform_kwargs = dict(
|
758
823
|
session=dataset._session,
|
759
824
|
dependencies=self._deps,
|
760
|
-
drop_input_cols
|
825
|
+
drop_input_cols=self._drop_input_cols,
|
761
826
|
expected_output_cols_type="float",
|
762
827
|
)
|
828
|
+
expected_output_cols = self._align_expected_output_names(
|
829
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
830
|
+
)
|
763
831
|
|
764
832
|
elif isinstance(dataset, pd.DataFrame):
|
765
|
-
transform_kwargs = dict(
|
766
|
-
snowpark_input_cols = self._snowpark_cols,
|
767
|
-
drop_input_cols = self._drop_input_cols
|
768
|
-
)
|
833
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
769
834
|
|
770
835
|
transform_handlers = ModelTransformerBuilder.build(
|
771
836
|
dataset=dataset,
|
@@ -778,7 +843,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
778
843
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
779
844
|
inference_method=inference_method,
|
780
845
|
input_cols=self.input_cols,
|
781
|
-
expected_output_cols=
|
846
|
+
expected_output_cols=expected_output_cols,
|
782
847
|
**transform_kwargs
|
783
848
|
)
|
784
849
|
return output_df
|
@@ -807,12 +872,14 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
807
872
|
Output dataset with probability of the sample for each class in the model.
|
808
873
|
"""
|
809
874
|
super()._check_dataset_type(dataset)
|
810
|
-
inference_method="score_samples"
|
875
|
+
inference_method = "score_samples"
|
811
876
|
|
812
877
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
813
878
|
# are specific to the type of dataset used.
|
814
879
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
815
880
|
|
881
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
882
|
+
|
816
883
|
if isinstance(dataset, DataFrame):
|
817
884
|
self._deps = self._batch_inference_validate_snowpark(
|
818
885
|
dataset=dataset,
|
@@ -825,6 +892,9 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
825
892
|
drop_input_cols = self._drop_input_cols,
|
826
893
|
expected_output_cols_type="float",
|
827
894
|
)
|
895
|
+
expected_output_cols = self._align_expected_output_names(
|
896
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
897
|
+
)
|
828
898
|
|
829
899
|
elif isinstance(dataset, pd.DataFrame):
|
830
900
|
transform_kwargs = dict(
|
@@ -843,7 +913,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
843
913
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
844
914
|
inference_method=inference_method,
|
845
915
|
input_cols=self.input_cols,
|
846
|
-
expected_output_cols=
|
916
|
+
expected_output_cols=expected_output_cols,
|
847
917
|
**transform_kwargs
|
848
918
|
)
|
849
919
|
return output_df
|
@@ -988,50 +1058,84 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
988
1058
|
)
|
989
1059
|
return output_df
|
990
1060
|
|
1061
|
+
|
1062
|
+
|
1063
|
+
def to_sklearn(self) -> Any:
|
1064
|
+
"""Get sklearn.neighbors.NeighborhoodComponentsAnalysis object.
|
1065
|
+
"""
|
1066
|
+
if self._sklearn_object is None:
|
1067
|
+
self._sklearn_object = self._create_sklearn_object()
|
1068
|
+
return self._sklearn_object
|
1069
|
+
|
1070
|
+
def to_xgboost(self) -> Any:
|
1071
|
+
raise exceptions.SnowflakeMLException(
|
1072
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1073
|
+
original_exception=AttributeError(
|
1074
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1075
|
+
"to_xgboost()",
|
1076
|
+
"to_sklearn()"
|
1077
|
+
)
|
1078
|
+
),
|
1079
|
+
)
|
1080
|
+
|
1081
|
+
def to_lightgbm(self) -> Any:
|
1082
|
+
raise exceptions.SnowflakeMLException(
|
1083
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1084
|
+
original_exception=AttributeError(
|
1085
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1086
|
+
"to_lightgbm()",
|
1087
|
+
"to_sklearn()"
|
1088
|
+
)
|
1089
|
+
),
|
1090
|
+
)
|
991
1091
|
|
992
|
-
def
|
1092
|
+
def _get_dependencies(self) -> List[str]:
|
1093
|
+
return self._deps
|
1094
|
+
|
1095
|
+
|
1096
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
993
1097
|
self._model_signature_dict = dict()
|
994
1098
|
|
995
1099
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
996
1100
|
|
997
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1101
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
998
1102
|
outputs: List[BaseFeatureSpec] = []
|
999
1103
|
if hasattr(self, "predict"):
|
1000
1104
|
# keep mypy happy
|
1001
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1105
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1002
1106
|
# For classifier, the type of predict is the same as the type of label
|
1003
|
-
if self._sklearn_object._estimator_type ==
|
1004
|
-
|
1107
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1108
|
+
# label columns is the desired type for output
|
1005
1109
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1006
1110
|
# rename the output columns
|
1007
1111
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1008
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1009
|
-
|
1010
|
-
|
1112
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1113
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1114
|
+
)
|
1011
1115
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1012
1116
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1013
|
-
# Clusterer returns int64 cluster labels.
|
1117
|
+
# Clusterer returns int64 cluster labels.
|
1014
1118
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1015
1119
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1016
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1017
|
-
|
1018
|
-
|
1019
|
-
|
1120
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1121
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1122
|
+
)
|
1123
|
+
|
1020
1124
|
# For regressor, the type of predict is float64
|
1021
|
-
elif self._sklearn_object._estimator_type ==
|
1125
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1022
1126
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1023
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1127
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1128
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1129
|
+
)
|
1130
|
+
|
1027
1131
|
for prob_func in PROB_FUNCTIONS:
|
1028
1132
|
if hasattr(self, prob_func):
|
1029
1133
|
output_cols_prefix: str = f"{prob_func}_"
|
1030
1134
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1031
1135
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1032
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1033
|
-
|
1034
|
-
|
1136
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1137
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1138
|
+
)
|
1035
1139
|
|
1036
1140
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1037
1141
|
items = list(self._model_signature_dict.items())
|
@@ -1044,10 +1148,10 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
1044
1148
|
"""Returns model signature of current class.
|
1045
1149
|
|
1046
1150
|
Raises:
|
1047
|
-
|
1151
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1048
1152
|
|
1049
1153
|
Returns:
|
1050
|
-
Dict
|
1154
|
+
Dict with each method and its input output signature
|
1051
1155
|
"""
|
1052
1156
|
if self._model_signature_dict is None:
|
1053
1157
|
raise exceptions.SnowflakeMLException(
|
@@ -1055,35 +1159,3 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
1055
1159
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1056
1160
|
)
|
1057
1161
|
return self._model_signature_dict
|
1058
|
-
|
1059
|
-
def to_sklearn(self) -> Any:
|
1060
|
-
"""Get sklearn.neighbors.NeighborhoodComponentsAnalysis object.
|
1061
|
-
"""
|
1062
|
-
if self._sklearn_object is None:
|
1063
|
-
self._sklearn_object = self._create_sklearn_object()
|
1064
|
-
return self._sklearn_object
|
1065
|
-
|
1066
|
-
def to_xgboost(self) -> Any:
|
1067
|
-
raise exceptions.SnowflakeMLException(
|
1068
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1069
|
-
original_exception=AttributeError(
|
1070
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1071
|
-
"to_xgboost()",
|
1072
|
-
"to_sklearn()"
|
1073
|
-
)
|
1074
|
-
),
|
1075
|
-
)
|
1076
|
-
|
1077
|
-
def to_lightgbm(self) -> Any:
|
1078
|
-
raise exceptions.SnowflakeMLException(
|
1079
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1080
|
-
original_exception=AttributeError(
|
1081
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1082
|
-
"to_lightgbm()",
|
1083
|
-
"to_sklearn()"
|
1084
|
-
)
|
1085
|
-
),
|
1086
|
-
)
|
1087
|
-
|
1088
|
-
def _get_dependencies(self) -> List[str]:
|
1089
|
-
return self._deps
|