snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -317,12 +316,7 @@ class KernelPCA(BaseTransformer):
|
|
317
316
|
)
|
318
317
|
return selected_cols
|
319
318
|
|
320
|
-
|
321
|
-
project=_PROJECT,
|
322
|
-
subproject=_SUBPROJECT,
|
323
|
-
custom_tags=dict([("autogen", True)]),
|
324
|
-
)
|
325
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "KernelPCA":
|
319
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "KernelPCA":
|
326
320
|
"""Fit the model from data in X
|
327
321
|
For more details on this function, see [sklearn.decomposition.KernelPCA.fit]
|
328
322
|
(https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html#sklearn.decomposition.KernelPCA.fit)
|
@@ -349,12 +343,14 @@ class KernelPCA(BaseTransformer):
|
|
349
343
|
|
350
344
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
351
345
|
|
352
|
-
|
346
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
353
347
|
if SNOWML_SPROC_ENV in os.environ:
|
354
348
|
statement_params = telemetry.get_function_usage_statement_params(
|
355
349
|
project=_PROJECT,
|
356
350
|
subproject=_SUBPROJECT,
|
357
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
351
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
352
|
+
inspect.currentframe(), KernelPCA.__class__.__name__
|
353
|
+
),
|
358
354
|
api_calls=[Session.call],
|
359
355
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
360
356
|
)
|
@@ -375,7 +371,7 @@ class KernelPCA(BaseTransformer):
|
|
375
371
|
)
|
376
372
|
self._sklearn_object = model_trainer.train()
|
377
373
|
self._is_fitted = True
|
378
|
-
self.
|
374
|
+
self._generate_model_signatures(dataset)
|
379
375
|
return self
|
380
376
|
|
381
377
|
def _batch_inference_validate_snowpark(
|
@@ -449,7 +445,9 @@ class KernelPCA(BaseTransformer):
|
|
449
445
|
# when it is classifier, infer the datatype from label columns
|
450
446
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
451
447
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
452
|
-
label_cols_signatures = [
|
448
|
+
label_cols_signatures = [
|
449
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
450
|
+
]
|
453
451
|
if len(label_cols_signatures) == 0:
|
454
452
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
455
453
|
raise exceptions.SnowflakeMLException(
|
@@ -457,25 +455,22 @@ class KernelPCA(BaseTransformer):
|
|
457
455
|
original_exception=ValueError(error_str),
|
458
456
|
)
|
459
457
|
|
460
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
461
|
-
label_cols_signatures[0].as_snowpark_type()
|
462
|
-
)
|
458
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
463
459
|
|
464
460
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
465
|
-
assert isinstance(
|
461
|
+
assert isinstance(
|
462
|
+
dataset._session, Session
|
463
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
466
464
|
|
467
465
|
transform_kwargs = dict(
|
468
|
-
session
|
469
|
-
dependencies
|
470
|
-
drop_input_cols
|
471
|
-
expected_output_cols_type
|
466
|
+
session=dataset._session,
|
467
|
+
dependencies=self._deps,
|
468
|
+
drop_input_cols=self._drop_input_cols,
|
469
|
+
expected_output_cols_type=expected_type_inferred,
|
472
470
|
)
|
473
471
|
|
474
472
|
elif isinstance(dataset, pd.DataFrame):
|
475
|
-
transform_kwargs = dict(
|
476
|
-
snowpark_input_cols = self._snowpark_cols,
|
477
|
-
drop_input_cols = self._drop_input_cols
|
478
|
-
)
|
473
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
479
474
|
|
480
475
|
transform_handlers = ModelTransformerBuilder.build(
|
481
476
|
dataset=dataset,
|
@@ -517,7 +512,7 @@ class KernelPCA(BaseTransformer):
|
|
517
512
|
Transformed dataset.
|
518
513
|
"""
|
519
514
|
super()._check_dataset_type(dataset)
|
520
|
-
inference_method="transform"
|
515
|
+
inference_method = "transform"
|
521
516
|
|
522
517
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
523
518
|
# are specific to the type of dataset used.
|
@@ -554,17 +549,14 @@ class KernelPCA(BaseTransformer):
|
|
554
549
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
555
550
|
|
556
551
|
transform_kwargs = dict(
|
557
|
-
session
|
558
|
-
dependencies
|
559
|
-
drop_input_cols
|
560
|
-
expected_output_cols_type
|
552
|
+
session=dataset._session,
|
553
|
+
dependencies=self._deps,
|
554
|
+
drop_input_cols=self._drop_input_cols,
|
555
|
+
expected_output_cols_type=expected_dtype,
|
561
556
|
)
|
562
557
|
|
563
558
|
elif isinstance(dataset, pd.DataFrame):
|
564
|
-
transform_kwargs = dict(
|
565
|
-
snowpark_input_cols = self._snowpark_cols,
|
566
|
-
drop_input_cols = self._drop_input_cols
|
567
|
-
)
|
559
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
568
560
|
|
569
561
|
transform_handlers = ModelTransformerBuilder.build(
|
570
562
|
dataset=dataset,
|
@@ -583,7 +575,11 @@ class KernelPCA(BaseTransformer):
|
|
583
575
|
return output_df
|
584
576
|
|
585
577
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
586
|
-
def fit_predict(
|
578
|
+
def fit_predict(
|
579
|
+
self,
|
580
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
581
|
+
output_cols_prefix: str = "fit_predict_",
|
582
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
587
583
|
""" Method not supported for this class.
|
588
584
|
|
589
585
|
|
@@ -608,7 +604,9 @@ class KernelPCA(BaseTransformer):
|
|
608
604
|
)
|
609
605
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
610
606
|
drop_input_cols=self._drop_input_cols,
|
611
|
-
expected_output_cols_list=
|
607
|
+
expected_output_cols_list=(
|
608
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
609
|
+
),
|
612
610
|
)
|
613
611
|
self._sklearn_object = fitted_estimator
|
614
612
|
self._is_fitted = True
|
@@ -625,6 +623,62 @@ class KernelPCA(BaseTransformer):
|
|
625
623
|
assert self._sklearn_object is not None
|
626
624
|
return self._sklearn_object.embedding_
|
627
625
|
|
626
|
+
|
627
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
628
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
629
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
630
|
+
"""
|
631
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
632
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
633
|
+
if output_cols:
|
634
|
+
output_cols = [
|
635
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
636
|
+
for c in output_cols
|
637
|
+
]
|
638
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
639
|
+
output_cols = [output_cols_prefix]
|
640
|
+
elif self._sklearn_object is not None:
|
641
|
+
classes = self._sklearn_object.classes_
|
642
|
+
if isinstance(classes, numpy.ndarray):
|
643
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
644
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
645
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
646
|
+
output_cols = []
|
647
|
+
for i, cl in enumerate(classes):
|
648
|
+
# For binary classification, there is only one output column for each class
|
649
|
+
# ndarray as the two classes are complementary.
|
650
|
+
if len(cl) == 2:
|
651
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
652
|
+
else:
|
653
|
+
output_cols.extend([
|
654
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
655
|
+
])
|
656
|
+
else:
|
657
|
+
output_cols = []
|
658
|
+
|
659
|
+
# Make sure column names are valid snowflake identifiers.
|
660
|
+
assert output_cols is not None # Make MyPy happy
|
661
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
662
|
+
|
663
|
+
return rv
|
664
|
+
|
665
|
+
def _align_expected_output_names(
|
666
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
667
|
+
) -> List[str]:
|
668
|
+
# in case the inferred output column names dimension is different
|
669
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
670
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
671
|
+
output_df_columns = list(output_df_pd.columns)
|
672
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
673
|
+
if self.sample_weight_col:
|
674
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
675
|
+
# if the dimension of inferred output column names is correct; use it
|
676
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
677
|
+
return expected_output_cols_list
|
678
|
+
# otherwise, use the sklearn estimator's output
|
679
|
+
else:
|
680
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
681
|
+
|
628
682
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
629
683
|
@telemetry.send_api_usage_telemetry(
|
630
684
|
project=_PROJECT,
|
@@ -655,24 +709,28 @@ class KernelPCA(BaseTransformer):
|
|
655
709
|
# are specific to the type of dataset used.
|
656
710
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
657
711
|
|
712
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
713
|
+
|
658
714
|
if isinstance(dataset, DataFrame):
|
659
715
|
self._deps = self._batch_inference_validate_snowpark(
|
660
716
|
dataset=dataset,
|
661
717
|
inference_method=inference_method,
|
662
718
|
)
|
663
|
-
assert isinstance(
|
719
|
+
assert isinstance(
|
720
|
+
dataset._session, Session
|
721
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
664
722
|
transform_kwargs = dict(
|
665
723
|
session=dataset._session,
|
666
724
|
dependencies=self._deps,
|
667
|
-
drop_input_cols
|
725
|
+
drop_input_cols=self._drop_input_cols,
|
668
726
|
expected_output_cols_type="float",
|
669
727
|
)
|
728
|
+
expected_output_cols = self._align_expected_output_names(
|
729
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
730
|
+
)
|
670
731
|
|
671
732
|
elif isinstance(dataset, pd.DataFrame):
|
672
|
-
transform_kwargs = dict(
|
673
|
-
snowpark_input_cols = self._snowpark_cols,
|
674
|
-
drop_input_cols = self._drop_input_cols
|
675
|
-
)
|
733
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
676
734
|
|
677
735
|
transform_handlers = ModelTransformerBuilder.build(
|
678
736
|
dataset=dataset,
|
@@ -684,7 +742,7 @@ class KernelPCA(BaseTransformer):
|
|
684
742
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
685
743
|
inference_method=inference_method,
|
686
744
|
input_cols=self.input_cols,
|
687
|
-
expected_output_cols=
|
745
|
+
expected_output_cols=expected_output_cols,
|
688
746
|
**transform_kwargs
|
689
747
|
)
|
690
748
|
return output_df
|
@@ -714,7 +772,8 @@ class KernelPCA(BaseTransformer):
|
|
714
772
|
Output dataset with log probability of the sample for each class in the model.
|
715
773
|
"""
|
716
774
|
super()._check_dataset_type(dataset)
|
717
|
-
inference_method="predict_log_proba"
|
775
|
+
inference_method = "predict_log_proba"
|
776
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
718
777
|
|
719
778
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
720
779
|
# are specific to the type of dataset used.
|
@@ -725,18 +784,20 @@ class KernelPCA(BaseTransformer):
|
|
725
784
|
dataset=dataset,
|
726
785
|
inference_method=inference_method,
|
727
786
|
)
|
728
|
-
assert isinstance(
|
787
|
+
assert isinstance(
|
788
|
+
dataset._session, Session
|
789
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
729
790
|
transform_kwargs = dict(
|
730
791
|
session=dataset._session,
|
731
792
|
dependencies=self._deps,
|
732
|
-
drop_input_cols
|
793
|
+
drop_input_cols=self._drop_input_cols,
|
733
794
|
expected_output_cols_type="float",
|
734
795
|
)
|
796
|
+
expected_output_cols = self._align_expected_output_names(
|
797
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
798
|
+
)
|
735
799
|
elif isinstance(dataset, pd.DataFrame):
|
736
|
-
transform_kwargs = dict(
|
737
|
-
snowpark_input_cols = self._snowpark_cols,
|
738
|
-
drop_input_cols = self._drop_input_cols
|
739
|
-
)
|
800
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
740
801
|
|
741
802
|
transform_handlers = ModelTransformerBuilder.build(
|
742
803
|
dataset=dataset,
|
@@ -749,7 +810,7 @@ class KernelPCA(BaseTransformer):
|
|
749
810
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
750
811
|
inference_method=inference_method,
|
751
812
|
input_cols=self.input_cols,
|
752
|
-
expected_output_cols=
|
813
|
+
expected_output_cols=expected_output_cols,
|
753
814
|
**transform_kwargs
|
754
815
|
)
|
755
816
|
return output_df
|
@@ -775,30 +836,34 @@ class KernelPCA(BaseTransformer):
|
|
775
836
|
Output dataset with results of the decision function for the samples in input dataset.
|
776
837
|
"""
|
777
838
|
super()._check_dataset_type(dataset)
|
778
|
-
inference_method="decision_function"
|
839
|
+
inference_method = "decision_function"
|
779
840
|
|
780
841
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
781
842
|
# are specific to the type of dataset used.
|
782
843
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
783
844
|
|
845
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
846
|
+
|
784
847
|
if isinstance(dataset, DataFrame):
|
785
848
|
self._deps = self._batch_inference_validate_snowpark(
|
786
849
|
dataset=dataset,
|
787
850
|
inference_method=inference_method,
|
788
851
|
)
|
789
|
-
assert isinstance(
|
852
|
+
assert isinstance(
|
853
|
+
dataset._session, Session
|
854
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
790
855
|
transform_kwargs = dict(
|
791
856
|
session=dataset._session,
|
792
857
|
dependencies=self._deps,
|
793
|
-
drop_input_cols
|
858
|
+
drop_input_cols=self._drop_input_cols,
|
794
859
|
expected_output_cols_type="float",
|
795
860
|
)
|
861
|
+
expected_output_cols = self._align_expected_output_names(
|
862
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
863
|
+
)
|
796
864
|
|
797
865
|
elif isinstance(dataset, pd.DataFrame):
|
798
|
-
transform_kwargs = dict(
|
799
|
-
snowpark_input_cols = self._snowpark_cols,
|
800
|
-
drop_input_cols = self._drop_input_cols
|
801
|
-
)
|
866
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
802
867
|
|
803
868
|
transform_handlers = ModelTransformerBuilder.build(
|
804
869
|
dataset=dataset,
|
@@ -811,7 +876,7 @@ class KernelPCA(BaseTransformer):
|
|
811
876
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
812
877
|
inference_method=inference_method,
|
813
878
|
input_cols=self.input_cols,
|
814
|
-
expected_output_cols=
|
879
|
+
expected_output_cols=expected_output_cols,
|
815
880
|
**transform_kwargs
|
816
881
|
)
|
817
882
|
return output_df
|
@@ -840,12 +905,14 @@ class KernelPCA(BaseTransformer):
|
|
840
905
|
Output dataset with probability of the sample for each class in the model.
|
841
906
|
"""
|
842
907
|
super()._check_dataset_type(dataset)
|
843
|
-
inference_method="score_samples"
|
908
|
+
inference_method = "score_samples"
|
844
909
|
|
845
910
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
846
911
|
# are specific to the type of dataset used.
|
847
912
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
848
913
|
|
914
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
915
|
+
|
849
916
|
if isinstance(dataset, DataFrame):
|
850
917
|
self._deps = self._batch_inference_validate_snowpark(
|
851
918
|
dataset=dataset,
|
@@ -858,6 +925,9 @@ class KernelPCA(BaseTransformer):
|
|
858
925
|
drop_input_cols = self._drop_input_cols,
|
859
926
|
expected_output_cols_type="float",
|
860
927
|
)
|
928
|
+
expected_output_cols = self._align_expected_output_names(
|
929
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
930
|
+
)
|
861
931
|
|
862
932
|
elif isinstance(dataset, pd.DataFrame):
|
863
933
|
transform_kwargs = dict(
|
@@ -876,7 +946,7 @@ class KernelPCA(BaseTransformer):
|
|
876
946
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
877
947
|
inference_method=inference_method,
|
878
948
|
input_cols=self.input_cols,
|
879
|
-
expected_output_cols=
|
949
|
+
expected_output_cols=expected_output_cols,
|
880
950
|
**transform_kwargs
|
881
951
|
)
|
882
952
|
return output_df
|
@@ -1021,50 +1091,84 @@ class KernelPCA(BaseTransformer):
|
|
1021
1091
|
)
|
1022
1092
|
return output_df
|
1023
1093
|
|
1094
|
+
|
1095
|
+
|
1096
|
+
def to_sklearn(self) -> Any:
|
1097
|
+
"""Get sklearn.decomposition.KernelPCA object.
|
1098
|
+
"""
|
1099
|
+
if self._sklearn_object is None:
|
1100
|
+
self._sklearn_object = self._create_sklearn_object()
|
1101
|
+
return self._sklearn_object
|
1102
|
+
|
1103
|
+
def to_xgboost(self) -> Any:
|
1104
|
+
raise exceptions.SnowflakeMLException(
|
1105
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1106
|
+
original_exception=AttributeError(
|
1107
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1108
|
+
"to_xgboost()",
|
1109
|
+
"to_sklearn()"
|
1110
|
+
)
|
1111
|
+
),
|
1112
|
+
)
|
1113
|
+
|
1114
|
+
def to_lightgbm(self) -> Any:
|
1115
|
+
raise exceptions.SnowflakeMLException(
|
1116
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1117
|
+
original_exception=AttributeError(
|
1118
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1119
|
+
"to_lightgbm()",
|
1120
|
+
"to_sklearn()"
|
1121
|
+
)
|
1122
|
+
),
|
1123
|
+
)
|
1024
1124
|
|
1025
|
-
def
|
1125
|
+
def _get_dependencies(self) -> List[str]:
|
1126
|
+
return self._deps
|
1127
|
+
|
1128
|
+
|
1129
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
1026
1130
|
self._model_signature_dict = dict()
|
1027
1131
|
|
1028
1132
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1029
1133
|
|
1030
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1134
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1031
1135
|
outputs: List[BaseFeatureSpec] = []
|
1032
1136
|
if hasattr(self, "predict"):
|
1033
1137
|
# keep mypy happy
|
1034
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1138
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1035
1139
|
# For classifier, the type of predict is the same as the type of label
|
1036
|
-
if self._sklearn_object._estimator_type ==
|
1037
|
-
|
1140
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1141
|
+
# label columns is the desired type for output
|
1038
1142
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1039
1143
|
# rename the output columns
|
1040
1144
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1041
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1042
|
-
|
1043
|
-
|
1145
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1146
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1147
|
+
)
|
1044
1148
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1045
1149
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1046
|
-
# Clusterer returns int64 cluster labels.
|
1150
|
+
# Clusterer returns int64 cluster labels.
|
1047
1151
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1048
1152
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1049
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1050
|
-
|
1051
|
-
|
1052
|
-
|
1153
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1154
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1155
|
+
)
|
1156
|
+
|
1053
1157
|
# For regressor, the type of predict is float64
|
1054
|
-
elif self._sklearn_object._estimator_type ==
|
1158
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1055
1159
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1056
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1057
|
-
|
1058
|
-
|
1059
|
-
|
1160
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1161
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1162
|
+
)
|
1163
|
+
|
1060
1164
|
for prob_func in PROB_FUNCTIONS:
|
1061
1165
|
if hasattr(self, prob_func):
|
1062
1166
|
output_cols_prefix: str = f"{prob_func}_"
|
1063
1167
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1064
1168
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1065
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1066
|
-
|
1067
|
-
|
1169
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1170
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1171
|
+
)
|
1068
1172
|
|
1069
1173
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1070
1174
|
items = list(self._model_signature_dict.items())
|
@@ -1077,10 +1181,10 @@ class KernelPCA(BaseTransformer):
|
|
1077
1181
|
"""Returns model signature of current class.
|
1078
1182
|
|
1079
1183
|
Raises:
|
1080
|
-
|
1184
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1081
1185
|
|
1082
1186
|
Returns:
|
1083
|
-
Dict
|
1187
|
+
Dict with each method and its input output signature
|
1084
1188
|
"""
|
1085
1189
|
if self._model_signature_dict is None:
|
1086
1190
|
raise exceptions.SnowflakeMLException(
|
@@ -1088,35 +1192,3 @@ class KernelPCA(BaseTransformer):
|
|
1088
1192
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1089
1193
|
)
|
1090
1194
|
return self._model_signature_dict
|
1091
|
-
|
1092
|
-
def to_sklearn(self) -> Any:
|
1093
|
-
"""Get sklearn.decomposition.KernelPCA object.
|
1094
|
-
"""
|
1095
|
-
if self._sklearn_object is None:
|
1096
|
-
self._sklearn_object = self._create_sklearn_object()
|
1097
|
-
return self._sklearn_object
|
1098
|
-
|
1099
|
-
def to_xgboost(self) -> Any:
|
1100
|
-
raise exceptions.SnowflakeMLException(
|
1101
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1102
|
-
original_exception=AttributeError(
|
1103
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1104
|
-
"to_xgboost()",
|
1105
|
-
"to_sklearn()"
|
1106
|
-
)
|
1107
|
-
),
|
1108
|
-
)
|
1109
|
-
|
1110
|
-
def to_lightgbm(self) -> Any:
|
1111
|
-
raise exceptions.SnowflakeMLException(
|
1112
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1113
|
-
original_exception=AttributeError(
|
1114
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1115
|
-
"to_lightgbm()",
|
1116
|
-
"to_sklearn()"
|
1117
|
-
)
|
1118
|
-
),
|
1119
|
-
)
|
1120
|
-
|
1121
|
-
def _get_dependencies(self) -> List[str]:
|
1122
|
-
return self._deps
|