snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +77 -32
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/dataset/__init__.py +10 -0
- snowflake/ml/dataset/dataset.py +454 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +202 -0
- snowflake/ml/feature_store/feature_store.py +531 -332
- snowflake/ml/feature_store/feature_view.py +40 -23
- snowflake/ml/fileset/embedded_stage_fs.py +146 -0
- snowflake/ml/fileset/sfcfs.py +56 -54
- snowflake/ml/fileset/snowfs.py +159 -0
- snowflake/ml/fileset/stage_fs.py +49 -17
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +27 -0
- snowflake/ml/model/_client/model/model_version_impl.py +137 -50
- snowflake/ml/model/_client/ops/model_ops.py +159 -40
- snowflake/ml/model/_client/sql/model.py +25 -2
- snowflake/ml/model/_client/sql/model_version.py +131 -2
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -5
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
- snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
- snowflake/ml/modeling/cluster/birch.py +248 -175
- snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
- snowflake/ml/modeling/cluster/dbscan.py +246 -175
- snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
- snowflake/ml/modeling/cluster/k_means.py +248 -175
- snowflake/ml/modeling/cluster/mean_shift.py +246 -175
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
- snowflake/ml/modeling/cluster/optics.py +246 -175
- snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
- snowflake/ml/modeling/compose/column_transformer.py +248 -175
- snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
- snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
- snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
- snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
- snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
- snowflake/ml/modeling/covariance/oas.py +246 -175
- snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
- snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
- snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
- snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
- snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/pca.py +248 -175
- snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
- snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
- snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
- snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
- snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
- snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +72 -37
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
- snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
- snowflake/ml/modeling/impute/knn_imputer.py +248 -175
- snowflake/ml/modeling/impute/missing_indicator.py +248 -175
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/lars.py +246 -175
- snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
- snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/perceptron.py +246 -175
- snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ridge.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
- snowflake/ml/modeling/manifold/isomap.py +248 -175
- snowflake/ml/modeling/manifold/mds.py +248 -175
- snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
- snowflake/ml/modeling/manifold/tsne.py +248 -175
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
- snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
- snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
- snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
- snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
- snowflake/ml/modeling/pipeline/pipeline.py +517 -35
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
- snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
- snowflake/ml/modeling/svm/linear_svc.py +246 -175
- snowflake/ml/modeling/svm/linear_svr.py +246 -175
- snowflake/ml/modeling/svm/nu_svc.py +246 -175
- snowflake/ml/modeling/svm/nu_svr.py +246 -175
- snowflake/ml/modeling/svm/svc.py +246 -175
- snowflake/ml/modeling/svm/svr.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
- snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
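The remainder of this diff reproduces one representative file, snowflake/ml/modeling/decomposition/kernel_pca.py (listed above at +248 -175); the same autogenerated refactor applies across the modeling classes with matching counts. For orientation, here is a minimal usage sketch of the wrapper whose internals change below. The sketch is not part of the diff, and the data and column names are illustrative assumptions:

    # Hedged sketch: driving the autogenerated KernelPCA wrapper on local pandas data.
    import pandas as pd
    from snowflake.ml.modeling.decomposition import KernelPCA

    df = pd.DataFrame({
        "F1": [0.1, 0.9, 0.4, 0.7],
        "F2": [1.0, 0.2, 0.5, 0.3],
        "F3": [0.6, 0.8, 0.1, 0.9],
    })
    kpca = KernelPCA(
        n_components=2,
        kernel="rbf",
        input_cols=["F1", "F2", "F3"],
        output_cols=["PC1", "PC2"],
    )
    kpca.fit(df)              # fit() is assumed to dispatch to the renamed _fit() below
    out = kpca.transform(df)  # the batch-inference path reworked in this diff
    print(out.columns)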
snowflake/ml/modeling/decomposition/kernel_pca.py

@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
     BatchInferenceKwargsTypedDict,
     ScoreKwargsTypedDict
 )
+from snowflake.ml.model._signatures import utils as model_signature_utils
+from snowflake.ml.model.model_signature import (
+    BaseFeatureSpec,
+    DataType,
+    FeatureSpec,
+    ModelSignature,
+    _infer_signature,
+    _rename_signature_with_snowflake_identifiers,
+)
 
 from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
 
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     validate_sklearn_args,
 )
 
-from snowflake.ml.model.model_signature import (
-    DataType,
-    FeatureSpec,
-    ModelSignature,
-    _infer_signature,
-    _rename_signature_with_snowflake_identifiers,
-    BaseFeatureSpec,
-)
-from snowflake.ml.model._signatures import utils as model_signature_utils
-
 _PROJECT = "ModelDevelopment"
 # Derive subproject from module name by removing "sklearn"
 # and converting module name from underscore to CamelCase
@@ -61,12 +60,6 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.decomposition".replace("sklearn.", "").split("_")])
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
-def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
-    def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
-        return False and callable(getattr(self._sklearn_object, "fit_transform", None))
-    return check
-
-
 class KernelPCA(BaseTransformer):
     r"""Kernel Principal component analysis (KPCA) [1]_
     For more details on this class, see [sklearn.decomposition.KernelPCA]
@@ -317,12 +310,7 @@ class KernelPCA(BaseTransformer):
         )
         return selected_cols
 
-    @telemetry.send_api_usage_telemetry(
-        project=_PROJECT,
-        subproject=_SUBPROJECT,
-        custom_tags=dict([("autogen", True)]),
-    )
-    def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "KernelPCA":
+    def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "KernelPCA":
         """Fit the model from data in X
         For more details on this function, see [sklearn.decomposition.KernelPCA.fit]
         (https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html#sklearn.decomposition.KernelPCA.fit)
@@ -349,12 +337,14 @@ class KernelPCA(BaseTransformer):
 
         self._snowpark_cols = dataset.select(self.input_cols).columns
 
-
+        # If we are already in a stored procedure, no need to kick off another one.
         if SNOWML_SPROC_ENV in os.environ:
            statement_params = telemetry.get_function_usage_statement_params(
                project=_PROJECT,
                subproject=_SUBPROJECT,
-                function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), KernelPCA.__class__.__name__),
+                function_name=telemetry.get_statement_params_full_func_name(
+                    inspect.currentframe(), KernelPCA.__class__.__name__
+                ),
                api_calls=[Session.call],
                custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
            )
@@ -375,27 +365,24 @@ class KernelPCA(BaseTransformer):
         )
         self._sklearn_object = model_trainer.train()
         self._is_fitted = True
-        self._get_model_signatures(dataset)
+        self._generate_model_signatures(dataset)
         return self
 
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
-    ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe
-        return the available package that exists in the snowflake anaconda channel
+    ) -> None:
+        """Util method to run validate that batch inference can be run on a snowpark dataframe.
 
        Args:
            dataset: snowpark dataframe
            inference_method: the inference method such as predict, score...
-
+
        Raises:
            SnowflakeMLException: If the estimator is not fitted, raise error
            SnowflakeMLException: If the session is None, raise error
 
-        Returns:
-            A list of available package that exists in the snowflake anaconda channel
        """
        if not self._is_fitted:
            raise exceptions.SnowflakeMLException(
@@ -413,9 +400,7 @@ class KernelPCA(BaseTransformer):
                    "Session must not specified for snowpark dataset."
                ),
            )
-
-        return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
-            pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
+
 
    @available_if(original_estimator_has_callable("predict")) # type: ignore[misc]
    @telemetry.send_api_usage_telemetry(
@@ -449,7 +434,9 @@ class KernelPCA(BaseTransformer):
            # when it is classifier, infer the datatype from label columns
            if expected_type_inferred == "" and 'predict' in self.model_signatures:
                # Batch inference takes a single expected output column type. Use the first columns type for now.
-                label_cols_signatures = [row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols]
+                label_cols_signatures = [
+                    row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
+                ]
                if len(label_cols_signatures) == 0:
                    error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
                    raise exceptions.SnowflakeMLException(
@@ -457,25 +444,23 @@ class KernelPCA(BaseTransformer):
                        original_exception=ValueError(error_str),
                    )
 
-                expected_type_inferred = convert_sp_to_sf_type(
-                    label_cols_signatures[0].as_snowpark_type()
-                )
+                expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
 
-            self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
-
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
 
            transform_kwargs = dict(
-                session = dataset._session,
-                dependencies = self._deps,
-                drop_input_cols = self._drop_input_cols,
-                expected_output_cols_type = expected_type_inferred,
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_type_inferred,
            )
 
        elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
        transform_handlers = ModelTransformerBuilder.build(
            dataset=dataset,
@@ -517,7 +502,7 @@ class KernelPCA(BaseTransformer):
            Transformed dataset.
        """
        super()._check_dataset_type(dataset)
-        inference_method="transform"
+        inference_method = "transform"
 
        # This dictionary contains optional kwargs for batch inference. These kwargs
        # are specific to the type of dataset used.
@@ -547,24 +532,19 @@ class KernelPCA(BaseTransformer):
            if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
                expected_dtype = convert_sp_to_sf_type(output_types[0])
 
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method=inference_method,
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
 
            transform_kwargs = dict(
-                session = dataset._session,
-                dependencies = self._deps,
-                drop_input_cols = self._drop_input_cols,
-                expected_output_cols_type = expected_dtype,
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_dtype,
            )
 
        elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
        transform_handlers = ModelTransformerBuilder.build(
            dataset=dataset,
@@ -583,7 +563,11 @@ class KernelPCA(BaseTransformer):
        return output_df
 
    @available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
-    def fit_predict(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_predict_",) -> Union[DataFrame, pd.DataFrame]:
+    def fit_predict(
+        self,
+        dataset: Union[DataFrame, pd.DataFrame],
+        output_cols_prefix: str = "fit_predict_",
+    ) -> Union[DataFrame, pd.DataFrame]:
        """ Method not supported for this class.
 
 
@@ -608,22 +592,106 @@ class KernelPCA(BaseTransformer):
        )
        output_result, fitted_estimator = model_trainer.train_fit_predict(
            drop_input_cols=self._drop_input_cols,
-            expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
+            expected_output_cols_list=(
+                self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
+            ),
        )
        self._sklearn_object = fitted_estimator
        self._is_fitted = True
        return output_result
 
+
+    @available_if(original_estimator_has_callable("fit_transform")) # type: ignore[misc]
+    def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_transform_",) -> Union[DataFrame, pd.DataFrame]:
+        """ Fit the model from data in X and transform X
+        For more details on this function, see [sklearn.decomposition.KernelPCA.fit_transform]
+        (https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html#sklearn.decomposition.KernelPCA.fit_transform)
+
+
+        Raises:
+            TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame.
 
-    @available_if(_is_fit_transform_method_enabled())  # type: ignore[misc]
-    def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[Any, npt.NDArray[Any]]:
-        """ Method not supported for this class.
+        Args:
+            dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
+                Snowpark or Pandas DataFrame.
+            output_cols_prefix: Prefix for the response columns
        Returns:
            Transformed dataset.
        """
-        self.fit(dataset)
-        assert self._sklearn_object is not None
-        return self._sklearn_object.embedding_
+        self._infer_input_output_cols(dataset)
+        super()._check_dataset_type(dataset)
+        model_trainer = ModelTrainerBuilder.build_fit_transform(
+            estimator=self._sklearn_object,
+            dataset=dataset,
+            input_cols=self.input_cols,
+            label_cols=self.label_cols,
+            sample_weight_col=self.sample_weight_col,
+            autogenerated=self._autogenerated,
+            subproject=_SUBPROJECT,
+        )
+        output_result, fitted_estimator = model_trainer.train_fit_transform(
+            drop_input_cols=self._drop_input_cols,
+            expected_output_cols_list=self.output_cols,
+        )
+        self._sklearn_object = fitted_estimator
+        self._is_fitted = True
+        return output_result
+
+
+    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
+        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
+        """
+        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
+        # The following condition is introduced for kneighbors methods, and not used in other methods
+        if output_cols:
+            output_cols = [
+                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
+                for c in output_cols
+            ]
+        elif getattr(self._sklearn_object, "classes_", None) is None:
+            output_cols = [output_cols_prefix]
+        elif self._sklearn_object is not None:
+            classes = self._sklearn_object.classes_
+            if isinstance(classes, numpy.ndarray):
+                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
+            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
+                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
+                output_cols = []
+                for i, cl in enumerate(classes):
+                    # For binary classification, there is only one output column for each class
+                    # ndarray as the two classes are complementary.
+                    if len(cl) == 2:
+                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
+                    else:
+                        output_cols.extend([
+                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
+                        ])
+        else:
+            output_cols = []
+
+        # Make sure column names are valid snowflake identifiers.
+        assert output_cols is not None # Make MyPy happy
+        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
+
+        return rv
+
+    def _align_expected_output_names(
+        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
+    ) -> List[str]:
+        # in case the inferred output column names dimension is different
+        # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
+        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        output_df_columns = list(output_df_pd.columns)
+        output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
+        if self.sample_weight_col:
+            output_df_columns_set -= set(self.sample_weight_col)
+        # if the dimension of inferred output column names is correct; use it
+        if len(expected_output_cols_list) == len(output_df_columns_set):
+            return expected_output_cols_list
+        # otherwise, use the sklearn estimator's output
+        else:
+            return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
 
    @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
    @telemetry.send_api_usage_telemetry(
|
|
655
723
|
# are specific to the type of dataset used.
|
656
724
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
657
725
|
|
726
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
727
|
+
|
658
728
|
if isinstance(dataset, DataFrame):
|
659
|
-
self.
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
729
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
730
|
+
self._deps = self._get_dependencies()
|
731
|
+
assert isinstance(
|
732
|
+
dataset._session, Session
|
733
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
664
734
|
transform_kwargs = dict(
|
665
735
|
session=dataset._session,
|
666
736
|
dependencies=self._deps,
|
667
|
-
drop_input_cols
|
737
|
+
drop_input_cols=self._drop_input_cols,
|
668
738
|
expected_output_cols_type="float",
|
669
739
|
)
|
740
|
+
expected_output_cols = self._align_expected_output_names(
|
741
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
742
|
+
)
|
670
743
|
|
671
744
|
elif isinstance(dataset, pd.DataFrame):
|
672
|
-
transform_kwargs = dict(
|
673
|
-
snowpark_input_cols = self._snowpark_cols,
|
674
|
-
drop_input_cols = self._drop_input_cols
|
675
|
-
)
|
745
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
676
746
|
|
677
747
|
transform_handlers = ModelTransformerBuilder.build(
|
678
748
|
dataset=dataset,
|
@@ -684,7 +754,7 @@ class KernelPCA(BaseTransformer):
|
|
684
754
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
685
755
|
inference_method=inference_method,
|
686
756
|
input_cols=self.input_cols,
|
687
|
-
expected_output_cols=
|
757
|
+
expected_output_cols=expected_output_cols,
|
688
758
|
**transform_kwargs
|
689
759
|
)
|
690
760
|
return output_df
|
@@ -714,29 +784,30 @@ class KernelPCA(BaseTransformer):
|
|
714
784
|
Output dataset with log probability of the sample for each class in the model.
|
715
785
|
"""
|
716
786
|
super()._check_dataset_type(dataset)
|
717
|
-
inference_method="predict_log_proba"
|
787
|
+
inference_method = "predict_log_proba"
|
788
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
718
789
|
|
719
790
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
720
791
|
# are specific to the type of dataset used.
|
721
792
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
722
793
|
|
723
794
|
if isinstance(dataset, DataFrame):
|
724
|
-
self.
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
795
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
796
|
+
self._deps = self._get_dependencies()
|
797
|
+
assert isinstance(
|
798
|
+
dataset._session, Session
|
799
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
729
800
|
transform_kwargs = dict(
|
730
801
|
session=dataset._session,
|
731
802
|
dependencies=self._deps,
|
732
|
-
drop_input_cols
|
803
|
+
drop_input_cols=self._drop_input_cols,
|
733
804
|
expected_output_cols_type="float",
|
734
805
|
)
|
806
|
+
expected_output_cols = self._align_expected_output_names(
|
807
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
808
|
+
)
|
735
809
|
elif isinstance(dataset, pd.DataFrame):
|
736
|
-
transform_kwargs = dict(
|
737
|
-
snowpark_input_cols = self._snowpark_cols,
|
738
|
-
drop_input_cols = self._drop_input_cols
|
739
|
-
)
|
810
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
740
811
|
|
741
812
|
transform_handlers = ModelTransformerBuilder.build(
|
742
813
|
dataset=dataset,
|
@@ -749,7 +820,7 @@ class KernelPCA(BaseTransformer):
|
|
749
820
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
750
821
|
inference_method=inference_method,
|
751
822
|
input_cols=self.input_cols,
|
752
|
-
expected_output_cols=
|
823
|
+
expected_output_cols=expected_output_cols,
|
753
824
|
**transform_kwargs
|
754
825
|
)
|
755
826
|
return output_df
|
@@ -775,30 +846,32 @@ class KernelPCA(BaseTransformer):
|
|
775
846
|
Output dataset with results of the decision function for the samples in input dataset.
|
776
847
|
"""
|
777
848
|
super()._check_dataset_type(dataset)
|
778
|
-
inference_method="decision_function"
|
849
|
+
inference_method = "decision_function"
|
779
850
|
|
780
851
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
781
852
|
# are specific to the type of dataset used.
|
782
853
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
783
854
|
|
855
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
856
|
+
|
784
857
|
if isinstance(dataset, DataFrame):
|
785
|
-
self.
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
858
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
859
|
+
self._deps = self._get_dependencies()
|
860
|
+
assert isinstance(
|
861
|
+
dataset._session, Session
|
862
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
790
863
|
transform_kwargs = dict(
|
791
864
|
session=dataset._session,
|
792
865
|
dependencies=self._deps,
|
793
|
-
drop_input_cols
|
866
|
+
drop_input_cols=self._drop_input_cols,
|
794
867
|
expected_output_cols_type="float",
|
795
868
|
)
|
869
|
+
expected_output_cols = self._align_expected_output_names(
|
870
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
871
|
+
)
|
796
872
|
|
797
873
|
elif isinstance(dataset, pd.DataFrame):
|
798
|
-
transform_kwargs = dict(
|
799
|
-
snowpark_input_cols = self._snowpark_cols,
|
800
|
-
drop_input_cols = self._drop_input_cols
|
801
|
-
)
|
874
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
802
875
|
|
803
876
|
transform_handlers = ModelTransformerBuilder.build(
|
804
877
|
dataset=dataset,
|
@@ -811,7 +884,7 @@ class KernelPCA(BaseTransformer):
|
|
811
884
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
812
885
|
inference_method=inference_method,
|
813
886
|
input_cols=self.input_cols,
|
814
|
-
expected_output_cols=
|
887
|
+
expected_output_cols=expected_output_cols,
|
815
888
|
**transform_kwargs
|
816
889
|
)
|
817
890
|
return output_df
|
@@ -840,17 +913,17 @@ class KernelPCA(BaseTransformer):
|
|
840
913
|
Output dataset with probability of the sample for each class in the model.
|
841
914
|
"""
|
842
915
|
super()._check_dataset_type(dataset)
|
843
|
-
inference_method="score_samples"
|
916
|
+
inference_method = "score_samples"
|
844
917
|
|
845
918
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
846
919
|
# are specific to the type of dataset used.
|
847
920
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
848
921
|
|
922
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
923
|
+
|
849
924
|
if isinstance(dataset, DataFrame):
|
850
|
-
self.
|
851
|
-
|
852
|
-
inference_method=inference_method,
|
853
|
-
)
|
925
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
926
|
+
self._deps = self._get_dependencies()
|
854
927
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
855
928
|
transform_kwargs = dict(
|
856
929
|
session=dataset._session,
|
@@ -858,6 +931,9 @@ class KernelPCA(BaseTransformer):
|
|
858
931
|
drop_input_cols = self._drop_input_cols,
|
859
932
|
expected_output_cols_type="float",
|
860
933
|
)
|
934
|
+
expected_output_cols = self._align_expected_output_names(
|
935
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
936
|
+
)
|
861
937
|
|
862
938
|
elif isinstance(dataset, pd.DataFrame):
|
863
939
|
transform_kwargs = dict(
|
@@ -876,7 +952,7 @@ class KernelPCA(BaseTransformer):
|
|
876
952
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
877
953
|
inference_method=inference_method,
|
878
954
|
input_cols=self.input_cols,
|
879
|
-
expected_output_cols=
|
955
|
+
expected_output_cols=expected_output_cols,
|
880
956
|
**transform_kwargs
|
881
957
|
)
|
882
958
|
return output_df
|
@@ -909,17 +985,15 @@ class KernelPCA(BaseTransformer):
|
|
909
985
|
transform_kwargs: ScoreKwargsTypedDict = dict()
|
910
986
|
|
911
987
|
if isinstance(dataset, DataFrame):
|
912
|
-
self.
|
913
|
-
|
914
|
-
inference_method="score",
|
915
|
-
)
|
988
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method="score")
|
989
|
+
self._deps = self._get_dependencies()
|
916
990
|
selected_cols = self._get_active_columns()
|
917
991
|
if len(selected_cols) > 0:
|
918
992
|
dataset = dataset.select(selected_cols)
|
919
993
|
assert isinstance(dataset._session, Session) # keep mypy happy
|
920
994
|
transform_kwargs = dict(
|
921
995
|
session=dataset._session,
|
922
|
-
dependencies=
|
996
|
+
dependencies=self._deps,
|
923
997
|
score_sproc_imports=['sklearn'],
|
924
998
|
)
|
925
999
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -984,11 +1058,8 @@ class KernelPCA(BaseTransformer):
|
|
984
1058
|
|
985
1059
|
if isinstance(dataset, DataFrame):
|
986
1060
|
|
987
|
-
self.
|
988
|
-
|
989
|
-
inference_method=inference_method,
|
990
|
-
|
991
|
-
)
|
1061
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
1062
|
+
self._deps = self._get_dependencies()
|
992
1063
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
993
1064
|
transform_kwargs = dict(
|
994
1065
|
session = dataset._session,
|
@@ -1021,50 +1092,84 @@ class KernelPCA(BaseTransformer):
|
|
1021
1092
|
)
|
1022
1093
|
return output_df
|
1023
1094
|
|
1095
|
+
|
1096
|
+
|
1097
|
+
def to_sklearn(self) -> Any:
|
1098
|
+
"""Get sklearn.decomposition.KernelPCA object.
|
1099
|
+
"""
|
1100
|
+
if self._sklearn_object is None:
|
1101
|
+
self._sklearn_object = self._create_sklearn_object()
|
1102
|
+
return self._sklearn_object
|
1103
|
+
|
1104
|
+
def to_xgboost(self) -> Any:
|
1105
|
+
raise exceptions.SnowflakeMLException(
|
1106
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1107
|
+
original_exception=AttributeError(
|
1108
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1109
|
+
"to_xgboost()",
|
1110
|
+
"to_sklearn()"
|
1111
|
+
)
|
1112
|
+
),
|
1113
|
+
)
|
1024
1114
|
|
1025
|
-
def
|
1115
|
+
def to_lightgbm(self) -> Any:
|
1116
|
+
raise exceptions.SnowflakeMLException(
|
1117
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1118
|
+
original_exception=AttributeError(
|
1119
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1120
|
+
"to_lightgbm()",
|
1121
|
+
"to_sklearn()"
|
1122
|
+
)
|
1123
|
+
),
|
1124
|
+
)
|
1125
|
+
|
1126
|
+
def _get_dependencies(self) -> List[str]:
|
1127
|
+
return self._deps
|
1128
|
+
|
1129
|
+
|
1130
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
1026
1131
|
self._model_signature_dict = dict()
|
1027
1132
|
|
1028
1133
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1029
1134
|
|
1030
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1135
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1031
1136
|
outputs: List[BaseFeatureSpec] = []
|
1032
1137
|
if hasattr(self, "predict"):
|
1033
1138
|
# keep mypy happy
|
1034
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1139
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1035
1140
|
# For classifier, the type of predict is the same as the type of label
|
1036
|
-
if self._sklearn_object._estimator_type ==
|
1037
|
-
|
1141
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1142
|
+
# label columns is the desired type for output
|
1038
1143
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1039
1144
|
# rename the output columns
|
1040
1145
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1041
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1042
|
-
|
1043
|
-
|
1146
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1147
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1148
|
+
)
|
1044
1149
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1045
1150
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1046
|
-
# Clusterer returns int64 cluster labels.
|
1151
|
+
# Clusterer returns int64 cluster labels.
|
1047
1152
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1048
1153
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1049
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1050
|
-
|
1051
|
-
|
1052
|
-
|
1154
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1155
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1156
|
+
)
|
1157
|
+
|
1053
1158
|
# For regressor, the type of predict is float64
|
1054
|
-
elif self._sklearn_object._estimator_type ==
|
1159
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1055
1160
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1056
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1057
|
-
|
1058
|
-
|
1059
|
-
|
1161
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1162
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1163
|
+
)
|
1164
|
+
|
1060
1165
|
for prob_func in PROB_FUNCTIONS:
|
1061
1166
|
if hasattr(self, prob_func):
|
1062
1167
|
output_cols_prefix: str = f"{prob_func}_"
|
1063
1168
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1064
1169
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1065
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1066
|
-
|
1067
|
-
|
1170
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1171
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1172
|
+
)
|
1068
1173
|
|
1069
1174
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1070
1175
|
items = list(self._model_signature_dict.items())
|
@@ -1077,10 +1182,10 @@ class KernelPCA(BaseTransformer):
|
|
1077
1182
|
"""Returns model signature of current class.
|
1078
1183
|
|
1079
1184
|
Raises:
|
1080
|
-
|
1185
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1081
1186
|
|
1082
1187
|
Returns:
|
1083
|
-
Dict
|
1188
|
+
Dict with each method and its input output signature
|
1084
1189
|
"""
|
1085
1190
|
if self._model_signature_dict is None:
|
1086
1191
|
raise exceptions.SnowflakeMLException(
|
@@ -1088,35 +1193,3 @@ class KernelPCA(BaseTransformer):
|
|
1088
1193
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1089
1194
|
)
|
1090
1195
|
return self._model_signature_dict
|
1091
|
-
|
1092
|
-
def to_sklearn(self) -> Any:
|
1093
|
-
"""Get sklearn.decomposition.KernelPCA object.
|
1094
|
-
"""
|
1095
|
-
if self._sklearn_object is None:
|
1096
|
-
self._sklearn_object = self._create_sklearn_object()
|
1097
|
-
return self._sklearn_object
|
1098
|
-
|
1099
|
-
def to_xgboost(self) -> Any:
|
1100
|
-
raise exceptions.SnowflakeMLException(
|
1101
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1102
|
-
original_exception=AttributeError(
|
1103
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1104
|
-
"to_xgboost()",
|
1105
|
-
"to_sklearn()"
|
1106
|
-
)
|
1107
|
-
),
|
1108
|
-
)
|
1109
|
-
|
1110
|
-
def to_lightgbm(self) -> Any:
|
1111
|
-
raise exceptions.SnowflakeMLException(
|
1112
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1113
|
-
original_exception=AttributeError(
|
1114
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1115
|
-
"to_lightgbm()",
|
1116
|
-
"to_sklearn()"
|
1117
|
-
)
|
1118
|
-
),
|
1119
|
-
)
|
1120
|
-
|
1121
|
-
def _get_dependencies(self) -> List[str]:
|
1122
|
-
return self._deps
|