snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +77 -32
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/dataset/__init__.py +10 -0
- snowflake/ml/dataset/dataset.py +454 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +202 -0
- snowflake/ml/feature_store/feature_store.py +531 -332
- snowflake/ml/feature_store/feature_view.py +40 -23
- snowflake/ml/fileset/embedded_stage_fs.py +146 -0
- snowflake/ml/fileset/sfcfs.py +56 -54
- snowflake/ml/fileset/snowfs.py +159 -0
- snowflake/ml/fileset/stage_fs.py +49 -17
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +27 -0
- snowflake/ml/model/_client/model/model_version_impl.py +137 -50
- snowflake/ml/model/_client/ops/model_ops.py +159 -40
- snowflake/ml/model/_client/sql/model.py +25 -2
- snowflake/ml/model/_client/sql/model_version.py +131 -2
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -5
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
- snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
- snowflake/ml/modeling/cluster/birch.py +248 -175
- snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
- snowflake/ml/modeling/cluster/dbscan.py +246 -175
- snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
- snowflake/ml/modeling/cluster/k_means.py +248 -175
- snowflake/ml/modeling/cluster/mean_shift.py +246 -175
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
- snowflake/ml/modeling/cluster/optics.py +246 -175
- snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
- snowflake/ml/modeling/compose/column_transformer.py +248 -175
- snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
- snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
- snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
- snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
- snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
- snowflake/ml/modeling/covariance/oas.py +246 -175
- snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
- snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
- snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
- snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
- snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/pca.py +248 -175
- snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
- snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
- snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
- snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
- snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
- snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +72 -37
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
- snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
- snowflake/ml/modeling/impute/knn_imputer.py +248 -175
- snowflake/ml/modeling/impute/missing_indicator.py +248 -175
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/lars.py +246 -175
- snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
- snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/perceptron.py +246 -175
- snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ridge.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
- snowflake/ml/modeling/manifold/isomap.py +248 -175
- snowflake/ml/modeling/manifold/mds.py +248 -175
- snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
- snowflake/ml/modeling/manifold/tsne.py +248 -175
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
- snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
- snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
- snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
- snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
- snowflake/ml/modeling/pipeline/pipeline.py +517 -35
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
- snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
- snowflake/ml/modeling/svm/linear_svc.py +246 -175
- snowflake/ml/modeling/svm/linear_svr.py +246 -175
- snowflake/ml/modeling/svm/nu_svc.py +246 -175
- snowflake/ml/modeling/svm/nu_svr.py +246 -175
- snowflake/ml/modeling/svm/svc.py +246 -175
- snowflake/ml/modeling/svm/svr.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
- snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -61,12 +60,6 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.manifold".replace("sklea
|
|
61
60
|
|
62
61
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
63
62
|
|
64
|
-
def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
|
65
|
-
def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
|
66
|
-
return True and callable(getattr(self._sklearn_object, "fit_transform", None))
|
67
|
-
return check
|
68
|
-
|
69
|
-
|
70
63
|
class SpectralEmbedding(BaseTransformer):
|
71
64
|
r"""Spectral embedding for non-linear dimensionality reduction
|
72
65
|
For more details on this class, see [sklearn.manifold.SpectralEmbedding]
|
@@ -263,12 +256,7 @@ class SpectralEmbedding(BaseTransformer):
|
|
263
256
|
)
|
264
257
|
return selected_cols
|
265
258
|
|
266
|
-
|
267
|
-
project=_PROJECT,
|
268
|
-
subproject=_SUBPROJECT,
|
269
|
-
custom_tags=dict([("autogen", True)]),
|
270
|
-
)
|
271
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "SpectralEmbedding":
|
259
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "SpectralEmbedding":
|
272
260
|
"""Fit the model from data in X
|
273
261
|
For more details on this function, see [sklearn.manifold.SpectralEmbedding.fit]
|
274
262
|
(https://scikit-learn.org/stable/modules/generated/sklearn.manifold.SpectralEmbedding.html#sklearn.manifold.SpectralEmbedding.fit)
|
@@ -295,12 +283,14 @@ class SpectralEmbedding(BaseTransformer):
|
|
295
283
|
|
296
284
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
297
285
|
|
298
|
-
|
286
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
299
287
|
if SNOWML_SPROC_ENV in os.environ:
|
300
288
|
statement_params = telemetry.get_function_usage_statement_params(
|
301
289
|
project=_PROJECT,
|
302
290
|
subproject=_SUBPROJECT,
|
303
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
291
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
292
|
+
inspect.currentframe(), SpectralEmbedding.__class__.__name__
|
293
|
+
),
|
304
294
|
api_calls=[Session.call],
|
305
295
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
306
296
|
)
|
@@ -321,27 +311,24 @@ class SpectralEmbedding(BaseTransformer):
|
|
321
311
|
)
|
322
312
|
self._sklearn_object = model_trainer.train()
|
323
313
|
self._is_fitted = True
|
324
|
-
self.
|
314
|
+
self._generate_model_signatures(dataset)
|
325
315
|
return self
|
326
316
|
|
327
317
|
def _batch_inference_validate_snowpark(
|
328
318
|
self,
|
329
319
|
dataset: DataFrame,
|
330
320
|
inference_method: str,
|
331
|
-
) ->
|
332
|
-
"""Util method to run validate that batch inference can be run on a snowpark dataframe
|
333
|
-
return the available package that exists in the snowflake anaconda channel
|
321
|
+
) -> None:
|
322
|
+
"""Util method to run validate that batch inference can be run on a snowpark dataframe.
|
334
323
|
|
335
324
|
Args:
|
336
325
|
dataset: snowpark dataframe
|
337
326
|
inference_method: the inference method such as predict, score...
|
338
|
-
|
327
|
+
|
339
328
|
Raises:
|
340
329
|
SnowflakeMLException: If the estimator is not fitted, raise error
|
341
330
|
SnowflakeMLException: If the session is None, raise error
|
342
331
|
|
343
|
-
Returns:
|
344
|
-
A list of available package that exists in the snowflake anaconda channel
|
345
332
|
"""
|
346
333
|
if not self._is_fitted:
|
347
334
|
raise exceptions.SnowflakeMLException(
|
@@ -359,9 +346,7 @@ class SpectralEmbedding(BaseTransformer):
|
|
359
346
|
"Session must not specified for snowpark dataset."
|
360
347
|
),
|
361
348
|
)
|
362
|
-
|
363
|
-
return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
|
364
|
-
pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
|
349
|
+
|
365
350
|
|
366
351
|
@available_if(original_estimator_has_callable("predict")) # type: ignore[misc]
|
367
352
|
@telemetry.send_api_usage_telemetry(
|
@@ -395,7 +380,9 @@ class SpectralEmbedding(BaseTransformer):
|
|
395
380
|
# when it is classifier, infer the datatype from label columns
|
396
381
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
397
382
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
398
|
-
label_cols_signatures = [
|
383
|
+
label_cols_signatures = [
|
384
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
385
|
+
]
|
399
386
|
if len(label_cols_signatures) == 0:
|
400
387
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
401
388
|
raise exceptions.SnowflakeMLException(
|
@@ -403,25 +390,23 @@ class SpectralEmbedding(BaseTransformer):
|
|
403
390
|
original_exception=ValueError(error_str),
|
404
391
|
)
|
405
392
|
|
406
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
407
|
-
label_cols_signatures[0].as_snowpark_type()
|
408
|
-
)
|
393
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
409
394
|
|
410
|
-
self.
|
411
|
-
|
395
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
396
|
+
self._deps = self._get_dependencies()
|
397
|
+
assert isinstance(
|
398
|
+
dataset._session, Session
|
399
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
412
400
|
|
413
401
|
transform_kwargs = dict(
|
414
|
-
session
|
415
|
-
dependencies
|
416
|
-
drop_input_cols
|
417
|
-
expected_output_cols_type
|
402
|
+
session=dataset._session,
|
403
|
+
dependencies=self._deps,
|
404
|
+
drop_input_cols=self._drop_input_cols,
|
405
|
+
expected_output_cols_type=expected_type_inferred,
|
418
406
|
)
|
419
407
|
|
420
408
|
elif isinstance(dataset, pd.DataFrame):
|
421
|
-
transform_kwargs = dict(
|
422
|
-
snowpark_input_cols = self._snowpark_cols,
|
423
|
-
drop_input_cols = self._drop_input_cols
|
424
|
-
)
|
409
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
425
410
|
|
426
411
|
transform_handlers = ModelTransformerBuilder.build(
|
427
412
|
dataset=dataset,
|
@@ -461,7 +446,7 @@ class SpectralEmbedding(BaseTransformer):
|
|
461
446
|
Transformed dataset.
|
462
447
|
"""
|
463
448
|
super()._check_dataset_type(dataset)
|
464
|
-
inference_method="transform"
|
449
|
+
inference_method = "transform"
|
465
450
|
|
466
451
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
467
452
|
# are specific to the type of dataset used.
|
@@ -491,24 +476,19 @@ class SpectralEmbedding(BaseTransformer):
|
|
491
476
|
if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
|
492
477
|
expected_dtype = convert_sp_to_sf_type(output_types[0])
|
493
478
|
|
494
|
-
self.
|
495
|
-
|
496
|
-
inference_method=inference_method,
|
497
|
-
)
|
479
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
480
|
+
self._deps = self._get_dependencies()
|
498
481
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
499
482
|
|
500
483
|
transform_kwargs = dict(
|
501
|
-
session
|
502
|
-
dependencies
|
503
|
-
drop_input_cols
|
504
|
-
expected_output_cols_type
|
484
|
+
session=dataset._session,
|
485
|
+
dependencies=self._deps,
|
486
|
+
drop_input_cols=self._drop_input_cols,
|
487
|
+
expected_output_cols_type=expected_dtype,
|
505
488
|
)
|
506
489
|
|
507
490
|
elif isinstance(dataset, pd.DataFrame):
|
508
|
-
transform_kwargs = dict(
|
509
|
-
snowpark_input_cols = self._snowpark_cols,
|
510
|
-
drop_input_cols = self._drop_input_cols
|
511
|
-
)
|
491
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
512
492
|
|
513
493
|
transform_handlers = ModelTransformerBuilder.build(
|
514
494
|
dataset=dataset,
|
@@ -527,7 +507,11 @@ class SpectralEmbedding(BaseTransformer):
|
|
527
507
|
return output_df
|
528
508
|
|
529
509
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
530
|
-
def fit_predict(
|
510
|
+
def fit_predict(
|
511
|
+
self,
|
512
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
513
|
+
output_cols_prefix: str = "fit_predict_",
|
514
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
531
515
|
""" Method not supported for this class.
|
532
516
|
|
533
517
|
|
@@ -552,22 +536,106 @@ class SpectralEmbedding(BaseTransformer):
|
|
552
536
|
)
|
553
537
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
554
538
|
drop_input_cols=self._drop_input_cols,
|
555
|
-
expected_output_cols_list=
|
539
|
+
expected_output_cols_list=(
|
540
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
541
|
+
),
|
556
542
|
)
|
557
543
|
self._sklearn_object = fitted_estimator
|
558
544
|
self._is_fitted = True
|
559
545
|
return output_result
|
560
546
|
|
547
|
+
|
548
|
+
@available_if(original_estimator_has_callable("fit_transform")) # type: ignore[misc]
|
549
|
+
def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_transform_",) -> Union[DataFrame, pd.DataFrame]:
|
550
|
+
""" Fit the model from data in X and transform X
|
551
|
+
For more details on this function, see [sklearn.manifold.SpectralEmbedding.fit_transform]
|
552
|
+
(https://scikit-learn.org/stable/modules/generated/sklearn.manifold.SpectralEmbedding.html#sklearn.manifold.SpectralEmbedding.fit_transform)
|
553
|
+
|
554
|
+
|
555
|
+
Raises:
|
556
|
+
TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame.
|
561
557
|
|
562
|
-
|
563
|
-
|
564
|
-
|
558
|
+
Args:
|
559
|
+
dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
|
560
|
+
Snowpark or Pandas DataFrame.
|
561
|
+
output_cols_prefix: Prefix for the response columns
|
565
562
|
Returns:
|
566
563
|
Transformed dataset.
|
567
564
|
"""
|
568
|
-
self.
|
569
|
-
|
570
|
-
|
565
|
+
self._infer_input_output_cols(dataset)
|
566
|
+
super()._check_dataset_type(dataset)
|
567
|
+
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
568
|
+
estimator=self._sklearn_object,
|
569
|
+
dataset=dataset,
|
570
|
+
input_cols=self.input_cols,
|
571
|
+
label_cols=self.label_cols,
|
572
|
+
sample_weight_col=self.sample_weight_col,
|
573
|
+
autogenerated=self._autogenerated,
|
574
|
+
subproject=_SUBPROJECT,
|
575
|
+
)
|
576
|
+
output_result, fitted_estimator = model_trainer.train_fit_transform(
|
577
|
+
drop_input_cols=self._drop_input_cols,
|
578
|
+
expected_output_cols_list=self.output_cols,
|
579
|
+
)
|
580
|
+
self._sklearn_object = fitted_estimator
|
581
|
+
self._is_fitted = True
|
582
|
+
return output_result
|
583
|
+
|
584
|
+
|
585
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
586
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
587
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
588
|
+
"""
|
589
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
590
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
591
|
+
if output_cols:
|
592
|
+
output_cols = [
|
593
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
594
|
+
for c in output_cols
|
595
|
+
]
|
596
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
597
|
+
output_cols = [output_cols_prefix]
|
598
|
+
elif self._sklearn_object is not None:
|
599
|
+
classes = self._sklearn_object.classes_
|
600
|
+
if isinstance(classes, numpy.ndarray):
|
601
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
602
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
603
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
604
|
+
output_cols = []
|
605
|
+
for i, cl in enumerate(classes):
|
606
|
+
# For binary classification, there is only one output column for each class
|
607
|
+
# ndarray as the two classes are complementary.
|
608
|
+
if len(cl) == 2:
|
609
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
610
|
+
else:
|
611
|
+
output_cols.extend([
|
612
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
613
|
+
])
|
614
|
+
else:
|
615
|
+
output_cols = []
|
616
|
+
|
617
|
+
# Make sure column names are valid snowflake identifiers.
|
618
|
+
assert output_cols is not None # Make MyPy happy
|
619
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
620
|
+
|
621
|
+
return rv
|
622
|
+
|
623
|
+
def _align_expected_output_names(
|
624
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
625
|
+
) -> List[str]:
|
626
|
+
# in case the inferred output column names dimension is different
|
627
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
628
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
629
|
+
output_df_columns = list(output_df_pd.columns)
|
630
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
631
|
+
if self.sample_weight_col:
|
632
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
633
|
+
# if the dimension of inferred output column names is correct; use it
|
634
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
635
|
+
return expected_output_cols_list
|
636
|
+
# otherwise, use the sklearn estimator's output
|
637
|
+
else:
|
638
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
571
639
|
|
572
640
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
573
641
|
@telemetry.send_api_usage_telemetry(
|
@@ -599,24 +667,26 @@ class SpectralEmbedding(BaseTransformer):
|
|
599
667
|
# are specific to the type of dataset used.
|
600
668
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
601
669
|
|
670
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
671
|
+
|
602
672
|
if isinstance(dataset, DataFrame):
|
603
|
-
self.
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
673
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
674
|
+
self._deps = self._get_dependencies()
|
675
|
+
assert isinstance(
|
676
|
+
dataset._session, Session
|
677
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
608
678
|
transform_kwargs = dict(
|
609
679
|
session=dataset._session,
|
610
680
|
dependencies=self._deps,
|
611
|
-
drop_input_cols
|
681
|
+
drop_input_cols=self._drop_input_cols,
|
612
682
|
expected_output_cols_type="float",
|
613
683
|
)
|
684
|
+
expected_output_cols = self._align_expected_output_names(
|
685
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
686
|
+
)
|
614
687
|
|
615
688
|
elif isinstance(dataset, pd.DataFrame):
|
616
|
-
transform_kwargs = dict(
|
617
|
-
snowpark_input_cols = self._snowpark_cols,
|
618
|
-
drop_input_cols = self._drop_input_cols
|
619
|
-
)
|
689
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
620
690
|
|
621
691
|
transform_handlers = ModelTransformerBuilder.build(
|
622
692
|
dataset=dataset,
|
@@ -628,7 +698,7 @@ class SpectralEmbedding(BaseTransformer):
|
|
628
698
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
629
699
|
inference_method=inference_method,
|
630
700
|
input_cols=self.input_cols,
|
631
|
-
expected_output_cols=
|
701
|
+
expected_output_cols=expected_output_cols,
|
632
702
|
**transform_kwargs
|
633
703
|
)
|
634
704
|
return output_df
|
@@ -658,29 +728,30 @@ class SpectralEmbedding(BaseTransformer):
|
|
658
728
|
Output dataset with log probability of the sample for each class in the model.
|
659
729
|
"""
|
660
730
|
super()._check_dataset_type(dataset)
|
661
|
-
inference_method="predict_log_proba"
|
731
|
+
inference_method = "predict_log_proba"
|
732
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
662
733
|
|
663
734
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
664
735
|
# are specific to the type of dataset used.
|
665
736
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
666
737
|
|
667
738
|
if isinstance(dataset, DataFrame):
|
668
|
-
self.
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
739
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
740
|
+
self._deps = self._get_dependencies()
|
741
|
+
assert isinstance(
|
742
|
+
dataset._session, Session
|
743
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
673
744
|
transform_kwargs = dict(
|
674
745
|
session=dataset._session,
|
675
746
|
dependencies=self._deps,
|
676
|
-
drop_input_cols
|
747
|
+
drop_input_cols=self._drop_input_cols,
|
677
748
|
expected_output_cols_type="float",
|
678
749
|
)
|
750
|
+
expected_output_cols = self._align_expected_output_names(
|
751
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
752
|
+
)
|
679
753
|
elif isinstance(dataset, pd.DataFrame):
|
680
|
-
transform_kwargs = dict(
|
681
|
-
snowpark_input_cols = self._snowpark_cols,
|
682
|
-
drop_input_cols = self._drop_input_cols
|
683
|
-
)
|
754
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
684
755
|
|
685
756
|
transform_handlers = ModelTransformerBuilder.build(
|
686
757
|
dataset=dataset,
|
@@ -693,7 +764,7 @@ class SpectralEmbedding(BaseTransformer):
|
|
693
764
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
694
765
|
inference_method=inference_method,
|
695
766
|
input_cols=self.input_cols,
|
696
|
-
expected_output_cols=
|
767
|
+
expected_output_cols=expected_output_cols,
|
697
768
|
**transform_kwargs
|
698
769
|
)
|
699
770
|
return output_df
|
@@ -719,30 +790,32 @@ class SpectralEmbedding(BaseTransformer):
|
|
719
790
|
Output dataset with results of the decision function for the samples in input dataset.
|
720
791
|
"""
|
721
792
|
super()._check_dataset_type(dataset)
|
722
|
-
inference_method="decision_function"
|
793
|
+
inference_method = "decision_function"
|
723
794
|
|
724
795
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
725
796
|
# are specific to the type of dataset used.
|
726
797
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
727
798
|
|
799
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
800
|
+
|
728
801
|
if isinstance(dataset, DataFrame):
|
729
|
-
self.
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
802
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
803
|
+
self._deps = self._get_dependencies()
|
804
|
+
assert isinstance(
|
805
|
+
dataset._session, Session
|
806
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
734
807
|
transform_kwargs = dict(
|
735
808
|
session=dataset._session,
|
736
809
|
dependencies=self._deps,
|
737
|
-
drop_input_cols
|
810
|
+
drop_input_cols=self._drop_input_cols,
|
738
811
|
expected_output_cols_type="float",
|
739
812
|
)
|
813
|
+
expected_output_cols = self._align_expected_output_names(
|
814
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
815
|
+
)
|
740
816
|
|
741
817
|
elif isinstance(dataset, pd.DataFrame):
|
742
|
-
transform_kwargs = dict(
|
743
|
-
snowpark_input_cols = self._snowpark_cols,
|
744
|
-
drop_input_cols = self._drop_input_cols
|
745
|
-
)
|
818
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
746
819
|
|
747
820
|
transform_handlers = ModelTransformerBuilder.build(
|
748
821
|
dataset=dataset,
|
@@ -755,7 +828,7 @@ class SpectralEmbedding(BaseTransformer):
|
|
755
828
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
756
829
|
inference_method=inference_method,
|
757
830
|
input_cols=self.input_cols,
|
758
|
-
expected_output_cols=
|
831
|
+
expected_output_cols=expected_output_cols,
|
759
832
|
**transform_kwargs
|
760
833
|
)
|
761
834
|
return output_df
|
@@ -784,17 +857,17 @@ class SpectralEmbedding(BaseTransformer):
|
|
784
857
|
Output dataset with probability of the sample for each class in the model.
|
785
858
|
"""
|
786
859
|
super()._check_dataset_type(dataset)
|
787
|
-
inference_method="score_samples"
|
860
|
+
inference_method = "score_samples"
|
788
861
|
|
789
862
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
790
863
|
# are specific to the type of dataset used.
|
791
864
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
792
865
|
|
866
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
867
|
+
|
793
868
|
if isinstance(dataset, DataFrame):
|
794
|
-
self.
|
795
|
-
|
796
|
-
inference_method=inference_method,
|
797
|
-
)
|
869
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
870
|
+
self._deps = self._get_dependencies()
|
798
871
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
799
872
|
transform_kwargs = dict(
|
800
873
|
session=dataset._session,
|
@@ -802,6 +875,9 @@ class SpectralEmbedding(BaseTransformer):
|
|
802
875
|
drop_input_cols = self._drop_input_cols,
|
803
876
|
expected_output_cols_type="float",
|
804
877
|
)
|
878
|
+
expected_output_cols = self._align_expected_output_names(
|
879
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
880
|
+
)
|
805
881
|
|
806
882
|
elif isinstance(dataset, pd.DataFrame):
|
807
883
|
transform_kwargs = dict(
|
@@ -820,7 +896,7 @@ class SpectralEmbedding(BaseTransformer):
|
|
820
896
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
821
897
|
inference_method=inference_method,
|
822
898
|
input_cols=self.input_cols,
|
823
|
-
expected_output_cols=
|
899
|
+
expected_output_cols=expected_output_cols,
|
824
900
|
**transform_kwargs
|
825
901
|
)
|
826
902
|
return output_df
|
@@ -853,17 +929,15 @@ class SpectralEmbedding(BaseTransformer):
|
|
853
929
|
transform_kwargs: ScoreKwargsTypedDict = dict()
|
854
930
|
|
855
931
|
if isinstance(dataset, DataFrame):
|
856
|
-
self.
|
857
|
-
|
858
|
-
inference_method="score",
|
859
|
-
)
|
932
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method="score")
|
933
|
+
self._deps = self._get_dependencies()
|
860
934
|
selected_cols = self._get_active_columns()
|
861
935
|
if len(selected_cols) > 0:
|
862
936
|
dataset = dataset.select(selected_cols)
|
863
937
|
assert isinstance(dataset._session, Session) # keep mypy happy
|
864
938
|
transform_kwargs = dict(
|
865
939
|
session=dataset._session,
|
866
|
-
dependencies=
|
940
|
+
dependencies=self._deps,
|
867
941
|
score_sproc_imports=['sklearn'],
|
868
942
|
)
|
869
943
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -928,11 +1002,8 @@ class SpectralEmbedding(BaseTransformer):
|
|
928
1002
|
|
929
1003
|
if isinstance(dataset, DataFrame):
|
930
1004
|
|
931
|
-
self.
|
932
|
-
|
933
|
-
inference_method=inference_method,
|
934
|
-
|
935
|
-
)
|
1005
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
1006
|
+
self._deps = self._get_dependencies()
|
936
1007
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
937
1008
|
transform_kwargs = dict(
|
938
1009
|
session = dataset._session,
|
@@ -965,50 +1036,84 @@ class SpectralEmbedding(BaseTransformer):
|
|
965
1036
|
)
|
966
1037
|
return output_df
|
967
1038
|
|
1039
|
+
|
1040
|
+
|
1041
|
+
def to_sklearn(self) -> Any:
|
1042
|
+
"""Get sklearn.manifold.SpectralEmbedding object.
|
1043
|
+
"""
|
1044
|
+
if self._sklearn_object is None:
|
1045
|
+
self._sklearn_object = self._create_sklearn_object()
|
1046
|
+
return self._sklearn_object
|
1047
|
+
|
1048
|
+
def to_xgboost(self) -> Any:
|
1049
|
+
raise exceptions.SnowflakeMLException(
|
1050
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1051
|
+
original_exception=AttributeError(
|
1052
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1053
|
+
"to_xgboost()",
|
1054
|
+
"to_sklearn()"
|
1055
|
+
)
|
1056
|
+
),
|
1057
|
+
)
|
968
1058
|
|
969
|
-
def
|
1059
|
+
def to_lightgbm(self) -> Any:
|
1060
|
+
raise exceptions.SnowflakeMLException(
|
1061
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1062
|
+
original_exception=AttributeError(
|
1063
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1064
|
+
"to_lightgbm()",
|
1065
|
+
"to_sklearn()"
|
1066
|
+
)
|
1067
|
+
),
|
1068
|
+
)
|
1069
|
+
|
1070
|
+
def _get_dependencies(self) -> List[str]:
|
1071
|
+
return self._deps
|
1072
|
+
|
1073
|
+
|
1074
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
970
1075
|
self._model_signature_dict = dict()
|
971
1076
|
|
972
1077
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
973
1078
|
|
974
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1079
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
975
1080
|
outputs: List[BaseFeatureSpec] = []
|
976
1081
|
if hasattr(self, "predict"):
|
977
1082
|
# keep mypy happy
|
978
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1083
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
979
1084
|
# For classifier, the type of predict is the same as the type of label
|
980
|
-
if self._sklearn_object._estimator_type ==
|
981
|
-
|
1085
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1086
|
+
# label columns is the desired type for output
|
982
1087
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
983
1088
|
# rename the output columns
|
984
1089
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
985
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
986
|
-
|
987
|
-
|
1090
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1091
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1092
|
+
)
|
988
1093
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
989
1094
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
990
|
-
# Clusterer returns int64 cluster labels.
|
1095
|
+
# Clusterer returns int64 cluster labels.
|
991
1096
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
992
1097
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
993
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
994
|
-
|
995
|
-
|
996
|
-
|
1098
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1099
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1100
|
+
)
|
1101
|
+
|
997
1102
|
# For regressor, the type of predict is float64
|
998
|
-
elif self._sklearn_object._estimator_type ==
|
1103
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
999
1104
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1000
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1001
|
-
|
1002
|
-
|
1003
|
-
|
1105
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1106
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1107
|
+
)
|
1108
|
+
|
1004
1109
|
for prob_func in PROB_FUNCTIONS:
|
1005
1110
|
if hasattr(self, prob_func):
|
1006
1111
|
output_cols_prefix: str = f"{prob_func}_"
|
1007
1112
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1008
1113
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1009
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1010
|
-
|
1011
|
-
|
1114
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1115
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1116
|
+
)
|
1012
1117
|
|
1013
1118
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1014
1119
|
items = list(self._model_signature_dict.items())
|
@@ -1021,10 +1126,10 @@ class SpectralEmbedding(BaseTransformer):
|
|
1021
1126
|
"""Returns model signature of current class.
|
1022
1127
|
|
1023
1128
|
Raises:
|
1024
|
-
|
1129
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1025
1130
|
|
1026
1131
|
Returns:
|
1027
|
-
Dict
|
1132
|
+
Dict with each method and its input output signature
|
1028
1133
|
"""
|
1029
1134
|
if self._model_signature_dict is None:
|
1030
1135
|
raise exceptions.SnowflakeMLException(
|
@@ -1032,35 +1137,3 @@ class SpectralEmbedding(BaseTransformer):
|
|
1032
1137
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1033
1138
|
)
|
1034
1139
|
return self._model_signature_dict
|
1035
|
-
|
1036
|
-
def to_sklearn(self) -> Any:
|
1037
|
-
"""Get sklearn.manifold.SpectralEmbedding object.
|
1038
|
-
"""
|
1039
|
-
if self._sklearn_object is None:
|
1040
|
-
self._sklearn_object = self._create_sklearn_object()
|
1041
|
-
return self._sklearn_object
|
1042
|
-
|
1043
|
-
def to_xgboost(self) -> Any:
|
1044
|
-
raise exceptions.SnowflakeMLException(
|
1045
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1046
|
-
original_exception=AttributeError(
|
1047
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1048
|
-
"to_xgboost()",
|
1049
|
-
"to_sklearn()"
|
1050
|
-
)
|
1051
|
-
),
|
1052
|
-
)
|
1053
|
-
|
1054
|
-
def to_lightgbm(self) -> Any:
|
1055
|
-
raise exceptions.SnowflakeMLException(
|
1056
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1057
|
-
original_exception=AttributeError(
|
1058
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1059
|
-
"to_lightgbm()",
|
1060
|
-
"to_sklearn()"
|
1061
|
-
)
|
1062
|
-
),
|
1063
|
-
)
|
1064
|
-
|
1065
|
-
def _get_dependencies(self) -> List[str]:
|
1066
|
-
return self._deps
|