snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +77 -32
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/dataset/__init__.py +10 -0
- snowflake/ml/dataset/dataset.py +454 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +202 -0
- snowflake/ml/feature_store/feature_store.py +531 -332
- snowflake/ml/feature_store/feature_view.py +40 -23
- snowflake/ml/fileset/embedded_stage_fs.py +146 -0
- snowflake/ml/fileset/sfcfs.py +56 -54
- snowflake/ml/fileset/snowfs.py +159 -0
- snowflake/ml/fileset/stage_fs.py +49 -17
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +27 -0
- snowflake/ml/model/_client/model/model_version_impl.py +137 -50
- snowflake/ml/model/_client/ops/model_ops.py +159 -40
- snowflake/ml/model/_client/sql/model.py +25 -2
- snowflake/ml/model/_client/sql/model_version.py +131 -2
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -5
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
- snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
- snowflake/ml/modeling/cluster/birch.py +248 -175
- snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
- snowflake/ml/modeling/cluster/dbscan.py +246 -175
- snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
- snowflake/ml/modeling/cluster/k_means.py +248 -175
- snowflake/ml/modeling/cluster/mean_shift.py +246 -175
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
- snowflake/ml/modeling/cluster/optics.py +246 -175
- snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
- snowflake/ml/modeling/compose/column_transformer.py +248 -175
- snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
- snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
- snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
- snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
- snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
- snowflake/ml/modeling/covariance/oas.py +246 -175
- snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
- snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
- snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
- snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
- snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/pca.py +248 -175
- snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
- snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
- snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
- snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
- snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
- snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +72 -37
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
- snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
- snowflake/ml/modeling/impute/knn_imputer.py +248 -175
- snowflake/ml/modeling/impute/missing_indicator.py +248 -175
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/lars.py +246 -175
- snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
- snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/perceptron.py +246 -175
- snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ridge.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
- snowflake/ml/modeling/manifold/isomap.py +248 -175
- snowflake/ml/modeling/manifold/mds.py +248 -175
- snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
- snowflake/ml/modeling/manifold/tsne.py +248 -175
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
- snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
- snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
- snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
- snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
- snowflake/ml/modeling/pipeline/pipeline.py +517 -35
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
- snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
- snowflake/ml/modeling/svm/linear_svc.py +246 -175
- snowflake/ml/modeling/svm/linear_svr.py +246 -175
- snowflake/ml/modeling/svm/nu_svc.py +246 -175
- snowflake/ml/modeling/svm/nu_svr.py +246 -175
- snowflake/ml/modeling/svm/svc.py +246 -175
- snowflake/ml/modeling/svm/svr.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
- snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -61,12 +60,6 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.decomposition".replace("
|
|
61
60
|
|
62
61
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
63
62
|
|
64
|
-
def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
|
65
|
-
def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
|
66
|
-
return False and callable(getattr(self._sklearn_object, "fit_transform", None))
|
67
|
-
return check
|
68
|
-
|
69
|
-
|
70
63
|
class IncrementalPCA(BaseTransformer):
|
71
64
|
r"""Incremental principal components analysis (IPCA)
|
72
65
|
For more details on this class, see [sklearn.decomposition.IncrementalPCA]
|
@@ -221,12 +214,7 @@ class IncrementalPCA(BaseTransformer):
|
|
221
214
|
)
|
222
215
|
return selected_cols
|
223
216
|
|
224
|
-
|
225
|
-
project=_PROJECT,
|
226
|
-
subproject=_SUBPROJECT,
|
227
|
-
custom_tags=dict([("autogen", True)]),
|
228
|
-
)
|
229
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "IncrementalPCA":
|
217
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "IncrementalPCA":
|
230
218
|
"""Fit the model with X, using minibatches of size batch_size
|
231
219
|
For more details on this function, see [sklearn.decomposition.IncrementalPCA.fit]
|
232
220
|
(https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA.fit)
|
@@ -253,12 +241,14 @@ class IncrementalPCA(BaseTransformer):
|
|
253
241
|
|
254
242
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
255
243
|
|
256
|
-
|
244
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
257
245
|
if SNOWML_SPROC_ENV in os.environ:
|
258
246
|
statement_params = telemetry.get_function_usage_statement_params(
|
259
247
|
project=_PROJECT,
|
260
248
|
subproject=_SUBPROJECT,
|
261
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
249
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
250
|
+
inspect.currentframe(), IncrementalPCA.__class__.__name__
|
251
|
+
),
|
262
252
|
api_calls=[Session.call],
|
263
253
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
264
254
|
)
|
@@ -279,27 +269,24 @@ class IncrementalPCA(BaseTransformer):
|
|
279
269
|
)
|
280
270
|
self._sklearn_object = model_trainer.train()
|
281
271
|
self._is_fitted = True
|
282
|
-
self.
|
272
|
+
self._generate_model_signatures(dataset)
|
283
273
|
return self
|
284
274
|
|
285
275
|
def _batch_inference_validate_snowpark(
|
286
276
|
self,
|
287
277
|
dataset: DataFrame,
|
288
278
|
inference_method: str,
|
289
|
-
) ->
|
290
|
-
"""Util method to run validate that batch inference can be run on a snowpark dataframe
|
291
|
-
return the available package that exists in the snowflake anaconda channel
|
279
|
+
) -> None:
|
280
|
+
"""Util method to run validate that batch inference can be run on a snowpark dataframe.
|
292
281
|
|
293
282
|
Args:
|
294
283
|
dataset: snowpark dataframe
|
295
284
|
inference_method: the inference method such as predict, score...
|
296
|
-
|
285
|
+
|
297
286
|
Raises:
|
298
287
|
SnowflakeMLException: If the estimator is not fitted, raise error
|
299
288
|
SnowflakeMLException: If the session is None, raise error
|
300
289
|
|
301
|
-
Returns:
|
302
|
-
A list of available package that exists in the snowflake anaconda channel
|
303
290
|
"""
|
304
291
|
if not self._is_fitted:
|
305
292
|
raise exceptions.SnowflakeMLException(
|
@@ -317,9 +304,7 @@ class IncrementalPCA(BaseTransformer):
|
|
317
304
|
"Session must not specified for snowpark dataset."
|
318
305
|
),
|
319
306
|
)
|
320
|
-
|
321
|
-
return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
|
322
|
-
pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
|
307
|
+
|
323
308
|
|
324
309
|
@available_if(original_estimator_has_callable("predict")) # type: ignore[misc]
|
325
310
|
@telemetry.send_api_usage_telemetry(
|
@@ -353,7 +338,9 @@ class IncrementalPCA(BaseTransformer):
|
|
353
338
|
# when it is classifier, infer the datatype from label columns
|
354
339
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
355
340
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
356
|
-
label_cols_signatures = [
|
341
|
+
label_cols_signatures = [
|
342
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
343
|
+
]
|
357
344
|
if len(label_cols_signatures) == 0:
|
358
345
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
359
346
|
raise exceptions.SnowflakeMLException(
|
@@ -361,25 +348,23 @@ class IncrementalPCA(BaseTransformer):
|
|
361
348
|
original_exception=ValueError(error_str),
|
362
349
|
)
|
363
350
|
|
364
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
365
|
-
label_cols_signatures[0].as_snowpark_type()
|
366
|
-
)
|
351
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
367
352
|
|
368
|
-
self.
|
369
|
-
|
353
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
354
|
+
self._deps = self._get_dependencies()
|
355
|
+
assert isinstance(
|
356
|
+
dataset._session, Session
|
357
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
370
358
|
|
371
359
|
transform_kwargs = dict(
|
372
|
-
session
|
373
|
-
dependencies
|
374
|
-
drop_input_cols
|
375
|
-
expected_output_cols_type
|
360
|
+
session=dataset._session,
|
361
|
+
dependencies=self._deps,
|
362
|
+
drop_input_cols=self._drop_input_cols,
|
363
|
+
expected_output_cols_type=expected_type_inferred,
|
376
364
|
)
|
377
365
|
|
378
366
|
elif isinstance(dataset, pd.DataFrame):
|
379
|
-
transform_kwargs = dict(
|
380
|
-
snowpark_input_cols = self._snowpark_cols,
|
381
|
-
drop_input_cols = self._drop_input_cols
|
382
|
-
)
|
367
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
383
368
|
|
384
369
|
transform_handlers = ModelTransformerBuilder.build(
|
385
370
|
dataset=dataset,
|
@@ -421,7 +406,7 @@ class IncrementalPCA(BaseTransformer):
|
|
421
406
|
Transformed dataset.
|
422
407
|
"""
|
423
408
|
super()._check_dataset_type(dataset)
|
424
|
-
inference_method="transform"
|
409
|
+
inference_method = "transform"
|
425
410
|
|
426
411
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
427
412
|
# are specific to the type of dataset used.
|
@@ -451,24 +436,19 @@ class IncrementalPCA(BaseTransformer):
|
|
451
436
|
if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
|
452
437
|
expected_dtype = convert_sp_to_sf_type(output_types[0])
|
453
438
|
|
454
|
-
self.
|
455
|
-
|
456
|
-
inference_method=inference_method,
|
457
|
-
)
|
439
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
440
|
+
self._deps = self._get_dependencies()
|
458
441
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
459
442
|
|
460
443
|
transform_kwargs = dict(
|
461
|
-
session
|
462
|
-
dependencies
|
463
|
-
drop_input_cols
|
464
|
-
expected_output_cols_type
|
444
|
+
session=dataset._session,
|
445
|
+
dependencies=self._deps,
|
446
|
+
drop_input_cols=self._drop_input_cols,
|
447
|
+
expected_output_cols_type=expected_dtype,
|
465
448
|
)
|
466
449
|
|
467
450
|
elif isinstance(dataset, pd.DataFrame):
|
468
|
-
transform_kwargs = dict(
|
469
|
-
snowpark_input_cols = self._snowpark_cols,
|
470
|
-
drop_input_cols = self._drop_input_cols
|
471
|
-
)
|
451
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
472
452
|
|
473
453
|
transform_handlers = ModelTransformerBuilder.build(
|
474
454
|
dataset=dataset,
|
@@ -487,7 +467,11 @@ class IncrementalPCA(BaseTransformer):
|
|
487
467
|
return output_df
|
488
468
|
|
489
469
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
490
|
-
def fit_predict(
|
470
|
+
def fit_predict(
|
471
|
+
self,
|
472
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
473
|
+
output_cols_prefix: str = "fit_predict_",
|
474
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
491
475
|
""" Method not supported for this class.
|
492
476
|
|
493
477
|
|
@@ -512,22 +496,106 @@ class IncrementalPCA(BaseTransformer):
|
|
512
496
|
)
|
513
497
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
514
498
|
drop_input_cols=self._drop_input_cols,
|
515
|
-
expected_output_cols_list=
|
499
|
+
expected_output_cols_list=(
|
500
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
501
|
+
),
|
516
502
|
)
|
517
503
|
self._sklearn_object = fitted_estimator
|
518
504
|
self._is_fitted = True
|
519
505
|
return output_result
|
520
506
|
|
507
|
+
|
508
|
+
@available_if(original_estimator_has_callable("fit_transform")) # type: ignore[misc]
|
509
|
+
def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_transform_",) -> Union[DataFrame, pd.DataFrame]:
|
510
|
+
""" Fit to data, then transform it
|
511
|
+
For more details on this function, see [sklearn.decomposition.IncrementalPCA.fit_transform]
|
512
|
+
(https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA.fit_transform)
|
513
|
+
|
514
|
+
|
515
|
+
Raises:
|
516
|
+
TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame.
|
521
517
|
|
522
|
-
|
523
|
-
|
524
|
-
|
518
|
+
Args:
|
519
|
+
dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
|
520
|
+
Snowpark or Pandas DataFrame.
|
521
|
+
output_cols_prefix: Prefix for the response columns
|
525
522
|
Returns:
|
526
523
|
Transformed dataset.
|
527
524
|
"""
|
528
|
-
self.
|
529
|
-
|
530
|
-
|
525
|
+
self._infer_input_output_cols(dataset)
|
526
|
+
super()._check_dataset_type(dataset)
|
527
|
+
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
528
|
+
estimator=self._sklearn_object,
|
529
|
+
dataset=dataset,
|
530
|
+
input_cols=self.input_cols,
|
531
|
+
label_cols=self.label_cols,
|
532
|
+
sample_weight_col=self.sample_weight_col,
|
533
|
+
autogenerated=self._autogenerated,
|
534
|
+
subproject=_SUBPROJECT,
|
535
|
+
)
|
536
|
+
output_result, fitted_estimator = model_trainer.train_fit_transform(
|
537
|
+
drop_input_cols=self._drop_input_cols,
|
538
|
+
expected_output_cols_list=self.output_cols,
|
539
|
+
)
|
540
|
+
self._sklearn_object = fitted_estimator
|
541
|
+
self._is_fitted = True
|
542
|
+
return output_result
|
543
|
+
|
544
|
+
|
545
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
546
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
547
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
548
|
+
"""
|
549
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
550
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
551
|
+
if output_cols:
|
552
|
+
output_cols = [
|
553
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
554
|
+
for c in output_cols
|
555
|
+
]
|
556
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
557
|
+
output_cols = [output_cols_prefix]
|
558
|
+
elif self._sklearn_object is not None:
|
559
|
+
classes = self._sklearn_object.classes_
|
560
|
+
if isinstance(classes, numpy.ndarray):
|
561
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
562
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
563
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
564
|
+
output_cols = []
|
565
|
+
for i, cl in enumerate(classes):
|
566
|
+
# For binary classification, there is only one output column for each class
|
567
|
+
# ndarray as the two classes are complementary.
|
568
|
+
if len(cl) == 2:
|
569
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
570
|
+
else:
|
571
|
+
output_cols.extend([
|
572
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
573
|
+
])
|
574
|
+
else:
|
575
|
+
output_cols = []
|
576
|
+
|
577
|
+
# Make sure column names are valid snowflake identifiers.
|
578
|
+
assert output_cols is not None # Make MyPy happy
|
579
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
580
|
+
|
581
|
+
return rv
|
582
|
+
|
583
|
+
def _align_expected_output_names(
|
584
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
585
|
+
) -> List[str]:
|
586
|
+
# in case the inferred output column names dimension is different
|
587
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
588
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
589
|
+
output_df_columns = list(output_df_pd.columns)
|
590
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
591
|
+
if self.sample_weight_col:
|
592
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
593
|
+
# if the dimension of inferred output column names is correct; use it
|
594
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
595
|
+
return expected_output_cols_list
|
596
|
+
# otherwise, use the sklearn estimator's output
|
597
|
+
else:
|
598
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
531
599
|
|
532
600
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
533
601
|
@telemetry.send_api_usage_telemetry(
|
@@ -559,24 +627,26 @@ class IncrementalPCA(BaseTransformer):
|
|
559
627
|
# are specific to the type of dataset used.
|
560
628
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
561
629
|
|
630
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
631
|
+
|
562
632
|
if isinstance(dataset, DataFrame):
|
563
|
-
self.
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
633
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
634
|
+
self._deps = self._get_dependencies()
|
635
|
+
assert isinstance(
|
636
|
+
dataset._session, Session
|
637
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
568
638
|
transform_kwargs = dict(
|
569
639
|
session=dataset._session,
|
570
640
|
dependencies=self._deps,
|
571
|
-
drop_input_cols
|
641
|
+
drop_input_cols=self._drop_input_cols,
|
572
642
|
expected_output_cols_type="float",
|
573
643
|
)
|
644
|
+
expected_output_cols = self._align_expected_output_names(
|
645
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
646
|
+
)
|
574
647
|
|
575
648
|
elif isinstance(dataset, pd.DataFrame):
|
576
|
-
transform_kwargs = dict(
|
577
|
-
snowpark_input_cols = self._snowpark_cols,
|
578
|
-
drop_input_cols = self._drop_input_cols
|
579
|
-
)
|
649
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
580
650
|
|
581
651
|
transform_handlers = ModelTransformerBuilder.build(
|
582
652
|
dataset=dataset,
|
@@ -588,7 +658,7 @@ class IncrementalPCA(BaseTransformer):
|
|
588
658
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
589
659
|
inference_method=inference_method,
|
590
660
|
input_cols=self.input_cols,
|
591
|
-
expected_output_cols=
|
661
|
+
expected_output_cols=expected_output_cols,
|
592
662
|
**transform_kwargs
|
593
663
|
)
|
594
664
|
return output_df
|
@@ -618,29 +688,30 @@ class IncrementalPCA(BaseTransformer):
|
|
618
688
|
Output dataset with log probability of the sample for each class in the model.
|
619
689
|
"""
|
620
690
|
super()._check_dataset_type(dataset)
|
621
|
-
inference_method="predict_log_proba"
|
691
|
+
inference_method = "predict_log_proba"
|
692
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
622
693
|
|
623
694
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
624
695
|
# are specific to the type of dataset used.
|
625
696
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
626
697
|
|
627
698
|
if isinstance(dataset, DataFrame):
|
628
|
-
self.
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
699
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
700
|
+
self._deps = self._get_dependencies()
|
701
|
+
assert isinstance(
|
702
|
+
dataset._session, Session
|
703
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
633
704
|
transform_kwargs = dict(
|
634
705
|
session=dataset._session,
|
635
706
|
dependencies=self._deps,
|
636
|
-
drop_input_cols
|
707
|
+
drop_input_cols=self._drop_input_cols,
|
637
708
|
expected_output_cols_type="float",
|
638
709
|
)
|
710
|
+
expected_output_cols = self._align_expected_output_names(
|
711
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
712
|
+
)
|
639
713
|
elif isinstance(dataset, pd.DataFrame):
|
640
|
-
transform_kwargs = dict(
|
641
|
-
snowpark_input_cols = self._snowpark_cols,
|
642
|
-
drop_input_cols = self._drop_input_cols
|
643
|
-
)
|
714
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
644
715
|
|
645
716
|
transform_handlers = ModelTransformerBuilder.build(
|
646
717
|
dataset=dataset,
|
@@ -653,7 +724,7 @@ class IncrementalPCA(BaseTransformer):
|
|
653
724
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
654
725
|
inference_method=inference_method,
|
655
726
|
input_cols=self.input_cols,
|
656
|
-
expected_output_cols=
|
727
|
+
expected_output_cols=expected_output_cols,
|
657
728
|
**transform_kwargs
|
658
729
|
)
|
659
730
|
return output_df
|
@@ -679,30 +750,32 @@ class IncrementalPCA(BaseTransformer):
|
|
679
750
|
Output dataset with results of the decision function for the samples in input dataset.
|
680
751
|
"""
|
681
752
|
super()._check_dataset_type(dataset)
|
682
|
-
inference_method="decision_function"
|
753
|
+
inference_method = "decision_function"
|
683
754
|
|
684
755
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
685
756
|
# are specific to the type of dataset used.
|
686
757
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
687
758
|
|
759
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
760
|
+
|
688
761
|
if isinstance(dataset, DataFrame):
|
689
|
-
self.
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
762
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
763
|
+
self._deps = self._get_dependencies()
|
764
|
+
assert isinstance(
|
765
|
+
dataset._session, Session
|
766
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
694
767
|
transform_kwargs = dict(
|
695
768
|
session=dataset._session,
|
696
769
|
dependencies=self._deps,
|
697
|
-
drop_input_cols
|
770
|
+
drop_input_cols=self._drop_input_cols,
|
698
771
|
expected_output_cols_type="float",
|
699
772
|
)
|
773
|
+
expected_output_cols = self._align_expected_output_names(
|
774
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
775
|
+
)
|
700
776
|
|
701
777
|
elif isinstance(dataset, pd.DataFrame):
|
702
|
-
transform_kwargs = dict(
|
703
|
-
snowpark_input_cols = self._snowpark_cols,
|
704
|
-
drop_input_cols = self._drop_input_cols
|
705
|
-
)
|
778
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
706
779
|
|
707
780
|
transform_handlers = ModelTransformerBuilder.build(
|
708
781
|
dataset=dataset,
|
@@ -715,7 +788,7 @@ class IncrementalPCA(BaseTransformer):
|
|
715
788
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
716
789
|
inference_method=inference_method,
|
717
790
|
input_cols=self.input_cols,
|
718
|
-
expected_output_cols=
|
791
|
+
expected_output_cols=expected_output_cols,
|
719
792
|
**transform_kwargs
|
720
793
|
)
|
721
794
|
return output_df
|
@@ -744,17 +817,17 @@ class IncrementalPCA(BaseTransformer):
|
|
744
817
|
Output dataset with probability of the sample for each class in the model.
|
745
818
|
"""
|
746
819
|
super()._check_dataset_type(dataset)
|
747
|
-
inference_method="score_samples"
|
820
|
+
inference_method = "score_samples"
|
748
821
|
|
749
822
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
750
823
|
# are specific to the type of dataset used.
|
751
824
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
752
825
|
|
826
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
827
|
+
|
753
828
|
if isinstance(dataset, DataFrame):
|
754
|
-
self.
|
755
|
-
|
756
|
-
inference_method=inference_method,
|
757
|
-
)
|
829
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
830
|
+
self._deps = self._get_dependencies()
|
758
831
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
759
832
|
transform_kwargs = dict(
|
760
833
|
session=dataset._session,
|
@@ -762,6 +835,9 @@ class IncrementalPCA(BaseTransformer):
|
|
762
835
|
drop_input_cols = self._drop_input_cols,
|
763
836
|
expected_output_cols_type="float",
|
764
837
|
)
|
838
|
+
expected_output_cols = self._align_expected_output_names(
|
839
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
840
|
+
)
|
765
841
|
|
766
842
|
elif isinstance(dataset, pd.DataFrame):
|
767
843
|
transform_kwargs = dict(
|
@@ -780,7 +856,7 @@ class IncrementalPCA(BaseTransformer):
|
|
780
856
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
781
857
|
inference_method=inference_method,
|
782
858
|
input_cols=self.input_cols,
|
783
|
-
expected_output_cols=
|
859
|
+
expected_output_cols=expected_output_cols,
|
784
860
|
**transform_kwargs
|
785
861
|
)
|
786
862
|
return output_df
|
@@ -813,17 +889,15 @@ class IncrementalPCA(BaseTransformer):
|
|
813
889
|
transform_kwargs: ScoreKwargsTypedDict = dict()
|
814
890
|
|
815
891
|
if isinstance(dataset, DataFrame):
|
816
|
-
self.
|
817
|
-
|
818
|
-
inference_method="score",
|
819
|
-
)
|
892
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method="score")
|
893
|
+
self._deps = self._get_dependencies()
|
820
894
|
selected_cols = self._get_active_columns()
|
821
895
|
if len(selected_cols) > 0:
|
822
896
|
dataset = dataset.select(selected_cols)
|
823
897
|
assert isinstance(dataset._session, Session) # keep mypy happy
|
824
898
|
transform_kwargs = dict(
|
825
899
|
session=dataset._session,
|
826
|
-
dependencies=
|
900
|
+
dependencies=self._deps,
|
827
901
|
score_sproc_imports=['sklearn'],
|
828
902
|
)
|
829
903
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -888,11 +962,8 @@ class IncrementalPCA(BaseTransformer):
|
|
888
962
|
|
889
963
|
if isinstance(dataset, DataFrame):
|
890
964
|
|
891
|
-
self.
|
892
|
-
|
893
|
-
inference_method=inference_method,
|
894
|
-
|
895
|
-
)
|
965
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
966
|
+
self._deps = self._get_dependencies()
|
896
967
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
897
968
|
transform_kwargs = dict(
|
898
969
|
session = dataset._session,
|
@@ -925,50 +996,84 @@ class IncrementalPCA(BaseTransformer):
|
|
925
996
|
)
|
926
997
|
return output_df
|
927
998
|
|
999
|
+
|
1000
|
+
|
1001
|
+
def to_sklearn(self) -> Any:
|
1002
|
+
"""Get sklearn.decomposition.IncrementalPCA object.
|
1003
|
+
"""
|
1004
|
+
if self._sklearn_object is None:
|
1005
|
+
self._sklearn_object = self._create_sklearn_object()
|
1006
|
+
return self._sklearn_object
|
1007
|
+
|
1008
|
+
def to_xgboost(self) -> Any:
|
1009
|
+
raise exceptions.SnowflakeMLException(
|
1010
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1011
|
+
original_exception=AttributeError(
|
1012
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1013
|
+
"to_xgboost()",
|
1014
|
+
"to_sklearn()"
|
1015
|
+
)
|
1016
|
+
),
|
1017
|
+
)
|
928
1018
|
|
929
|
-
def
|
1019
|
+
def to_lightgbm(self) -> Any:
|
1020
|
+
raise exceptions.SnowflakeMLException(
|
1021
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1022
|
+
original_exception=AttributeError(
|
1023
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1024
|
+
"to_lightgbm()",
|
1025
|
+
"to_sklearn()"
|
1026
|
+
)
|
1027
|
+
),
|
1028
|
+
)
|
1029
|
+
|
1030
|
+
def _get_dependencies(self) -> List[str]:
|
1031
|
+
return self._deps
|
1032
|
+
|
1033
|
+
|
1034
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
930
1035
|
self._model_signature_dict = dict()
|
931
1036
|
|
932
1037
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
933
1038
|
|
934
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1039
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
935
1040
|
outputs: List[BaseFeatureSpec] = []
|
936
1041
|
if hasattr(self, "predict"):
|
937
1042
|
# keep mypy happy
|
938
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1043
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
939
1044
|
# For classifier, the type of predict is the same as the type of label
|
940
|
-
if self._sklearn_object._estimator_type ==
|
941
|
-
|
1045
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1046
|
+
# label columns is the desired type for output
|
942
1047
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
943
1048
|
# rename the output columns
|
944
1049
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
945
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
946
|
-
|
947
|
-
|
1050
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1051
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1052
|
+
)
|
948
1053
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
949
1054
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
950
|
-
# Clusterer returns int64 cluster labels.
|
1055
|
+
# Clusterer returns int64 cluster labels.
|
951
1056
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
952
1057
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
953
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
954
|
-
|
955
|
-
|
956
|
-
|
1058
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1059
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1060
|
+
)
|
1061
|
+
|
957
1062
|
# For regressor, the type of predict is float64
|
958
|
-
elif self._sklearn_object._estimator_type ==
|
1063
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
959
1064
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
960
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
961
|
-
|
962
|
-
|
963
|
-
|
1065
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1066
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1067
|
+
)
|
1068
|
+
|
964
1069
|
for prob_func in PROB_FUNCTIONS:
|
965
1070
|
if hasattr(self, prob_func):
|
966
1071
|
output_cols_prefix: str = f"{prob_func}_"
|
967
1072
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
968
1073
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
969
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
970
|
-
|
971
|
-
|
1074
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1075
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1076
|
+
)
|
972
1077
|
|
973
1078
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
974
1079
|
items = list(self._model_signature_dict.items())
|
@@ -981,10 +1086,10 @@ class IncrementalPCA(BaseTransformer):
|
|
981
1086
|
"""Returns model signature of current class.
|
982
1087
|
|
983
1088
|
Raises:
|
984
|
-
|
1089
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
985
1090
|
|
986
1091
|
Returns:
|
987
|
-
Dict
|
1092
|
+
Dict with each method and its input output signature
|
988
1093
|
"""
|
989
1094
|
if self._model_signature_dict is None:
|
990
1095
|
raise exceptions.SnowflakeMLException(
|
@@ -992,35 +1097,3 @@ class IncrementalPCA(BaseTransformer):
|
|
992
1097
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
993
1098
|
)
|
994
1099
|
return self._model_signature_dict
|
995
|
-
|
996
|
-
def to_sklearn(self) -> Any:
|
997
|
-
"""Get sklearn.decomposition.IncrementalPCA object.
|
998
|
-
"""
|
999
|
-
if self._sklearn_object is None:
|
1000
|
-
self._sklearn_object = self._create_sklearn_object()
|
1001
|
-
return self._sklearn_object
|
1002
|
-
|
1003
|
-
def to_xgboost(self) -> Any:
|
1004
|
-
raise exceptions.SnowflakeMLException(
|
1005
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1006
|
-
original_exception=AttributeError(
|
1007
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1008
|
-
"to_xgboost()",
|
1009
|
-
"to_sklearn()"
|
1010
|
-
)
|
1011
|
-
),
|
1012
|
-
)
|
1013
|
-
|
1014
|
-
def to_lightgbm(self) -> Any:
|
1015
|
-
raise exceptions.SnowflakeMLException(
|
1016
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1017
|
-
original_exception=AttributeError(
|
1018
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1019
|
-
"to_lightgbm()",
|
1020
|
-
"to_sklearn()"
|
1021
|
-
)
|
1022
|
-
),
|
1023
|
-
)
|
1024
|
-
|
1025
|
-
def _get_dependencies(self) -> List[str]:
|
1026
|
-
return self._deps
|