snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +77 -32
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/dataset/__init__.py +10 -0
- snowflake/ml/dataset/dataset.py +454 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +202 -0
- snowflake/ml/feature_store/feature_store.py +531 -332
- snowflake/ml/feature_store/feature_view.py +40 -23
- snowflake/ml/fileset/embedded_stage_fs.py +146 -0
- snowflake/ml/fileset/sfcfs.py +56 -54
- snowflake/ml/fileset/snowfs.py +159 -0
- snowflake/ml/fileset/stage_fs.py +49 -17
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +27 -0
- snowflake/ml/model/_client/model/model_version_impl.py +137 -50
- snowflake/ml/model/_client/ops/model_ops.py +159 -40
- snowflake/ml/model/_client/sql/model.py +25 -2
- snowflake/ml/model/_client/sql/model_version.py +131 -2
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -5
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
- snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
- snowflake/ml/modeling/cluster/birch.py +248 -175
- snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
- snowflake/ml/modeling/cluster/dbscan.py +246 -175
- snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
- snowflake/ml/modeling/cluster/k_means.py +248 -175
- snowflake/ml/modeling/cluster/mean_shift.py +246 -175
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
- snowflake/ml/modeling/cluster/optics.py +246 -175
- snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
- snowflake/ml/modeling/compose/column_transformer.py +248 -175
- snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
- snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
- snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
- snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
- snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
- snowflake/ml/modeling/covariance/oas.py +246 -175
- snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
- snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
- snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
- snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
- snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/pca.py +248 -175
- snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
- snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
- snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
- snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
- snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
- snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +72 -37
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
- snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
- snowflake/ml/modeling/impute/knn_imputer.py +248 -175
- snowflake/ml/modeling/impute/missing_indicator.py +248 -175
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/lars.py +246 -175
- snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
- snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/perceptron.py +246 -175
- snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ridge.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
- snowflake/ml/modeling/manifold/isomap.py +248 -175
- snowflake/ml/modeling/manifold/mds.py +248 -175
- snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
- snowflake/ml/modeling/manifold/tsne.py +248 -175
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
- snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
- snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
- snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
- snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
- snowflake/ml/modeling/pipeline/pipeline.py +517 -35
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
- snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
- snowflake/ml/modeling/svm/linear_svc.py +246 -175
- snowflake/ml/modeling/svm/linear_svr.py +246 -175
- snowflake/ml/modeling/svm/nu_svc.py +246 -175
- snowflake/ml/modeling/svm/nu_svr.py +246 -175
- snowflake/ml/modeling/svm/svc.py +246 -175
- snowflake/ml/modeling/svm/svr.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
- snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -61,12 +60,6 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.kernel_approximation".re
|
|
61
60
|
|
62
61
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
63
62
|
|
64
|
-
def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
|
65
|
-
def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
|
66
|
-
return False and callable(getattr(self._sklearn_object, "fit_transform", None))
|
67
|
-
return check
|
68
|
-
|
69
|
-
|
70
63
|
class PolynomialCountSketch(BaseTransformer):
|
71
64
|
r"""Polynomial kernel approximation via Tensor Sketch
|
72
65
|
For more details on this class, see [sklearn.kernel_approximation.PolynomialCountSketch]
|
@@ -223,12 +216,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
223
216
|
)
|
224
217
|
return selected_cols
|
225
218
|
|
226
|
-
|
227
|
-
project=_PROJECT,
|
228
|
-
subproject=_SUBPROJECT,
|
229
|
-
custom_tags=dict([("autogen", True)]),
|
230
|
-
)
|
231
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "PolynomialCountSketch":
|
219
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "PolynomialCountSketch":
|
232
220
|
"""Fit the model with X
|
233
221
|
For more details on this function, see [sklearn.kernel_approximation.PolynomialCountSketch.fit]
|
234
222
|
(https://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.PolynomialCountSketch.html#sklearn.kernel_approximation.PolynomialCountSketch.fit)
|
@@ -255,12 +243,14 @@ class PolynomialCountSketch(BaseTransformer):
|
|
255
243
|
|
256
244
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
257
245
|
|
258
|
-
|
246
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
259
247
|
if SNOWML_SPROC_ENV in os.environ:
|
260
248
|
statement_params = telemetry.get_function_usage_statement_params(
|
261
249
|
project=_PROJECT,
|
262
250
|
subproject=_SUBPROJECT,
|
263
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
251
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
252
|
+
inspect.currentframe(), PolynomialCountSketch.__class__.__name__
|
253
|
+
),
|
264
254
|
api_calls=[Session.call],
|
265
255
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
266
256
|
)
|
@@ -281,27 +271,24 @@ class PolynomialCountSketch(BaseTransformer):
|
|
281
271
|
)
|
282
272
|
self._sklearn_object = model_trainer.train()
|
283
273
|
self._is_fitted = True
|
284
|
-
self.
|
274
|
+
self._generate_model_signatures(dataset)
|
285
275
|
return self
|
286
276
|
|
287
277
|
def _batch_inference_validate_snowpark(
|
288
278
|
self,
|
289
279
|
dataset: DataFrame,
|
290
280
|
inference_method: str,
|
291
|
-
) ->
|
292
|
-
"""Util method to run validate that batch inference can be run on a snowpark dataframe
|
293
|
-
return the available package that exists in the snowflake anaconda channel
|
281
|
+
) -> None:
|
282
|
+
"""Util method to run validate that batch inference can be run on a snowpark dataframe.
|
294
283
|
|
295
284
|
Args:
|
296
285
|
dataset: snowpark dataframe
|
297
286
|
inference_method: the inference method such as predict, score...
|
298
|
-
|
287
|
+
|
299
288
|
Raises:
|
300
289
|
SnowflakeMLException: If the estimator is not fitted, raise error
|
301
290
|
SnowflakeMLException: If the session is None, raise error
|
302
291
|
|
303
|
-
Returns:
|
304
|
-
A list of available package that exists in the snowflake anaconda channel
|
305
292
|
"""
|
306
293
|
if not self._is_fitted:
|
307
294
|
raise exceptions.SnowflakeMLException(
|
@@ -319,9 +306,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
319
306
|
"Session must not specified for snowpark dataset."
|
320
307
|
),
|
321
308
|
)
|
322
|
-
|
323
|
-
return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
|
324
|
-
pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
|
309
|
+
|
325
310
|
|
326
311
|
@available_if(original_estimator_has_callable("predict")) # type: ignore[misc]
|
327
312
|
@telemetry.send_api_usage_telemetry(
|
@@ -355,7 +340,9 @@ class PolynomialCountSketch(BaseTransformer):
|
|
355
340
|
# when it is classifier, infer the datatype from label columns
|
356
341
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
357
342
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
358
|
-
label_cols_signatures = [
|
343
|
+
label_cols_signatures = [
|
344
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
345
|
+
]
|
359
346
|
if len(label_cols_signatures) == 0:
|
360
347
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
361
348
|
raise exceptions.SnowflakeMLException(
|
@@ -363,25 +350,23 @@ class PolynomialCountSketch(BaseTransformer):
|
|
363
350
|
original_exception=ValueError(error_str),
|
364
351
|
)
|
365
352
|
|
366
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
367
|
-
label_cols_signatures[0].as_snowpark_type()
|
368
|
-
)
|
353
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
369
354
|
|
370
|
-
self.
|
371
|
-
|
355
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
356
|
+
self._deps = self._get_dependencies()
|
357
|
+
assert isinstance(
|
358
|
+
dataset._session, Session
|
359
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
372
360
|
|
373
361
|
transform_kwargs = dict(
|
374
|
-
session
|
375
|
-
dependencies
|
376
|
-
drop_input_cols
|
377
|
-
expected_output_cols_type
|
362
|
+
session=dataset._session,
|
363
|
+
dependencies=self._deps,
|
364
|
+
drop_input_cols=self._drop_input_cols,
|
365
|
+
expected_output_cols_type=expected_type_inferred,
|
378
366
|
)
|
379
367
|
|
380
368
|
elif isinstance(dataset, pd.DataFrame):
|
381
|
-
transform_kwargs = dict(
|
382
|
-
snowpark_input_cols = self._snowpark_cols,
|
383
|
-
drop_input_cols = self._drop_input_cols
|
384
|
-
)
|
369
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
385
370
|
|
386
371
|
transform_handlers = ModelTransformerBuilder.build(
|
387
372
|
dataset=dataset,
|
@@ -423,7 +408,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
423
408
|
Transformed dataset.
|
424
409
|
"""
|
425
410
|
super()._check_dataset_type(dataset)
|
426
|
-
inference_method="transform"
|
411
|
+
inference_method = "transform"
|
427
412
|
|
428
413
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
429
414
|
# are specific to the type of dataset used.
|
@@ -453,24 +438,19 @@ class PolynomialCountSketch(BaseTransformer):
|
|
453
438
|
if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
|
454
439
|
expected_dtype = convert_sp_to_sf_type(output_types[0])
|
455
440
|
|
456
|
-
self.
|
457
|
-
|
458
|
-
inference_method=inference_method,
|
459
|
-
)
|
441
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
442
|
+
self._deps = self._get_dependencies()
|
460
443
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
461
444
|
|
462
445
|
transform_kwargs = dict(
|
463
|
-
session
|
464
|
-
dependencies
|
465
|
-
drop_input_cols
|
466
|
-
expected_output_cols_type
|
446
|
+
session=dataset._session,
|
447
|
+
dependencies=self._deps,
|
448
|
+
drop_input_cols=self._drop_input_cols,
|
449
|
+
expected_output_cols_type=expected_dtype,
|
467
450
|
)
|
468
451
|
|
469
452
|
elif isinstance(dataset, pd.DataFrame):
|
470
|
-
transform_kwargs = dict(
|
471
|
-
snowpark_input_cols = self._snowpark_cols,
|
472
|
-
drop_input_cols = self._drop_input_cols
|
473
|
-
)
|
453
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
474
454
|
|
475
455
|
transform_handlers = ModelTransformerBuilder.build(
|
476
456
|
dataset=dataset,
|
@@ -489,7 +469,11 @@ class PolynomialCountSketch(BaseTransformer):
|
|
489
469
|
return output_df
|
490
470
|
|
491
471
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
492
|
-
def fit_predict(
|
472
|
+
def fit_predict(
|
473
|
+
self,
|
474
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
475
|
+
output_cols_prefix: str = "fit_predict_",
|
476
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
493
477
|
""" Method not supported for this class.
|
494
478
|
|
495
479
|
|
@@ -514,22 +498,106 @@ class PolynomialCountSketch(BaseTransformer):
|
|
514
498
|
)
|
515
499
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
516
500
|
drop_input_cols=self._drop_input_cols,
|
517
|
-
expected_output_cols_list=
|
501
|
+
expected_output_cols_list=(
|
502
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
503
|
+
),
|
518
504
|
)
|
519
505
|
self._sklearn_object = fitted_estimator
|
520
506
|
self._is_fitted = True
|
521
507
|
return output_result
|
522
508
|
|
509
|
+
|
510
|
+
@available_if(original_estimator_has_callable("fit_transform")) # type: ignore[misc]
|
511
|
+
def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_transform_",) -> Union[DataFrame, pd.DataFrame]:
|
512
|
+
""" Fit to data, then transform it
|
513
|
+
For more details on this function, see [sklearn.kernel_approximation.PolynomialCountSketch.fit_transform]
|
514
|
+
(https://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.PolynomialCountSketch.html#sklearn.kernel_approximation.PolynomialCountSketch.fit_transform)
|
515
|
+
|
516
|
+
|
517
|
+
Raises:
|
518
|
+
TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame.
|
523
519
|
|
524
|
-
|
525
|
-
|
526
|
-
|
520
|
+
Args:
|
521
|
+
dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
|
522
|
+
Snowpark or Pandas DataFrame.
|
523
|
+
output_cols_prefix: Prefix for the response columns
|
527
524
|
Returns:
|
528
525
|
Transformed dataset.
|
529
526
|
"""
|
530
|
-
self.
|
531
|
-
|
532
|
-
|
527
|
+
self._infer_input_output_cols(dataset)
|
528
|
+
super()._check_dataset_type(dataset)
|
529
|
+
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
530
|
+
estimator=self._sklearn_object,
|
531
|
+
dataset=dataset,
|
532
|
+
input_cols=self.input_cols,
|
533
|
+
label_cols=self.label_cols,
|
534
|
+
sample_weight_col=self.sample_weight_col,
|
535
|
+
autogenerated=self._autogenerated,
|
536
|
+
subproject=_SUBPROJECT,
|
537
|
+
)
|
538
|
+
output_result, fitted_estimator = model_trainer.train_fit_transform(
|
539
|
+
drop_input_cols=self._drop_input_cols,
|
540
|
+
expected_output_cols_list=self.output_cols,
|
541
|
+
)
|
542
|
+
self._sklearn_object = fitted_estimator
|
543
|
+
self._is_fitted = True
|
544
|
+
return output_result
|
545
|
+
|
546
|
+
|
547
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
548
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
549
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
550
|
+
"""
|
551
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
552
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
553
|
+
if output_cols:
|
554
|
+
output_cols = [
|
555
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
556
|
+
for c in output_cols
|
557
|
+
]
|
558
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
559
|
+
output_cols = [output_cols_prefix]
|
560
|
+
elif self._sklearn_object is not None:
|
561
|
+
classes = self._sklearn_object.classes_
|
562
|
+
if isinstance(classes, numpy.ndarray):
|
563
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
564
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
565
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
566
|
+
output_cols = []
|
567
|
+
for i, cl in enumerate(classes):
|
568
|
+
# For binary classification, there is only one output column for each class
|
569
|
+
# ndarray as the two classes are complementary.
|
570
|
+
if len(cl) == 2:
|
571
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
572
|
+
else:
|
573
|
+
output_cols.extend([
|
574
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
575
|
+
])
|
576
|
+
else:
|
577
|
+
output_cols = []
|
578
|
+
|
579
|
+
# Make sure column names are valid snowflake identifiers.
|
580
|
+
assert output_cols is not None # Make MyPy happy
|
581
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
582
|
+
|
583
|
+
return rv
|
584
|
+
|
585
|
+
def _align_expected_output_names(
|
586
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
587
|
+
) -> List[str]:
|
588
|
+
# in case the inferred output column names dimension is different
|
589
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
590
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
591
|
+
output_df_columns = list(output_df_pd.columns)
|
592
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
593
|
+
if self.sample_weight_col:
|
594
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
595
|
+
# if the dimension of inferred output column names is correct; use it
|
596
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
597
|
+
return expected_output_cols_list
|
598
|
+
# otherwise, use the sklearn estimator's output
|
599
|
+
else:
|
600
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
533
601
|
|
534
602
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
535
603
|
@telemetry.send_api_usage_telemetry(
|
@@ -561,24 +629,26 @@ class PolynomialCountSketch(BaseTransformer):
|
|
561
629
|
# are specific to the type of dataset used.
|
562
630
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
563
631
|
|
632
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
633
|
+
|
564
634
|
if isinstance(dataset, DataFrame):
|
565
|
-
self.
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
635
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
636
|
+
self._deps = self._get_dependencies()
|
637
|
+
assert isinstance(
|
638
|
+
dataset._session, Session
|
639
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
570
640
|
transform_kwargs = dict(
|
571
641
|
session=dataset._session,
|
572
642
|
dependencies=self._deps,
|
573
|
-
drop_input_cols
|
643
|
+
drop_input_cols=self._drop_input_cols,
|
574
644
|
expected_output_cols_type="float",
|
575
645
|
)
|
646
|
+
expected_output_cols = self._align_expected_output_names(
|
647
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
648
|
+
)
|
576
649
|
|
577
650
|
elif isinstance(dataset, pd.DataFrame):
|
578
|
-
transform_kwargs = dict(
|
579
|
-
snowpark_input_cols = self._snowpark_cols,
|
580
|
-
drop_input_cols = self._drop_input_cols
|
581
|
-
)
|
651
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
582
652
|
|
583
653
|
transform_handlers = ModelTransformerBuilder.build(
|
584
654
|
dataset=dataset,
|
@@ -590,7 +660,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
590
660
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
591
661
|
inference_method=inference_method,
|
592
662
|
input_cols=self.input_cols,
|
593
|
-
expected_output_cols=
|
663
|
+
expected_output_cols=expected_output_cols,
|
594
664
|
**transform_kwargs
|
595
665
|
)
|
596
666
|
return output_df
|
@@ -620,29 +690,30 @@ class PolynomialCountSketch(BaseTransformer):
|
|
620
690
|
Output dataset with log probability of the sample for each class in the model.
|
621
691
|
"""
|
622
692
|
super()._check_dataset_type(dataset)
|
623
|
-
inference_method="predict_log_proba"
|
693
|
+
inference_method = "predict_log_proba"
|
694
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
624
695
|
|
625
696
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
626
697
|
# are specific to the type of dataset used.
|
627
698
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
628
699
|
|
629
700
|
if isinstance(dataset, DataFrame):
|
630
|
-
self.
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
701
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
702
|
+
self._deps = self._get_dependencies()
|
703
|
+
assert isinstance(
|
704
|
+
dataset._session, Session
|
705
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
635
706
|
transform_kwargs = dict(
|
636
707
|
session=dataset._session,
|
637
708
|
dependencies=self._deps,
|
638
|
-
drop_input_cols
|
709
|
+
drop_input_cols=self._drop_input_cols,
|
639
710
|
expected_output_cols_type="float",
|
640
711
|
)
|
712
|
+
expected_output_cols = self._align_expected_output_names(
|
713
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
714
|
+
)
|
641
715
|
elif isinstance(dataset, pd.DataFrame):
|
642
|
-
transform_kwargs = dict(
|
643
|
-
snowpark_input_cols = self._snowpark_cols,
|
644
|
-
drop_input_cols = self._drop_input_cols
|
645
|
-
)
|
716
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
646
717
|
|
647
718
|
transform_handlers = ModelTransformerBuilder.build(
|
648
719
|
dataset=dataset,
|
@@ -655,7 +726,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
655
726
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
656
727
|
inference_method=inference_method,
|
657
728
|
input_cols=self.input_cols,
|
658
|
-
expected_output_cols=
|
729
|
+
expected_output_cols=expected_output_cols,
|
659
730
|
**transform_kwargs
|
660
731
|
)
|
661
732
|
return output_df
|
@@ -681,30 +752,32 @@ class PolynomialCountSketch(BaseTransformer):
|
|
681
752
|
Output dataset with results of the decision function for the samples in input dataset.
|
682
753
|
"""
|
683
754
|
super()._check_dataset_type(dataset)
|
684
|
-
inference_method="decision_function"
|
755
|
+
inference_method = "decision_function"
|
685
756
|
|
686
757
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
687
758
|
# are specific to the type of dataset used.
|
688
759
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
689
760
|
|
761
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
762
|
+
|
690
763
|
if isinstance(dataset, DataFrame):
|
691
|
-
self.
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
764
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
765
|
+
self._deps = self._get_dependencies()
|
766
|
+
assert isinstance(
|
767
|
+
dataset._session, Session
|
768
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
696
769
|
transform_kwargs = dict(
|
697
770
|
session=dataset._session,
|
698
771
|
dependencies=self._deps,
|
699
|
-
drop_input_cols
|
772
|
+
drop_input_cols=self._drop_input_cols,
|
700
773
|
expected_output_cols_type="float",
|
701
774
|
)
|
775
|
+
expected_output_cols = self._align_expected_output_names(
|
776
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
777
|
+
)
|
702
778
|
|
703
779
|
elif isinstance(dataset, pd.DataFrame):
|
704
|
-
transform_kwargs = dict(
|
705
|
-
snowpark_input_cols = self._snowpark_cols,
|
706
|
-
drop_input_cols = self._drop_input_cols
|
707
|
-
)
|
780
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
708
781
|
|
709
782
|
transform_handlers = ModelTransformerBuilder.build(
|
710
783
|
dataset=dataset,
|
@@ -717,7 +790,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
717
790
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
718
791
|
inference_method=inference_method,
|
719
792
|
input_cols=self.input_cols,
|
720
|
-
expected_output_cols=
|
793
|
+
expected_output_cols=expected_output_cols,
|
721
794
|
**transform_kwargs
|
722
795
|
)
|
723
796
|
return output_df
|
@@ -746,17 +819,17 @@ class PolynomialCountSketch(BaseTransformer):
|
|
746
819
|
Output dataset with probability of the sample for each class in the model.
|
747
820
|
"""
|
748
821
|
super()._check_dataset_type(dataset)
|
749
|
-
inference_method="score_samples"
|
822
|
+
inference_method = "score_samples"
|
750
823
|
|
751
824
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
752
825
|
# are specific to the type of dataset used.
|
753
826
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
754
827
|
|
828
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
829
|
+
|
755
830
|
if isinstance(dataset, DataFrame):
|
756
|
-
self.
|
757
|
-
|
758
|
-
inference_method=inference_method,
|
759
|
-
)
|
831
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
832
|
+
self._deps = self._get_dependencies()
|
760
833
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
761
834
|
transform_kwargs = dict(
|
762
835
|
session=dataset._session,
|
@@ -764,6 +837,9 @@ class PolynomialCountSketch(BaseTransformer):
|
|
764
837
|
drop_input_cols = self._drop_input_cols,
|
765
838
|
expected_output_cols_type="float",
|
766
839
|
)
|
840
|
+
expected_output_cols = self._align_expected_output_names(
|
841
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
842
|
+
)
|
767
843
|
|
768
844
|
elif isinstance(dataset, pd.DataFrame):
|
769
845
|
transform_kwargs = dict(
|
@@ -782,7 +858,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
782
858
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
783
859
|
inference_method=inference_method,
|
784
860
|
input_cols=self.input_cols,
|
785
|
-
expected_output_cols=
|
861
|
+
expected_output_cols=expected_output_cols,
|
786
862
|
**transform_kwargs
|
787
863
|
)
|
788
864
|
return output_df
|
@@ -815,17 +891,15 @@ class PolynomialCountSketch(BaseTransformer):
|
|
815
891
|
transform_kwargs: ScoreKwargsTypedDict = dict()
|
816
892
|
|
817
893
|
if isinstance(dataset, DataFrame):
|
818
|
-
self.
|
819
|
-
|
820
|
-
inference_method="score",
|
821
|
-
)
|
894
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method="score")
|
895
|
+
self._deps = self._get_dependencies()
|
822
896
|
selected_cols = self._get_active_columns()
|
823
897
|
if len(selected_cols) > 0:
|
824
898
|
dataset = dataset.select(selected_cols)
|
825
899
|
assert isinstance(dataset._session, Session) # keep mypy happy
|
826
900
|
transform_kwargs = dict(
|
827
901
|
session=dataset._session,
|
828
|
-
dependencies=
|
902
|
+
dependencies=self._deps,
|
829
903
|
score_sproc_imports=['sklearn'],
|
830
904
|
)
|
831
905
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -890,11 +964,8 @@ class PolynomialCountSketch(BaseTransformer):
|
|
890
964
|
|
891
965
|
if isinstance(dataset, DataFrame):
|
892
966
|
|
893
|
-
self.
|
894
|
-
|
895
|
-
inference_method=inference_method,
|
896
|
-
|
897
|
-
)
|
967
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
968
|
+
self._deps = self._get_dependencies()
|
898
969
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
899
970
|
transform_kwargs = dict(
|
900
971
|
session = dataset._session,
|
@@ -927,50 +998,84 @@ class PolynomialCountSketch(BaseTransformer):
|
|
927
998
|
)
|
928
999
|
return output_df
|
929
1000
|
|
1001
|
+
|
1002
|
+
|
1003
|
+
def to_sklearn(self) -> Any:
|
1004
|
+
"""Get sklearn.kernel_approximation.PolynomialCountSketch object.
|
1005
|
+
"""
|
1006
|
+
if self._sklearn_object is None:
|
1007
|
+
self._sklearn_object = self._create_sklearn_object()
|
1008
|
+
return self._sklearn_object
|
1009
|
+
|
1010
|
+
def to_xgboost(self) -> Any:
|
1011
|
+
raise exceptions.SnowflakeMLException(
|
1012
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1013
|
+
original_exception=AttributeError(
|
1014
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1015
|
+
"to_xgboost()",
|
1016
|
+
"to_sklearn()"
|
1017
|
+
)
|
1018
|
+
),
|
1019
|
+
)
|
930
1020
|
|
931
|
-
def
|
1021
|
+
def to_lightgbm(self) -> Any:
|
1022
|
+
raise exceptions.SnowflakeMLException(
|
1023
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1024
|
+
original_exception=AttributeError(
|
1025
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1026
|
+
"to_lightgbm()",
|
1027
|
+
"to_sklearn()"
|
1028
|
+
)
|
1029
|
+
),
|
1030
|
+
)
|
1031
|
+
|
1032
|
+
def _get_dependencies(self) -> List[str]:
|
1033
|
+
return self._deps
|
1034
|
+
|
1035
|
+
|
1036
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
932
1037
|
self._model_signature_dict = dict()
|
933
1038
|
|
934
1039
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
935
1040
|
|
936
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1041
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
937
1042
|
outputs: List[BaseFeatureSpec] = []
|
938
1043
|
if hasattr(self, "predict"):
|
939
1044
|
# keep mypy happy
|
940
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1045
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
941
1046
|
# For classifier, the type of predict is the same as the type of label
|
942
|
-
if self._sklearn_object._estimator_type ==
|
943
|
-
|
1047
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1048
|
+
# label columns is the desired type for output
|
944
1049
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
945
1050
|
# rename the output columns
|
946
1051
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
947
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
948
|
-
|
949
|
-
|
1052
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1053
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1054
|
+
)
|
950
1055
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
951
1056
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
952
|
-
# Clusterer returns int64 cluster labels.
|
1057
|
+
# Clusterer returns int64 cluster labels.
|
953
1058
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
954
1059
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
955
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
956
|
-
|
957
|
-
|
958
|
-
|
1060
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1061
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1062
|
+
)
|
1063
|
+
|
959
1064
|
# For regressor, the type of predict is float64
|
960
|
-
elif self._sklearn_object._estimator_type ==
|
1065
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
961
1066
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
962
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
963
|
-
|
964
|
-
|
965
|
-
|
1067
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1068
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1069
|
+
)
|
1070
|
+
|
966
1071
|
for prob_func in PROB_FUNCTIONS:
|
967
1072
|
if hasattr(self, prob_func):
|
968
1073
|
output_cols_prefix: str = f"{prob_func}_"
|
969
1074
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
970
1075
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
971
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
972
|
-
|
973
|
-
|
1076
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1077
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1078
|
+
)
|
974
1079
|
|
975
1080
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
976
1081
|
items = list(self._model_signature_dict.items())
|
@@ -983,10 +1088,10 @@ class PolynomialCountSketch(BaseTransformer):
|
|
983
1088
|
"""Returns model signature of current class.
|
984
1089
|
|
985
1090
|
Raises:
|
986
|
-
|
1091
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
987
1092
|
|
988
1093
|
Returns:
|
989
|
-
Dict
|
1094
|
+
Dict with each method and its input output signature
|
990
1095
|
"""
|
991
1096
|
if self._model_signature_dict is None:
|
992
1097
|
raise exceptions.SnowflakeMLException(
|
@@ -994,35 +1099,3 @@ class PolynomialCountSketch(BaseTransformer):
|
|
994
1099
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
995
1100
|
)
|
996
1101
|
return self._model_signature_dict
|
997
|
-
|
998
|
-
def to_sklearn(self) -> Any:
|
999
|
-
"""Get sklearn.kernel_approximation.PolynomialCountSketch object.
|
1000
|
-
"""
|
1001
|
-
if self._sklearn_object is None:
|
1002
|
-
self._sklearn_object = self._create_sklearn_object()
|
1003
|
-
return self._sklearn_object
|
1004
|
-
|
1005
|
-
def to_xgboost(self) -> Any:
|
1006
|
-
raise exceptions.SnowflakeMLException(
|
1007
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1008
|
-
original_exception=AttributeError(
|
1009
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1010
|
-
"to_xgboost()",
|
1011
|
-
"to_sklearn()"
|
1012
|
-
)
|
1013
|
-
),
|
1014
|
-
)
|
1015
|
-
|
1016
|
-
def to_lightgbm(self) -> Any:
|
1017
|
-
raise exceptions.SnowflakeMLException(
|
1018
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1019
|
-
original_exception=AttributeError(
|
1020
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1021
|
-
"to_lightgbm()",
|
1022
|
-
"to_sklearn()"
|
1023
|
-
)
|
1024
|
-
),
|
1025
|
-
)
|
1026
|
-
|
1027
|
-
def _get_dependencies(self) -> List[str]:
|
1028
|
-
return self._deps
|