snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +77 -32
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/dataset/__init__.py +10 -0
- snowflake/ml/dataset/dataset.py +454 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +202 -0
- snowflake/ml/feature_store/feature_store.py +531 -332
- snowflake/ml/feature_store/feature_view.py +40 -23
- snowflake/ml/fileset/embedded_stage_fs.py +146 -0
- snowflake/ml/fileset/sfcfs.py +56 -54
- snowflake/ml/fileset/snowfs.py +159 -0
- snowflake/ml/fileset/stage_fs.py +49 -17
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +27 -0
- snowflake/ml/model/_client/model/model_version_impl.py +137 -50
- snowflake/ml/model/_client/ops/model_ops.py +159 -40
- snowflake/ml/model/_client/sql/model.py +25 -2
- snowflake/ml/model/_client/sql/model_version.py +131 -2
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -5
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
- snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
- snowflake/ml/modeling/cluster/birch.py +248 -175
- snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
- snowflake/ml/modeling/cluster/dbscan.py +246 -175
- snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
- snowflake/ml/modeling/cluster/k_means.py +248 -175
- snowflake/ml/modeling/cluster/mean_shift.py +246 -175
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
- snowflake/ml/modeling/cluster/optics.py +246 -175
- snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
- snowflake/ml/modeling/compose/column_transformer.py +248 -175
- snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
- snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
- snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
- snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
- snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
- snowflake/ml/modeling/covariance/oas.py +246 -175
- snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
- snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
- snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
- snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
- snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/pca.py +248 -175
- snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
- snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
- snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
- snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
- snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
- snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +72 -37
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
- snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
- snowflake/ml/modeling/impute/knn_imputer.py +248 -175
- snowflake/ml/modeling/impute/missing_indicator.py +248 -175
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/lars.py +246 -175
- snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
- snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/perceptron.py +246 -175
- snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ridge.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
- snowflake/ml/modeling/manifold/isomap.py +248 -175
- snowflake/ml/modeling/manifold/mds.py +248 -175
- snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
- snowflake/ml/modeling/manifold/tsne.py +248 -175
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
- snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
- snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
- snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
- snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
- snowflake/ml/modeling/pipeline/pipeline.py +517 -35
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
- snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
- snowflake/ml/modeling/svm/linear_svc.py +246 -175
- snowflake/ml/modeling/svm/linear_svr.py +246 -175
- snowflake/ml/modeling/svm/nu_svc.py +246 -175
- snowflake/ml/modeling/svm/nu_svr.py +246 -175
- snowflake/ml/modeling/svm/svc.py +246 -175
- snowflake/ml/modeling/svm/svr.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
- snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -61,12 +60,6 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.multiclass".replace("skl
|
|
61
60
|
|
62
61
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
63
62
|
|
64
|
-
def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
|
65
|
-
def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
|
66
|
-
return False and callable(getattr(self._sklearn_object, "fit_transform", None))
|
67
|
-
return check
|
68
|
-
|
69
|
-
|
70
63
|
class OutputCodeClassifier(BaseTransformer):
|
71
64
|
r"""(Error-Correcting) Output-Code multiclass strategy
|
72
65
|
For more details on this class, see [sklearn.multiclass.OutputCodeClassifier]
|
@@ -222,12 +215,7 @@ class OutputCodeClassifier(BaseTransformer):
|
|
222
215
|
)
|
223
216
|
return selected_cols
|
224
217
|
|
225
|
-
|
226
|
-
project=_PROJECT,
|
227
|
-
subproject=_SUBPROJECT,
|
228
|
-
custom_tags=dict([("autogen", True)]),
|
229
|
-
)
|
230
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "OutputCodeClassifier":
|
218
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "OutputCodeClassifier":
|
231
219
|
"""Fit underlying estimators
|
232
220
|
For more details on this function, see [sklearn.multiclass.OutputCodeClassifier.fit]
|
233
221
|
(https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OutputCodeClassifier.html#sklearn.multiclass.OutputCodeClassifier.fit)
|
@@ -254,12 +242,14 @@ class OutputCodeClassifier(BaseTransformer):
|
|
254
242
|
|
255
243
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
256
244
|
|
257
|
-
|
245
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
258
246
|
if SNOWML_SPROC_ENV in os.environ:
|
259
247
|
statement_params = telemetry.get_function_usage_statement_params(
|
260
248
|
project=_PROJECT,
|
261
249
|
subproject=_SUBPROJECT,
|
262
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
250
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
251
|
+
inspect.currentframe(), OutputCodeClassifier.__class__.__name__
|
252
|
+
),
|
263
253
|
api_calls=[Session.call],
|
264
254
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
265
255
|
)
|
@@ -280,27 +270,24 @@ class OutputCodeClassifier(BaseTransformer):
|
|
280
270
|
)
|
281
271
|
self._sklearn_object = model_trainer.train()
|
282
272
|
self._is_fitted = True
|
283
|
-
self.
|
273
|
+
self._generate_model_signatures(dataset)
|
284
274
|
return self
|
285
275
|
|
286
276
|
def _batch_inference_validate_snowpark(
|
287
277
|
self,
|
288
278
|
dataset: DataFrame,
|
289
279
|
inference_method: str,
|
290
|
-
) ->
|
291
|
-
"""Util method to run validate that batch inference can be run on a snowpark dataframe
|
292
|
-
return the available package that exists in the snowflake anaconda channel
|
280
|
+
) -> None:
|
281
|
+
"""Util method to run validate that batch inference can be run on a snowpark dataframe.
|
293
282
|
|
294
283
|
Args:
|
295
284
|
dataset: snowpark dataframe
|
296
285
|
inference_method: the inference method such as predict, score...
|
297
|
-
|
286
|
+
|
298
287
|
Raises:
|
299
288
|
SnowflakeMLException: If the estimator is not fitted, raise error
|
300
289
|
SnowflakeMLException: If the session is None, raise error
|
301
290
|
|
302
|
-
Returns:
|
303
|
-
A list of available package that exists in the snowflake anaconda channel
|
304
291
|
"""
|
305
292
|
if not self._is_fitted:
|
306
293
|
raise exceptions.SnowflakeMLException(
|
@@ -318,9 +305,7 @@ class OutputCodeClassifier(BaseTransformer):
|
|
318
305
|
"Session must not specified for snowpark dataset."
|
319
306
|
),
|
320
307
|
)
|
321
|
-
|
322
|
-
return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
|
323
|
-
pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
|
308
|
+
|
324
309
|
|
325
310
|
@available_if(original_estimator_has_callable("predict")) # type: ignore[misc]
|
326
311
|
@telemetry.send_api_usage_telemetry(
|
@@ -356,7 +341,9 @@ class OutputCodeClassifier(BaseTransformer):
|
|
356
341
|
# when it is classifier, infer the datatype from label columns
|
357
342
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
358
343
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
359
|
-
label_cols_signatures = [
|
344
|
+
label_cols_signatures = [
|
345
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
346
|
+
]
|
360
347
|
if len(label_cols_signatures) == 0:
|
361
348
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
362
349
|
raise exceptions.SnowflakeMLException(
|
@@ -364,25 +351,23 @@ class OutputCodeClassifier(BaseTransformer):
|
|
364
351
|
original_exception=ValueError(error_str),
|
365
352
|
)
|
366
353
|
|
367
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
368
|
-
label_cols_signatures[0].as_snowpark_type()
|
369
|
-
)
|
354
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
370
355
|
|
371
|
-
self.
|
372
|
-
|
356
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
357
|
+
self._deps = self._get_dependencies()
|
358
|
+
assert isinstance(
|
359
|
+
dataset._session, Session
|
360
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
373
361
|
|
374
362
|
transform_kwargs = dict(
|
375
|
-
session
|
376
|
-
dependencies
|
377
|
-
drop_input_cols
|
378
|
-
expected_output_cols_type
|
363
|
+
session=dataset._session,
|
364
|
+
dependencies=self._deps,
|
365
|
+
drop_input_cols=self._drop_input_cols,
|
366
|
+
expected_output_cols_type=expected_type_inferred,
|
379
367
|
)
|
380
368
|
|
381
369
|
elif isinstance(dataset, pd.DataFrame):
|
382
|
-
transform_kwargs = dict(
|
383
|
-
snowpark_input_cols = self._snowpark_cols,
|
384
|
-
drop_input_cols = self._drop_input_cols
|
385
|
-
)
|
370
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
386
371
|
|
387
372
|
transform_handlers = ModelTransformerBuilder.build(
|
388
373
|
dataset=dataset,
|
@@ -422,7 +407,7 @@ class OutputCodeClassifier(BaseTransformer):
|
|
422
407
|
Transformed dataset.
|
423
408
|
"""
|
424
409
|
super()._check_dataset_type(dataset)
|
425
|
-
inference_method="transform"
|
410
|
+
inference_method = "transform"
|
426
411
|
|
427
412
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
428
413
|
# are specific to the type of dataset used.
|
@@ -452,24 +437,19 @@ class OutputCodeClassifier(BaseTransformer):
|
|
452
437
|
if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
|
453
438
|
expected_dtype = convert_sp_to_sf_type(output_types[0])
|
454
439
|
|
455
|
-
self.
|
456
|
-
|
457
|
-
inference_method=inference_method,
|
458
|
-
)
|
440
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
441
|
+
self._deps = self._get_dependencies()
|
459
442
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
460
443
|
|
461
444
|
transform_kwargs = dict(
|
462
|
-
session
|
463
|
-
dependencies
|
464
|
-
drop_input_cols
|
465
|
-
expected_output_cols_type
|
445
|
+
session=dataset._session,
|
446
|
+
dependencies=self._deps,
|
447
|
+
drop_input_cols=self._drop_input_cols,
|
448
|
+
expected_output_cols_type=expected_dtype,
|
466
449
|
)
|
467
450
|
|
468
451
|
elif isinstance(dataset, pd.DataFrame):
|
469
|
-
transform_kwargs = dict(
|
470
|
-
snowpark_input_cols = self._snowpark_cols,
|
471
|
-
drop_input_cols = self._drop_input_cols
|
472
|
-
)
|
452
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
473
453
|
|
474
454
|
transform_handlers = ModelTransformerBuilder.build(
|
475
455
|
dataset=dataset,
|
@@ -488,7 +468,11 @@ class OutputCodeClassifier(BaseTransformer):
|
|
488
468
|
return output_df
|
489
469
|
|
490
470
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
491
|
-
def fit_predict(
|
471
|
+
def fit_predict(
|
472
|
+
self,
|
473
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
474
|
+
output_cols_prefix: str = "fit_predict_",
|
475
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
492
476
|
""" Method not supported for this class.
|
493
477
|
|
494
478
|
|
@@ -513,22 +497,104 @@ class OutputCodeClassifier(BaseTransformer):
|
|
513
497
|
)
|
514
498
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
515
499
|
drop_input_cols=self._drop_input_cols,
|
516
|
-
expected_output_cols_list=
|
500
|
+
expected_output_cols_list=(
|
501
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
502
|
+
),
|
517
503
|
)
|
518
504
|
self._sklearn_object = fitted_estimator
|
519
505
|
self._is_fitted = True
|
520
506
|
return output_result
|
521
507
|
|
508
|
+
|
509
|
+
@available_if(original_estimator_has_callable("fit_transform")) # type: ignore[misc]
|
510
|
+
def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_transform_",) -> Union[DataFrame, pd.DataFrame]:
|
511
|
+
""" Method not supported for this class.
|
512
|
+
|
522
513
|
|
523
|
-
|
524
|
-
|
525
|
-
|
514
|
+
Raises:
|
515
|
+
TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame.
|
516
|
+
|
517
|
+
Args:
|
518
|
+
dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
|
519
|
+
Snowpark or Pandas DataFrame.
|
520
|
+
output_cols_prefix: Prefix for the response columns
|
526
521
|
Returns:
|
527
522
|
Transformed dataset.
|
528
523
|
"""
|
529
|
-
self.
|
530
|
-
|
531
|
-
|
524
|
+
self._infer_input_output_cols(dataset)
|
525
|
+
super()._check_dataset_type(dataset)
|
526
|
+
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
527
|
+
estimator=self._sklearn_object,
|
528
|
+
dataset=dataset,
|
529
|
+
input_cols=self.input_cols,
|
530
|
+
label_cols=self.label_cols,
|
531
|
+
sample_weight_col=self.sample_weight_col,
|
532
|
+
autogenerated=self._autogenerated,
|
533
|
+
subproject=_SUBPROJECT,
|
534
|
+
)
|
535
|
+
output_result, fitted_estimator = model_trainer.train_fit_transform(
|
536
|
+
drop_input_cols=self._drop_input_cols,
|
537
|
+
expected_output_cols_list=self.output_cols,
|
538
|
+
)
|
539
|
+
self._sklearn_object = fitted_estimator
|
540
|
+
self._is_fitted = True
|
541
|
+
return output_result
|
542
|
+
|
543
|
+
|
544
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
545
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
546
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
547
|
+
"""
|
548
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
549
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
550
|
+
if output_cols:
|
551
|
+
output_cols = [
|
552
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
553
|
+
for c in output_cols
|
554
|
+
]
|
555
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
556
|
+
output_cols = [output_cols_prefix]
|
557
|
+
elif self._sklearn_object is not None:
|
558
|
+
classes = self._sklearn_object.classes_
|
559
|
+
if isinstance(classes, numpy.ndarray):
|
560
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
561
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
562
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
563
|
+
output_cols = []
|
564
|
+
for i, cl in enumerate(classes):
|
565
|
+
# For binary classification, there is only one output column for each class
|
566
|
+
# ndarray as the two classes are complementary.
|
567
|
+
if len(cl) == 2:
|
568
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
569
|
+
else:
|
570
|
+
output_cols.extend([
|
571
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
572
|
+
])
|
573
|
+
else:
|
574
|
+
output_cols = []
|
575
|
+
|
576
|
+
# Make sure column names are valid snowflake identifiers.
|
577
|
+
assert output_cols is not None # Make MyPy happy
|
578
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
579
|
+
|
580
|
+
return rv
|
581
|
+
|
582
|
+
def _align_expected_output_names(
|
583
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
584
|
+
) -> List[str]:
|
585
|
+
# in case the inferred output column names dimension is different
|
586
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
587
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
588
|
+
output_df_columns = list(output_df_pd.columns)
|
589
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
590
|
+
if self.sample_weight_col:
|
591
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
592
|
+
# if the dimension of inferred output column names is correct; use it
|
593
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
594
|
+
return expected_output_cols_list
|
595
|
+
# otherwise, use the sklearn estimator's output
|
596
|
+
else:
|
597
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
532
598
|
|
533
599
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
534
600
|
@telemetry.send_api_usage_telemetry(
|
@@ -560,24 +626,26 @@ class OutputCodeClassifier(BaseTransformer):
|
|
560
626
|
# are specific to the type of dataset used.
|
561
627
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
562
628
|
|
629
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
630
|
+
|
563
631
|
if isinstance(dataset, DataFrame):
|
564
|
-
self.
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
632
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
633
|
+
self._deps = self._get_dependencies()
|
634
|
+
assert isinstance(
|
635
|
+
dataset._session, Session
|
636
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
569
637
|
transform_kwargs = dict(
|
570
638
|
session=dataset._session,
|
571
639
|
dependencies=self._deps,
|
572
|
-
drop_input_cols
|
640
|
+
drop_input_cols=self._drop_input_cols,
|
573
641
|
expected_output_cols_type="float",
|
574
642
|
)
|
643
|
+
expected_output_cols = self._align_expected_output_names(
|
644
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
645
|
+
)
|
575
646
|
|
576
647
|
elif isinstance(dataset, pd.DataFrame):
|
577
|
-
transform_kwargs = dict(
|
578
|
-
snowpark_input_cols = self._snowpark_cols,
|
579
|
-
drop_input_cols = self._drop_input_cols
|
580
|
-
)
|
648
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
581
649
|
|
582
650
|
transform_handlers = ModelTransformerBuilder.build(
|
583
651
|
dataset=dataset,
|
@@ -589,7 +657,7 @@ class OutputCodeClassifier(BaseTransformer):
|
|
589
657
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
590
658
|
inference_method=inference_method,
|
591
659
|
input_cols=self.input_cols,
|
592
|
-
expected_output_cols=
|
660
|
+
expected_output_cols=expected_output_cols,
|
593
661
|
**transform_kwargs
|
594
662
|
)
|
595
663
|
return output_df
|
@@ -619,29 +687,30 @@ class OutputCodeClassifier(BaseTransformer):
|
|
619
687
|
Output dataset with log probability of the sample for each class in the model.
|
620
688
|
"""
|
621
689
|
super()._check_dataset_type(dataset)
|
622
|
-
inference_method="predict_log_proba"
|
690
|
+
inference_method = "predict_log_proba"
|
691
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
623
692
|
|
624
693
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
625
694
|
# are specific to the type of dataset used.
|
626
695
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
627
696
|
|
628
697
|
if isinstance(dataset, DataFrame):
|
629
|
-
self.
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
698
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
699
|
+
self._deps = self._get_dependencies()
|
700
|
+
assert isinstance(
|
701
|
+
dataset._session, Session
|
702
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
634
703
|
transform_kwargs = dict(
|
635
704
|
session=dataset._session,
|
636
705
|
dependencies=self._deps,
|
637
|
-
drop_input_cols
|
706
|
+
drop_input_cols=self._drop_input_cols,
|
638
707
|
expected_output_cols_type="float",
|
639
708
|
)
|
709
|
+
expected_output_cols = self._align_expected_output_names(
|
710
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
711
|
+
)
|
640
712
|
elif isinstance(dataset, pd.DataFrame):
|
641
|
-
transform_kwargs = dict(
|
642
|
-
snowpark_input_cols = self._snowpark_cols,
|
643
|
-
drop_input_cols = self._drop_input_cols
|
644
|
-
)
|
713
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
645
714
|
|
646
715
|
transform_handlers = ModelTransformerBuilder.build(
|
647
716
|
dataset=dataset,
|
@@ -654,7 +723,7 @@ class OutputCodeClassifier(BaseTransformer):
|
|
654
723
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
655
724
|
inference_method=inference_method,
|
656
725
|
input_cols=self.input_cols,
|
657
|
-
expected_output_cols=
|
726
|
+
expected_output_cols=expected_output_cols,
|
658
727
|
**transform_kwargs
|
659
728
|
)
|
660
729
|
return output_df
|
@@ -680,30 +749,32 @@ class OutputCodeClassifier(BaseTransformer):
|
|
680
749
|
Output dataset with results of the decision function for the samples in input dataset.
|
681
750
|
"""
|
682
751
|
super()._check_dataset_type(dataset)
|
683
|
-
inference_method="decision_function"
|
752
|
+
inference_method = "decision_function"
|
684
753
|
|
685
754
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
686
755
|
# are specific to the type of dataset used.
|
687
756
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
688
757
|
|
758
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
759
|
+
|
689
760
|
if isinstance(dataset, DataFrame):
|
690
|
-
self.
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
761
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
762
|
+
self._deps = self._get_dependencies()
|
763
|
+
assert isinstance(
|
764
|
+
dataset._session, Session
|
765
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
695
766
|
transform_kwargs = dict(
|
696
767
|
session=dataset._session,
|
697
768
|
dependencies=self._deps,
|
698
|
-
drop_input_cols
|
769
|
+
drop_input_cols=self._drop_input_cols,
|
699
770
|
expected_output_cols_type="float",
|
700
771
|
)
|
772
|
+
expected_output_cols = self._align_expected_output_names(
|
773
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
774
|
+
)
|
701
775
|
|
702
776
|
elif isinstance(dataset, pd.DataFrame):
|
703
|
-
transform_kwargs = dict(
|
704
|
-
snowpark_input_cols = self._snowpark_cols,
|
705
|
-
drop_input_cols = self._drop_input_cols
|
706
|
-
)
|
777
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
707
778
|
|
708
779
|
transform_handlers = ModelTransformerBuilder.build(
|
709
780
|
dataset=dataset,
|
@@ -716,7 +787,7 @@ class OutputCodeClassifier(BaseTransformer):
|
|
716
787
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
717
788
|
inference_method=inference_method,
|
718
789
|
input_cols=self.input_cols,
|
719
|
-
expected_output_cols=
|
790
|
+
expected_output_cols=expected_output_cols,
|
720
791
|
**transform_kwargs
|
721
792
|
)
|
722
793
|
return output_df
|
@@ -745,17 +816,17 @@ class OutputCodeClassifier(BaseTransformer):
|
|
745
816
|
Output dataset with probability of the sample for each class in the model.
|
746
817
|
"""
|
747
818
|
super()._check_dataset_type(dataset)
|
748
|
-
inference_method="score_samples"
|
819
|
+
inference_method = "score_samples"
|
749
820
|
|
750
821
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
751
822
|
# are specific to the type of dataset used.
|
752
823
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
753
824
|
|
825
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
826
|
+
|
754
827
|
if isinstance(dataset, DataFrame):
|
755
|
-
self.
|
756
|
-
|
757
|
-
inference_method=inference_method,
|
758
|
-
)
|
828
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
829
|
+
self._deps = self._get_dependencies()
|
759
830
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
760
831
|
transform_kwargs = dict(
|
761
832
|
session=dataset._session,
|
@@ -763,6 +834,9 @@ class OutputCodeClassifier(BaseTransformer):
|
|
763
834
|
drop_input_cols = self._drop_input_cols,
|
764
835
|
expected_output_cols_type="float",
|
765
836
|
)
|
837
|
+
expected_output_cols = self._align_expected_output_names(
|
838
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
839
|
+
)
|
766
840
|
|
767
841
|
elif isinstance(dataset, pd.DataFrame):
|
768
842
|
transform_kwargs = dict(
|
@@ -781,7 +855,7 @@ class OutputCodeClassifier(BaseTransformer):
|
|
781
855
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
782
856
|
inference_method=inference_method,
|
783
857
|
input_cols=self.input_cols,
|
784
|
-
expected_output_cols=
|
858
|
+
expected_output_cols=expected_output_cols,
|
785
859
|
**transform_kwargs
|
786
860
|
)
|
787
861
|
return output_df
|
@@ -816,17 +890,15 @@ class OutputCodeClassifier(BaseTransformer):
|
|
816
890
|
transform_kwargs: ScoreKwargsTypedDict = dict()
|
817
891
|
|
818
892
|
if isinstance(dataset, DataFrame):
|
819
|
-
self.
|
820
|
-
|
821
|
-
inference_method="score",
|
822
|
-
)
|
893
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method="score")
|
894
|
+
self._deps = self._get_dependencies()
|
823
895
|
selected_cols = self._get_active_columns()
|
824
896
|
if len(selected_cols) > 0:
|
825
897
|
dataset = dataset.select(selected_cols)
|
826
898
|
assert isinstance(dataset._session, Session) # keep mypy happy
|
827
899
|
transform_kwargs = dict(
|
828
900
|
session=dataset._session,
|
829
|
-
dependencies=
|
901
|
+
dependencies=self._deps,
|
830
902
|
score_sproc_imports=['sklearn'],
|
831
903
|
)
|
832
904
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -891,11 +963,8 @@ class OutputCodeClassifier(BaseTransformer):
|
|
891
963
|
|
892
964
|
if isinstance(dataset, DataFrame):
|
893
965
|
|
894
|
-
self.
|
895
|
-
|
896
|
-
inference_method=inference_method,
|
897
|
-
|
898
|
-
)
|
966
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
967
|
+
self._deps = self._get_dependencies()
|
899
968
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
900
969
|
transform_kwargs = dict(
|
901
970
|
session = dataset._session,
|
@@ -928,50 +997,84 @@ class OutputCodeClassifier(BaseTransformer):
|
|
928
997
|
)
|
929
998
|
return output_df
|
930
999
|
|
1000
|
+
|
1001
|
+
|
1002
|
+
def to_sklearn(self) -> Any:
|
1003
|
+
"""Get sklearn.multiclass.OutputCodeClassifier object.
|
1004
|
+
"""
|
1005
|
+
if self._sklearn_object is None:
|
1006
|
+
self._sklearn_object = self._create_sklearn_object()
|
1007
|
+
return self._sklearn_object
|
1008
|
+
|
1009
|
+
def to_xgboost(self) -> Any:
|
1010
|
+
raise exceptions.SnowflakeMLException(
|
1011
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1012
|
+
original_exception=AttributeError(
|
1013
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1014
|
+
"to_xgboost()",
|
1015
|
+
"to_sklearn()"
|
1016
|
+
)
|
1017
|
+
),
|
1018
|
+
)
|
1019
|
+
|
1020
|
+
def to_lightgbm(self) -> Any:
|
1021
|
+
raise exceptions.SnowflakeMLException(
|
1022
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1023
|
+
original_exception=AttributeError(
|
1024
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1025
|
+
"to_lightgbm()",
|
1026
|
+
"to_sklearn()"
|
1027
|
+
)
|
1028
|
+
),
|
1029
|
+
)
|
1030
|
+
|
1031
|
+
def _get_dependencies(self) -> List[str]:
|
1032
|
+
return self._deps
|
1033
|
+
|
931
1034
|
|
932
|
-
def
|
1035
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
933
1036
|
self._model_signature_dict = dict()
|
934
1037
|
|
935
1038
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
936
1039
|
|
937
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1040
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
938
1041
|
outputs: List[BaseFeatureSpec] = []
|
939
1042
|
if hasattr(self, "predict"):
|
940
1043
|
# keep mypy happy
|
941
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1044
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
942
1045
|
# For classifier, the type of predict is the same as the type of label
|
943
|
-
if self._sklearn_object._estimator_type ==
|
944
|
-
|
1046
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1047
|
+
# label columns is the desired type for output
|
945
1048
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
946
1049
|
# rename the output columns
|
947
1050
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
948
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
949
|
-
|
950
|
-
|
1051
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1052
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1053
|
+
)
|
951
1054
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
952
1055
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
953
|
-
# Clusterer returns int64 cluster labels.
|
1056
|
+
# Clusterer returns int64 cluster labels.
|
954
1057
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
955
1058
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
956
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
957
|
-
|
958
|
-
|
959
|
-
|
1059
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1060
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1061
|
+
)
|
1062
|
+
|
960
1063
|
# For regressor, the type of predict is float64
|
961
|
-
elif self._sklearn_object._estimator_type ==
|
1064
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
962
1065
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
963
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
964
|
-
|
965
|
-
|
966
|
-
|
1066
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1067
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1068
|
+
)
|
1069
|
+
|
967
1070
|
for prob_func in PROB_FUNCTIONS:
|
968
1071
|
if hasattr(self, prob_func):
|
969
1072
|
output_cols_prefix: str = f"{prob_func}_"
|
970
1073
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
971
1074
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
972
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
973
|
-
|
974
|
-
|
1075
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1076
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1077
|
+
)
|
975
1078
|
|
976
1079
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
977
1080
|
items = list(self._model_signature_dict.items())
|
@@ -984,10 +1087,10 @@ class OutputCodeClassifier(BaseTransformer):
|
|
984
1087
|
"""Returns model signature of current class.
|
985
1088
|
|
986
1089
|
Raises:
|
987
|
-
|
1090
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
988
1091
|
|
989
1092
|
Returns:
|
990
|
-
Dict
|
1093
|
+
Dict with each method and its input output signature
|
991
1094
|
"""
|
992
1095
|
if self._model_signature_dict is None:
|
993
1096
|
raise exceptions.SnowflakeMLException(
|
@@ -995,35 +1098,3 @@ class OutputCodeClassifier(BaseTransformer):
|
|
995
1098
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
996
1099
|
)
|
997
1100
|
return self._model_signature_dict
|
998
|
-
|
999
|
-
def to_sklearn(self) -> Any:
|
1000
|
-
"""Get sklearn.multiclass.OutputCodeClassifier object.
|
1001
|
-
"""
|
1002
|
-
if self._sklearn_object is None:
|
1003
|
-
self._sklearn_object = self._create_sklearn_object()
|
1004
|
-
return self._sklearn_object
|
1005
|
-
|
1006
|
-
def to_xgboost(self) -> Any:
|
1007
|
-
raise exceptions.SnowflakeMLException(
|
1008
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1009
|
-
original_exception=AttributeError(
|
1010
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1011
|
-
"to_xgboost()",
|
1012
|
-
"to_sklearn()"
|
1013
|
-
)
|
1014
|
-
),
|
1015
|
-
)
|
1016
|
-
|
1017
|
-
def to_lightgbm(self) -> Any:
|
1018
|
-
raise exceptions.SnowflakeMLException(
|
1019
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1020
|
-
original_exception=AttributeError(
|
1021
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1022
|
-
"to_lightgbm()",
|
1023
|
-
"to_sklearn()"
|
1024
|
-
)
|
1025
|
-
),
|
1026
|
-
)
|
1027
|
-
|
1028
|
-
def _get_dependencies(self) -> List[str]:
|
1029
|
-
return self._deps
|