snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +77 -32
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/dataset/__init__.py +10 -0
- snowflake/ml/dataset/dataset.py +454 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +202 -0
- snowflake/ml/feature_store/feature_store.py +531 -332
- snowflake/ml/feature_store/feature_view.py +40 -23
- snowflake/ml/fileset/embedded_stage_fs.py +146 -0
- snowflake/ml/fileset/sfcfs.py +56 -54
- snowflake/ml/fileset/snowfs.py +159 -0
- snowflake/ml/fileset/stage_fs.py +49 -17
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +27 -0
- snowflake/ml/model/_client/model/model_version_impl.py +137 -50
- snowflake/ml/model/_client/ops/model_ops.py +159 -40
- snowflake/ml/model/_client/sql/model.py +25 -2
- snowflake/ml/model/_client/sql/model_version.py +131 -2
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -5
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
- snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
- snowflake/ml/modeling/cluster/birch.py +248 -175
- snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
- snowflake/ml/modeling/cluster/dbscan.py +246 -175
- snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
- snowflake/ml/modeling/cluster/k_means.py +248 -175
- snowflake/ml/modeling/cluster/mean_shift.py +246 -175
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
- snowflake/ml/modeling/cluster/optics.py +246 -175
- snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
- snowflake/ml/modeling/compose/column_transformer.py +248 -175
- snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
- snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
- snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
- snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
- snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
- snowflake/ml/modeling/covariance/oas.py +246 -175
- snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
- snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
- snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
- snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
- snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/pca.py +248 -175
- snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
- snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
- snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
- snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
- snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
- snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +72 -37
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
- snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
- snowflake/ml/modeling/impute/knn_imputer.py +248 -175
- snowflake/ml/modeling/impute/missing_indicator.py +248 -175
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/lars.py +246 -175
- snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
- snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/perceptron.py +246 -175
- snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ridge.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
- snowflake/ml/modeling/manifold/isomap.py +248 -175
- snowflake/ml/modeling/manifold/mds.py +248 -175
- snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
- snowflake/ml/modeling/manifold/tsne.py +248 -175
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
- snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
- snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
- snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
- snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
- snowflake/ml/modeling/pipeline/pipeline.py +517 -35
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
- snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
- snowflake/ml/modeling/svm/linear_svc.py +246 -175
- snowflake/ml/modeling/svm/linear_svr.py +246 -175
- snowflake/ml/modeling/svm/nu_svc.py +246 -175
- snowflake/ml/modeling/svm/nu_svr.py +246 -175
- snowflake/ml/modeling/svm/svc.py +246 -175
- snowflake/ml/modeling/svm/svr.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
- snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -61,12 +60,6 @@ _SUBPROJECT = "".join([s.capitalize() for s in "lightgbm".replace("sklearn.", ""
|
|
61
60
|
|
62
61
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
63
62
|
|
64
|
-
def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
|
65
|
-
def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
|
66
|
-
return False and callable(getattr(self._sklearn_object, "fit_transform", None))
|
67
|
-
return check
|
68
|
-
|
69
|
-
|
70
63
|
class LGBMClassifier(BaseTransformer):
|
71
64
|
r"""LightGBM classifier
|
72
65
|
For more details on this class, see [lightgbm.LGBMClassifier]
|
@@ -233,12 +226,7 @@ class LGBMClassifier(BaseTransformer):
|
|
233
226
|
)
|
234
227
|
return selected_cols
|
235
228
|
|
236
|
-
|
237
|
-
project=_PROJECT,
|
238
|
-
subproject=_SUBPROJECT,
|
239
|
-
custom_tags=dict([("autogen", True)]),
|
240
|
-
)
|
241
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "LGBMClassifier":
|
229
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "LGBMClassifier":
|
242
230
|
"""Build a gradient boosting model from the training set (X, y)
|
243
231
|
For more details on this function, see [lightgbm.LGBMClassifier.fit]
|
244
232
|
(https://lightgbm.readthedocs.io/en/v3.3.2/pythonapi/lightgbm.LGBMClassifier.html#lightgbm.LGBMClassifier.fit)
|
@@ -265,12 +253,14 @@ class LGBMClassifier(BaseTransformer):
|
|
265
253
|
|
266
254
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
267
255
|
|
268
|
-
|
256
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
269
257
|
if SNOWML_SPROC_ENV in os.environ:
|
270
258
|
statement_params = telemetry.get_function_usage_statement_params(
|
271
259
|
project=_PROJECT,
|
272
260
|
subproject=_SUBPROJECT,
|
273
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
261
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
262
|
+
inspect.currentframe(), LGBMClassifier.__class__.__name__
|
263
|
+
),
|
274
264
|
api_calls=[Session.call],
|
275
265
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
276
266
|
)
|
@@ -291,27 +281,24 @@ class LGBMClassifier(BaseTransformer):
|
|
291
281
|
)
|
292
282
|
self._sklearn_object = model_trainer.train()
|
293
283
|
self._is_fitted = True
|
294
|
-
self.
|
284
|
+
self._generate_model_signatures(dataset)
|
295
285
|
return self
|
296
286
|
|
297
287
|
def _batch_inference_validate_snowpark(
|
298
288
|
self,
|
299
289
|
dataset: DataFrame,
|
300
290
|
inference_method: str,
|
301
|
-
) ->
|
302
|
-
"""Util method to run validate that batch inference can be run on a snowpark dataframe
|
303
|
-
return the available package that exists in the snowflake anaconda channel
|
291
|
+
) -> None:
|
292
|
+
"""Util method to run validate that batch inference can be run on a snowpark dataframe.
|
304
293
|
|
305
294
|
Args:
|
306
295
|
dataset: snowpark dataframe
|
307
296
|
inference_method: the inference method such as predict, score...
|
308
|
-
|
297
|
+
|
309
298
|
Raises:
|
310
299
|
SnowflakeMLException: If the estimator is not fitted, raise error
|
311
300
|
SnowflakeMLException: If the session is None, raise error
|
312
301
|
|
313
|
-
Returns:
|
314
|
-
A list of available package that exists in the snowflake anaconda channel
|
315
302
|
"""
|
316
303
|
if not self._is_fitted:
|
317
304
|
raise exceptions.SnowflakeMLException(
|
@@ -329,9 +316,7 @@ class LGBMClassifier(BaseTransformer):
|
|
329
316
|
"Session must not specified for snowpark dataset."
|
330
317
|
),
|
331
318
|
)
|
332
|
-
|
333
|
-
return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
|
334
|
-
pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
|
319
|
+
|
335
320
|
|
336
321
|
@available_if(original_estimator_has_callable("predict")) # type: ignore[misc]
|
337
322
|
@telemetry.send_api_usage_telemetry(
|
@@ -367,7 +352,9 @@ class LGBMClassifier(BaseTransformer):
|
|
367
352
|
# when it is classifier, infer the datatype from label columns
|
368
353
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
369
354
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
370
|
-
label_cols_signatures = [
|
355
|
+
label_cols_signatures = [
|
356
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
357
|
+
]
|
371
358
|
if len(label_cols_signatures) == 0:
|
372
359
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
373
360
|
raise exceptions.SnowflakeMLException(
|
@@ -375,25 +362,23 @@ class LGBMClassifier(BaseTransformer):
|
|
375
362
|
original_exception=ValueError(error_str),
|
376
363
|
)
|
377
364
|
|
378
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
379
|
-
label_cols_signatures[0].as_snowpark_type()
|
380
|
-
)
|
365
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
381
366
|
|
382
|
-
self.
|
383
|
-
|
367
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
368
|
+
self._deps = self._get_dependencies()
|
369
|
+
assert isinstance(
|
370
|
+
dataset._session, Session
|
371
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
384
372
|
|
385
373
|
transform_kwargs = dict(
|
386
|
-
session
|
387
|
-
dependencies
|
388
|
-
drop_input_cols
|
389
|
-
expected_output_cols_type
|
374
|
+
session=dataset._session,
|
375
|
+
dependencies=self._deps,
|
376
|
+
drop_input_cols=self._drop_input_cols,
|
377
|
+
expected_output_cols_type=expected_type_inferred,
|
390
378
|
)
|
391
379
|
|
392
380
|
elif isinstance(dataset, pd.DataFrame):
|
393
|
-
transform_kwargs = dict(
|
394
|
-
snowpark_input_cols = self._snowpark_cols,
|
395
|
-
drop_input_cols = self._drop_input_cols
|
396
|
-
)
|
381
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
397
382
|
|
398
383
|
transform_handlers = ModelTransformerBuilder.build(
|
399
384
|
dataset=dataset,
|
@@ -433,7 +418,7 @@ class LGBMClassifier(BaseTransformer):
|
|
433
418
|
Transformed dataset.
|
434
419
|
"""
|
435
420
|
super()._check_dataset_type(dataset)
|
436
|
-
inference_method="transform"
|
421
|
+
inference_method = "transform"
|
437
422
|
|
438
423
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
439
424
|
# are specific to the type of dataset used.
|
@@ -463,24 +448,19 @@ class LGBMClassifier(BaseTransformer):
|
|
463
448
|
if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
|
464
449
|
expected_dtype = convert_sp_to_sf_type(output_types[0])
|
465
450
|
|
466
|
-
self.
|
467
|
-
|
468
|
-
inference_method=inference_method,
|
469
|
-
)
|
451
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
452
|
+
self._deps = self._get_dependencies()
|
470
453
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
471
454
|
|
472
455
|
transform_kwargs = dict(
|
473
|
-
session
|
474
|
-
dependencies
|
475
|
-
drop_input_cols
|
476
|
-
expected_output_cols_type
|
456
|
+
session=dataset._session,
|
457
|
+
dependencies=self._deps,
|
458
|
+
drop_input_cols=self._drop_input_cols,
|
459
|
+
expected_output_cols_type=expected_dtype,
|
477
460
|
)
|
478
461
|
|
479
462
|
elif isinstance(dataset, pd.DataFrame):
|
480
|
-
transform_kwargs = dict(
|
481
|
-
snowpark_input_cols = self._snowpark_cols,
|
482
|
-
drop_input_cols = self._drop_input_cols
|
483
|
-
)
|
463
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
484
464
|
|
485
465
|
transform_handlers = ModelTransformerBuilder.build(
|
486
466
|
dataset=dataset,
|
@@ -499,7 +479,11 @@ class LGBMClassifier(BaseTransformer):
|
|
499
479
|
return output_df
|
500
480
|
|
501
481
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
502
|
-
def fit_predict(
|
482
|
+
def fit_predict(
|
483
|
+
self,
|
484
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
485
|
+
output_cols_prefix: str = "fit_predict_",
|
486
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
503
487
|
""" Method not supported for this class.
|
504
488
|
|
505
489
|
|
@@ -524,22 +508,104 @@ class LGBMClassifier(BaseTransformer):
|
|
524
508
|
)
|
525
509
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
526
510
|
drop_input_cols=self._drop_input_cols,
|
527
|
-
expected_output_cols_list=
|
511
|
+
expected_output_cols_list=(
|
512
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
513
|
+
),
|
528
514
|
)
|
529
515
|
self._sklearn_object = fitted_estimator
|
530
516
|
self._is_fitted = True
|
531
517
|
return output_result
|
532
518
|
|
519
|
+
|
520
|
+
@available_if(original_estimator_has_callable("fit_transform")) # type: ignore[misc]
|
521
|
+
def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_transform_",) -> Union[DataFrame, pd.DataFrame]:
|
522
|
+
""" Method not supported for this class.
|
523
|
+
|
533
524
|
|
534
|
-
|
535
|
-
|
536
|
-
|
525
|
+
Raises:
|
526
|
+
TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame.
|
527
|
+
|
528
|
+
Args:
|
529
|
+
dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
|
530
|
+
Snowpark or Pandas DataFrame.
|
531
|
+
output_cols_prefix: Prefix for the response columns
|
537
532
|
Returns:
|
538
533
|
Transformed dataset.
|
539
534
|
"""
|
540
|
-
self.
|
541
|
-
|
542
|
-
|
535
|
+
self._infer_input_output_cols(dataset)
|
536
|
+
super()._check_dataset_type(dataset)
|
537
|
+
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
538
|
+
estimator=self._sklearn_object,
|
539
|
+
dataset=dataset,
|
540
|
+
input_cols=self.input_cols,
|
541
|
+
label_cols=self.label_cols,
|
542
|
+
sample_weight_col=self.sample_weight_col,
|
543
|
+
autogenerated=self._autogenerated,
|
544
|
+
subproject=_SUBPROJECT,
|
545
|
+
)
|
546
|
+
output_result, fitted_estimator = model_trainer.train_fit_transform(
|
547
|
+
drop_input_cols=self._drop_input_cols,
|
548
|
+
expected_output_cols_list=self.output_cols,
|
549
|
+
)
|
550
|
+
self._sklearn_object = fitted_estimator
|
551
|
+
self._is_fitted = True
|
552
|
+
return output_result
|
553
|
+
|
554
|
+
|
555
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
556
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
557
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
558
|
+
"""
|
559
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
560
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
561
|
+
if output_cols:
|
562
|
+
output_cols = [
|
563
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
564
|
+
for c in output_cols
|
565
|
+
]
|
566
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
567
|
+
output_cols = [output_cols_prefix]
|
568
|
+
elif self._sklearn_object is not None:
|
569
|
+
classes = self._sklearn_object.classes_
|
570
|
+
if isinstance(classes, numpy.ndarray):
|
571
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
572
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
573
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
574
|
+
output_cols = []
|
575
|
+
for i, cl in enumerate(classes):
|
576
|
+
# For binary classification, there is only one output column for each class
|
577
|
+
# ndarray as the two classes are complementary.
|
578
|
+
if len(cl) == 2:
|
579
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
580
|
+
else:
|
581
|
+
output_cols.extend([
|
582
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
583
|
+
])
|
584
|
+
else:
|
585
|
+
output_cols = []
|
586
|
+
|
587
|
+
# Make sure column names are valid snowflake identifiers.
|
588
|
+
assert output_cols is not None # Make MyPy happy
|
589
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
590
|
+
|
591
|
+
return rv
|
592
|
+
|
593
|
+
def _align_expected_output_names(
|
594
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
595
|
+
) -> List[str]:
|
596
|
+
# in case the inferred output column names dimension is different
|
597
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
598
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
599
|
+
output_df_columns = list(output_df_pd.columns)
|
600
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
601
|
+
if self.sample_weight_col:
|
602
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
603
|
+
# if the dimension of inferred output column names is correct; use it
|
604
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
605
|
+
return expected_output_cols_list
|
606
|
+
# otherwise, use the sklearn estimator's output
|
607
|
+
else:
|
608
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
543
609
|
|
544
610
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
545
611
|
@telemetry.send_api_usage_telemetry(
|
@@ -573,24 +639,26 @@ class LGBMClassifier(BaseTransformer):
|
|
573
639
|
# are specific to the type of dataset used.
|
574
640
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
575
641
|
|
642
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
643
|
+
|
576
644
|
if isinstance(dataset, DataFrame):
|
577
|
-
self.
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
645
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
646
|
+
self._deps = self._get_dependencies()
|
647
|
+
assert isinstance(
|
648
|
+
dataset._session, Session
|
649
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
582
650
|
transform_kwargs = dict(
|
583
651
|
session=dataset._session,
|
584
652
|
dependencies=self._deps,
|
585
|
-
drop_input_cols
|
653
|
+
drop_input_cols=self._drop_input_cols,
|
586
654
|
expected_output_cols_type="float",
|
587
655
|
)
|
656
|
+
expected_output_cols = self._align_expected_output_names(
|
657
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
658
|
+
)
|
588
659
|
|
589
660
|
elif isinstance(dataset, pd.DataFrame):
|
590
|
-
transform_kwargs = dict(
|
591
|
-
snowpark_input_cols = self._snowpark_cols,
|
592
|
-
drop_input_cols = self._drop_input_cols
|
593
|
-
)
|
661
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
594
662
|
|
595
663
|
transform_handlers = ModelTransformerBuilder.build(
|
596
664
|
dataset=dataset,
|
@@ -602,7 +670,7 @@ class LGBMClassifier(BaseTransformer):
|
|
602
670
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
603
671
|
inference_method=inference_method,
|
604
672
|
input_cols=self.input_cols,
|
605
|
-
expected_output_cols=
|
673
|
+
expected_output_cols=expected_output_cols,
|
606
674
|
**transform_kwargs
|
607
675
|
)
|
608
676
|
return output_df
|
@@ -634,29 +702,30 @@ class LGBMClassifier(BaseTransformer):
|
|
634
702
|
Output dataset with log probability of the sample for each class in the model.
|
635
703
|
"""
|
636
704
|
super()._check_dataset_type(dataset)
|
637
|
-
inference_method="predict_log_proba"
|
705
|
+
inference_method = "predict_log_proba"
|
706
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
638
707
|
|
639
708
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
640
709
|
# are specific to the type of dataset used.
|
641
710
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
642
711
|
|
643
712
|
if isinstance(dataset, DataFrame):
|
644
|
-
self.
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
713
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
714
|
+
self._deps = self._get_dependencies()
|
715
|
+
assert isinstance(
|
716
|
+
dataset._session, Session
|
717
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
649
718
|
transform_kwargs = dict(
|
650
719
|
session=dataset._session,
|
651
720
|
dependencies=self._deps,
|
652
|
-
drop_input_cols
|
721
|
+
drop_input_cols=self._drop_input_cols,
|
653
722
|
expected_output_cols_type="float",
|
654
723
|
)
|
724
|
+
expected_output_cols = self._align_expected_output_names(
|
725
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
726
|
+
)
|
655
727
|
elif isinstance(dataset, pd.DataFrame):
|
656
|
-
transform_kwargs = dict(
|
657
|
-
snowpark_input_cols = self._snowpark_cols,
|
658
|
-
drop_input_cols = self._drop_input_cols
|
659
|
-
)
|
728
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
660
729
|
|
661
730
|
transform_handlers = ModelTransformerBuilder.build(
|
662
731
|
dataset=dataset,
|
@@ -669,7 +738,7 @@ class LGBMClassifier(BaseTransformer):
|
|
669
738
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
670
739
|
inference_method=inference_method,
|
671
740
|
input_cols=self.input_cols,
|
672
|
-
expected_output_cols=
|
741
|
+
expected_output_cols=expected_output_cols,
|
673
742
|
**transform_kwargs
|
674
743
|
)
|
675
744
|
return output_df
|
@@ -695,30 +764,32 @@ class LGBMClassifier(BaseTransformer):
|
|
695
764
|
Output dataset with results of the decision function for the samples in input dataset.
|
696
765
|
"""
|
697
766
|
super()._check_dataset_type(dataset)
|
698
|
-
inference_method="decision_function"
|
767
|
+
inference_method = "decision_function"
|
699
768
|
|
700
769
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
701
770
|
# are specific to the type of dataset used.
|
702
771
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
703
772
|
|
773
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
774
|
+
|
704
775
|
if isinstance(dataset, DataFrame):
|
705
|
-
self.
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
776
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
777
|
+
self._deps = self._get_dependencies()
|
778
|
+
assert isinstance(
|
779
|
+
dataset._session, Session
|
780
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
710
781
|
transform_kwargs = dict(
|
711
782
|
session=dataset._session,
|
712
783
|
dependencies=self._deps,
|
713
|
-
drop_input_cols
|
784
|
+
drop_input_cols=self._drop_input_cols,
|
714
785
|
expected_output_cols_type="float",
|
715
786
|
)
|
787
|
+
expected_output_cols = self._align_expected_output_names(
|
788
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
789
|
+
)
|
716
790
|
|
717
791
|
elif isinstance(dataset, pd.DataFrame):
|
718
|
-
transform_kwargs = dict(
|
719
|
-
snowpark_input_cols = self._snowpark_cols,
|
720
|
-
drop_input_cols = self._drop_input_cols
|
721
|
-
)
|
792
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
722
793
|
|
723
794
|
transform_handlers = ModelTransformerBuilder.build(
|
724
795
|
dataset=dataset,
|
@@ -731,7 +802,7 @@ class LGBMClassifier(BaseTransformer):
|
|
731
802
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
732
803
|
inference_method=inference_method,
|
733
804
|
input_cols=self.input_cols,
|
734
|
-
expected_output_cols=
|
805
|
+
expected_output_cols=expected_output_cols,
|
735
806
|
**transform_kwargs
|
736
807
|
)
|
737
808
|
return output_df
|
@@ -760,17 +831,17 @@ class LGBMClassifier(BaseTransformer):
|
|
760
831
|
Output dataset with probability of the sample for each class in the model.
|
761
832
|
"""
|
762
833
|
super()._check_dataset_type(dataset)
|
763
|
-
inference_method="score_samples"
|
834
|
+
inference_method = "score_samples"
|
764
835
|
|
765
836
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
766
837
|
# are specific to the type of dataset used.
|
767
838
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
768
839
|
|
840
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
841
|
+
|
769
842
|
if isinstance(dataset, DataFrame):
|
770
|
-
self.
|
771
|
-
|
772
|
-
inference_method=inference_method,
|
773
|
-
)
|
843
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
844
|
+
self._deps = self._get_dependencies()
|
774
845
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
775
846
|
transform_kwargs = dict(
|
776
847
|
session=dataset._session,
|
@@ -778,6 +849,9 @@ class LGBMClassifier(BaseTransformer):
|
|
778
849
|
drop_input_cols = self._drop_input_cols,
|
779
850
|
expected_output_cols_type="float",
|
780
851
|
)
|
852
|
+
expected_output_cols = self._align_expected_output_names(
|
853
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
854
|
+
)
|
781
855
|
|
782
856
|
elif isinstance(dataset, pd.DataFrame):
|
783
857
|
transform_kwargs = dict(
|
@@ -796,7 +870,7 @@ class LGBMClassifier(BaseTransformer):
|
|
796
870
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
797
871
|
inference_method=inference_method,
|
798
872
|
input_cols=self.input_cols,
|
799
|
-
expected_output_cols=
|
873
|
+
expected_output_cols=expected_output_cols,
|
800
874
|
**transform_kwargs
|
801
875
|
)
|
802
876
|
return output_df
|
@@ -831,17 +905,15 @@ class LGBMClassifier(BaseTransformer):
|
|
831
905
|
transform_kwargs: ScoreKwargsTypedDict = dict()
|
832
906
|
|
833
907
|
if isinstance(dataset, DataFrame):
|
834
|
-
self.
|
835
|
-
|
836
|
-
inference_method="score",
|
837
|
-
)
|
908
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method="score")
|
909
|
+
self._deps = self._get_dependencies()
|
838
910
|
selected_cols = self._get_active_columns()
|
839
911
|
if len(selected_cols) > 0:
|
840
912
|
dataset = dataset.select(selected_cols)
|
841
913
|
assert isinstance(dataset._session, Session) # keep mypy happy
|
842
914
|
transform_kwargs = dict(
|
843
915
|
session=dataset._session,
|
844
|
-
dependencies=
|
916
|
+
dependencies=self._deps,
|
845
917
|
score_sproc_imports=['lightgbm', 'sklearn'],
|
846
918
|
)
|
847
919
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -906,11 +978,8 @@ class LGBMClassifier(BaseTransformer):
|
|
906
978
|
|
907
979
|
if isinstance(dataset, DataFrame):
|
908
980
|
|
909
|
-
self.
|
910
|
-
|
911
|
-
inference_method=inference_method,
|
912
|
-
|
913
|
-
)
|
981
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
982
|
+
self._deps = self._get_dependencies()
|
914
983
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
915
984
|
transform_kwargs = dict(
|
916
985
|
session = dataset._session,
|
@@ -943,50 +1012,84 @@ class LGBMClassifier(BaseTransformer):
|
|
943
1012
|
)
|
944
1013
|
return output_df
|
945
1014
|
|
1015
|
+
|
1016
|
+
|
1017
|
+
def to_lightgbm(self) -> Any:
|
1018
|
+
"""Get lightgbm.LGBMClassifier object.
|
1019
|
+
"""
|
1020
|
+
if self._sklearn_object is None:
|
1021
|
+
self._sklearn_object = self._create_sklearn_object()
|
1022
|
+
return self._sklearn_object
|
1023
|
+
|
1024
|
+
def to_sklearn(self) -> Any:
|
1025
|
+
raise exceptions.SnowflakeMLException(
|
1026
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1027
|
+
original_exception=AttributeError(
|
1028
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1029
|
+
"to_sklearn()",
|
1030
|
+
"to_lightgbm()"
|
1031
|
+
)
|
1032
|
+
),
|
1033
|
+
)
|
1034
|
+
|
1035
|
+
def to_xgboost(self) -> Any:
|
1036
|
+
raise exceptions.SnowflakeMLException(
|
1037
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1038
|
+
original_exception=AttributeError(
|
1039
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1040
|
+
"to_xgboost()",
|
1041
|
+
"to_lightgbm()"
|
1042
|
+
)
|
1043
|
+
),
|
1044
|
+
)
|
1045
|
+
|
1046
|
+
def _get_dependencies(self) -> List[str]:
|
1047
|
+
return self._deps
|
1048
|
+
|
946
1049
|
|
947
|
-
def
|
1050
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
948
1051
|
self._model_signature_dict = dict()
|
949
1052
|
|
950
1053
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
951
1054
|
|
952
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1055
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
953
1056
|
outputs: List[BaseFeatureSpec] = []
|
954
1057
|
if hasattr(self, "predict"):
|
955
1058
|
# keep mypy happy
|
956
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1059
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
957
1060
|
# For classifier, the type of predict is the same as the type of label
|
958
|
-
if self._sklearn_object._estimator_type ==
|
959
|
-
|
1061
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1062
|
+
# label columns is the desired type for output
|
960
1063
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
961
1064
|
# rename the output columns
|
962
1065
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
963
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
964
|
-
|
965
|
-
|
1066
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1067
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1068
|
+
)
|
966
1069
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
967
1070
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
968
|
-
# Clusterer returns int64 cluster labels.
|
1071
|
+
# Clusterer returns int64 cluster labels.
|
969
1072
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
970
1073
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
971
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
972
|
-
|
973
|
-
|
974
|
-
|
1074
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1075
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1076
|
+
)
|
1077
|
+
|
975
1078
|
# For regressor, the type of predict is float64
|
976
|
-
elif self._sklearn_object._estimator_type ==
|
1079
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
977
1080
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
978
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
979
|
-
|
980
|
-
|
981
|
-
|
1081
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1082
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1083
|
+
)
|
1084
|
+
|
982
1085
|
for prob_func in PROB_FUNCTIONS:
|
983
1086
|
if hasattr(self, prob_func):
|
984
1087
|
output_cols_prefix: str = f"{prob_func}_"
|
985
1088
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
986
1089
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
987
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
988
|
-
|
989
|
-
|
1090
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1091
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1092
|
+
)
|
990
1093
|
|
991
1094
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
992
1095
|
items = list(self._model_signature_dict.items())
|
@@ -999,10 +1102,10 @@ class LGBMClassifier(BaseTransformer):
|
|
999
1102
|
"""Returns model signature of current class.
|
1000
1103
|
|
1001
1104
|
Raises:
|
1002
|
-
|
1105
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1003
1106
|
|
1004
1107
|
Returns:
|
1005
|
-
Dict
|
1108
|
+
Dict with each method and its input output signature
|
1006
1109
|
"""
|
1007
1110
|
if self._model_signature_dict is None:
|
1008
1111
|
raise exceptions.SnowflakeMLException(
|
@@ -1010,35 +1113,3 @@ class LGBMClassifier(BaseTransformer):
|
|
1010
1113
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1011
1114
|
)
|
1012
1115
|
return self._model_signature_dict
|
1013
|
-
|
1014
|
-
def to_lightgbm(self) -> Any:
|
1015
|
-
"""Get lightgbm.LGBMClassifier object.
|
1016
|
-
"""
|
1017
|
-
if self._sklearn_object is None:
|
1018
|
-
self._sklearn_object = self._create_sklearn_object()
|
1019
|
-
return self._sklearn_object
|
1020
|
-
|
1021
|
-
def to_sklearn(self) -> Any:
|
1022
|
-
raise exceptions.SnowflakeMLException(
|
1023
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1024
|
-
original_exception=AttributeError(
|
1025
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1026
|
-
"to_sklearn()",
|
1027
|
-
"to_lightgbm()"
|
1028
|
-
)
|
1029
|
-
),
|
1030
|
-
)
|
1031
|
-
|
1032
|
-
def to_xgboost(self) -> Any:
|
1033
|
-
raise exceptions.SnowflakeMLException(
|
1034
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1035
|
-
original_exception=AttributeError(
|
1036
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1037
|
-
"to_xgboost()",
|
1038
|
-
"to_lightgbm()"
|
1039
|
-
)
|
1040
|
-
),
|
1041
|
-
)
|
1042
|
-
|
1043
|
-
def _get_dependencies(self) -> List[str]:
|
1044
|
-
return self._deps
|