snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0 (new handler; see the sketch after this file list)
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
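The brand-new packager handlers for CatBoost and LightGBM (`model_handlers/catboost.py`, `model_handlers/lightgbm.py` above) suggest that native models from those libraries can now be logged to the model registry. A minimal sketch of what that might look like, assuming an already-created Snowpark `Session` named `session` and the public `Registry.log_model` API; the model and version names below are placeholders, not taken from the package:

```python
# Hypothetical sketch: log a native LightGBM classifier via snowflake.ml.registry.
# Assumes `session` is an existing snowflake.snowpark.Session; names are placeholders.
import lightgbm
import pandas as pd
from sklearn.datasets import make_classification

from snowflake.ml.registry import Registry

X, y = make_classification(n_samples=200, n_features=4, random_state=0)
X_df = pd.DataFrame(X, columns=[f"X{i}" for i in range(4)])

native_clf = lightgbm.LGBMClassifier(n_estimators=10)
native_clf.fit(X_df, y)

reg = Registry(session=session)
mv = reg.log_model(
    native_clf,
    model_name="LGBM_DIFF_DEMO",      # placeholder name
    version_name="V1",                # placeholder version
    sample_input_data=X_df.head(10),  # used to infer the model signature
)
print(mv.show_functions())            # e.g. predict / predict_proba endpoints
```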
snowflake/ml/modeling/lightgbm/lgbm_classifier.py

@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
     BatchInferenceKwargsTypedDict,
     ScoreKwargsTypedDict
 )
+from snowflake.ml.model._signatures import utils as model_signature_utils
+from snowflake.ml.model.model_signature import (
+    BaseFeatureSpec,
+    DataType,
+    FeatureSpec,
+    ModelSignature,
+    _infer_signature,
+    _rename_signature_with_snowflake_identifiers,
+)
 
 from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
 
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     validate_sklearn_args,
 )
 
-from snowflake.ml.model.model_signature import (
-    DataType,
-    FeatureSpec,
-    ModelSignature,
-    _infer_signature,
-    _rename_signature_with_snowflake_identifiers,
-    BaseFeatureSpec,
-)
-from snowflake.ml.model._signatures import utils as model_signature_utils
-
 _PROJECT = "ModelDevelopment"
 # Derive subproject from module name by removing "sklearn"
 # and converting module name from underscore to CamelCase
@@ -233,12 +232,7 @@ class LGBMClassifier(BaseTransformer):
         )
         return selected_cols
 
-    @telemetry.send_api_usage_telemetry(
-        project=_PROJECT,
-        subproject=_SUBPROJECT,
-        custom_tags=dict([("autogen", True)]),
-    )
-    def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "LGBMClassifier":
+    def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "LGBMClassifier":
         """Build a gradient boosting model from the training set (X, y)
         For more details on this function, see [lightgbm.LGBMClassifier.fit]
         (https://lightgbm.readthedocs.io/en/v3.3.2/pythonapi/lightgbm.LGBMClassifier.html#lightgbm.LGBMClassifier.fit)
@@ -265,12 +259,14 @@ class LGBMClassifier(BaseTransformer):
 
         self._snowpark_cols = dataset.select(self.input_cols).columns
 
-
+        # If we are already in a stored procedure, no need to kick off another one.
         if SNOWML_SPROC_ENV in os.environ:
             statement_params = telemetry.get_function_usage_statement_params(
                 project=_PROJECT,
                 subproject=_SUBPROJECT,
-                function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), LGBMClassifier.__class__.__name__),
+                function_name=telemetry.get_statement_params_full_func_name(
+                    inspect.currentframe(), LGBMClassifier.__class__.__name__
+                ),
                 api_calls=[Session.call],
                 custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
             )
@@ -291,7 +287,7 @@ class LGBMClassifier(BaseTransformer):
         )
         self._sklearn_object = model_trainer.train()
         self._is_fitted = True
-        self.
+        self._generate_model_signatures(dataset)
         return self
 
     def _batch_inference_validate_snowpark(
@@ -367,7 +363,9 @@ class LGBMClassifier(BaseTransformer):
             # when it is classifier, infer the datatype from label columns
             if expected_type_inferred == "" and 'predict' in self.model_signatures:
                 # Batch inference takes a single expected output column type. Use the first columns type for now.
-                label_cols_signatures = [row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols]
+                label_cols_signatures = [
+                    row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
+                ]
                 if len(label_cols_signatures) == 0:
                     error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
                     raise exceptions.SnowflakeMLException(
@@ -375,25 +373,22 @@ class LGBMClassifier(BaseTransformer):
                         original_exception=ValueError(error_str),
                     )
 
-                expected_type_inferred = convert_sp_to_sf_type(
-                    label_cols_signatures[0].as_snowpark_type()
-                )
+                expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
 
             self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
 
             transform_kwargs = dict(
-                session = dataset._session,
-                dependencies = self._deps,
-                drop_input_cols = self._drop_input_cols,
-                expected_output_cols_type = expected_type_inferred,
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_type_inferred,
             )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -433,7 +428,7 @@ class LGBMClassifier(BaseTransformer):
             Transformed dataset.
         """
         super()._check_dataset_type(dataset)
-        inference_method="transform"
+        inference_method = "transform"
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
@@ -470,17 +465,14 @@ class LGBMClassifier(BaseTransformer):
             assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
 
             transform_kwargs = dict(
-                session = dataset._session,
-                dependencies = self._deps,
-                drop_input_cols = self._drop_input_cols,
-                expected_output_cols_type = expected_dtype,
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_dtype,
             )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -499,7 +491,11 @@ class LGBMClassifier(BaseTransformer):
         return output_df
 
     @available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
-    def fit_predict(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_predict_") -> Union[DataFrame, pd.DataFrame]:
+    def fit_predict(
+        self,
+        dataset: Union[DataFrame, pd.DataFrame],
+        output_cols_prefix: str = "fit_predict_",
+    ) -> Union[DataFrame, pd.DataFrame]:
         """ Method not supported for this class.
 
 
@@ -524,7 +520,9 @@ class LGBMClassifier(BaseTransformer):
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
             drop_input_cols=self._drop_input_cols,
-            expected_output_cols_list=
+            expected_output_cols_list=(
+                self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
+            ),
         )
         self._sklearn_object = fitted_estimator
         self._is_fitted = True
@@ -541,6 +539,62 @@ class LGBMClassifier(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
+
+    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
+        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
+        """
+        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
+        # The following condition is introduced for kneighbors methods, and not used in other methods
+        if output_cols:
+            output_cols = [
+                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
+                for c in output_cols
+            ]
+        elif getattr(self._sklearn_object, "classes_", None) is None:
+            output_cols = [output_cols_prefix]
+        elif self._sklearn_object is not None:
+            classes = self._sklearn_object.classes_
+            if isinstance(classes, numpy.ndarray):
+                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
+            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
+                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
+                output_cols = []
+                for i, cl in enumerate(classes):
+                    # For binary classification, there is only one output column for each class
+                    # ndarray as the two classes are complementary.
+                    if len(cl) == 2:
+                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
+                    else:
+                        output_cols.extend([
+                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
+                        ])
+        else:
+            output_cols = []
+
+        # Make sure column names are valid snowflake identifiers.
+        assert output_cols is not None # Make MyPy happy
+        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
+
+        return rv
+
+    def _align_expected_output_names(
+        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
+    ) -> List[str]:
+        # in case the inferred output column names dimension is different
+        # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
+        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        output_df_columns = list(output_df_pd.columns)
+        output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
+        if self.sample_weight_col:
+            output_df_columns_set -= set(self.sample_weight_col)
+        # if the dimension of inferred output column names is correct; use it
+        if len(expected_output_cols_list) == len(output_df_columns_set):
+            return expected_output_cols_list
+        # otherwise, use the sklearn estimator's output
+        else:
+            return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
+
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -573,24 +627,28 @@ class LGBMClassifier(BaseTransformer):
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -602,7 +660,7 @@ class LGBMClassifier(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -634,7 +692,8 @@ class LGBMClassifier(BaseTransformer):
             Output dataset with log probability of the sample for each class in the model.
         """
         super()._check_dataset_type(dataset)
-        inference_method="predict_log_proba"
+        inference_method = "predict_log_proba"
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
@@ -645,18 +704,20 @@ class LGBMClassifier(BaseTransformer):
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -669,7 +730,7 @@ class LGBMClassifier(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -695,30 +756,34 @@ class LGBMClassifier(BaseTransformer):
             Output dataset with results of the decision function for the samples in input dataset.
         """
         super()._check_dataset_type(dataset)
-        inference_method="decision_function"
+        inference_method = "decision_function"
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
            )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -731,7 +796,7 @@ class LGBMClassifier(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
            inference_method=inference_method,
            input_cols=self.input_cols,
-            expected_output_cols=
+            expected_output_cols=expected_output_cols,
            **transform_kwargs
        )
        return output_df
@@ -760,12 +825,14 @@ class LGBMClassifier(BaseTransformer):
             Output dataset with probability of the sample for each class in the model.
         """
         super()._check_dataset_type(dataset)
-        inference_method="score_samples"
+        inference_method = "score_samples"
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
@@ -778,6 +845,9 @@ class LGBMClassifier(BaseTransformer):
                 drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
             transform_kwargs = dict(
@@ -796,7 +866,7 @@ class LGBMClassifier(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -943,50 +1013,84 @@ class LGBMClassifier(BaseTransformer):
         )
         return output_df
 
+
+
+    def to_lightgbm(self) -> Any:
+        """Get lightgbm.LGBMClassifier object.
+        """
+        if self._sklearn_object is None:
+            self._sklearn_object = self._create_sklearn_object()
+        return self._sklearn_object
+
+    def to_sklearn(self) -> Any:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.METHOD_NOT_ALLOWED,
+            original_exception=AttributeError(
+                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
+                    "to_sklearn()",
+                    "to_lightgbm()"
+                )
+            ),
+        )
+
+    def to_xgboost(self) -> Any:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.METHOD_NOT_ALLOWED,
+            original_exception=AttributeError(
+                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
+                    "to_xgboost()",
+                    "to_lightgbm()"
+                )
+            ),
+        )
 
-    def
+    def _get_dependencies(self) -> List[str]:
+        return self._deps
+
+
+    def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         self._model_signature_dict = dict()
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input"))
+        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
-            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
+            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
             # For classifier, the type of predict is the same as the type of label
-            if self._sklearn_object._estimator_type ==
-
+            if self._sklearn_object._estimator_type == "classifier":
+                # label columns is the desired type for output
                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
             # For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
             # For outlier models, returns -1 for outliers and 1 for inliers.
-            # Clusterer returns int64 cluster labels.
+            # Clusterer returns int64 cluster labels.
             elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
                 outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
             # For regressor, the type of predict is float64
-            elif self._sklearn_object._estimator_type ==
+            elif self._sklearn_object._estimator_type == "regressor":
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(
-
-
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(
-
-
+                self._model_signature_dict[prob_func] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
 
         # Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
         items = list(self._model_signature_dict.items())
@@ -999,10 +1103,10 @@ class LGBMClassifier(BaseTransformer):
         """Returns model signature of current class.
 
         Raises:
-
+            SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
 
         Returns:
-            Dict
+            Dict with each method and its input output signature
         """
         if self._model_signature_dict is None:
             raise exceptions.SnowflakeMLException(
@@ -1010,35 +1114,3 @@ class LGBMClassifier(BaseTransformer):
                 original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
             )
         return self._model_signature_dict
-
-    def to_lightgbm(self) -> Any:
-        """Get lightgbm.LGBMClassifier object.
-        """
-        if self._sklearn_object is None:
-            self._sklearn_object = self._create_sklearn_object()
-        return self._sklearn_object
-
-    def to_sklearn(self) -> Any:
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.METHOD_NOT_ALLOWED,
-            original_exception=AttributeError(
-                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
-                    "to_sklearn()",
-                    "to_lightgbm()"
-                )
-            ),
-        )
-
-    def to_xgboost(self) -> Any:
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.METHOD_NOT_ALLOWED,
-            original_exception=AttributeError(
-                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
-                    "to_xgboost()",
-                    "to_lightgbm()"
-                )
-            ),
-        )
-
-    def _get_dependencies(self) -> List[str]:
-        return self._deps