snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -233,12 +232,7 @@ class LGBMRegressor(BaseTransformer):
|
|
233
232
|
)
|
234
233
|
return selected_cols
|
235
234
|
|
236
|
-
|
237
|
-
project=_PROJECT,
|
238
|
-
subproject=_SUBPROJECT,
|
239
|
-
custom_tags=dict([("autogen", True)]),
|
240
|
-
)
|
241
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "LGBMRegressor":
|
235
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "LGBMRegressor":
|
242
236
|
"""Build a gradient boosting model from the training set (X, y)
|
243
237
|
For more details on this function, see [lightgbm.LGBMRegressor.fit]
|
244
238
|
(https://lightgbm.readthedocs.io/en/v3.3.2/pythonapi/lightgbm.LGBMRegressor.html#lightgbm.LGBMRegressor.fit)
|
@@ -265,12 +259,14 @@ class LGBMRegressor(BaseTransformer):
|
|
265
259
|
|
266
260
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
267
261
|
|
268
|
-
|
262
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
269
263
|
if SNOWML_SPROC_ENV in os.environ:
|
270
264
|
statement_params = telemetry.get_function_usage_statement_params(
|
271
265
|
project=_PROJECT,
|
272
266
|
subproject=_SUBPROJECT,
|
273
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
267
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
268
|
+
inspect.currentframe(), LGBMRegressor.__class__.__name__
|
269
|
+
),
|
274
270
|
api_calls=[Session.call],
|
275
271
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
276
272
|
)
|
@@ -291,7 +287,7 @@ class LGBMRegressor(BaseTransformer):
|
|
291
287
|
)
|
292
288
|
self._sklearn_object = model_trainer.train()
|
293
289
|
self._is_fitted = True
|
294
|
-
self.
|
290
|
+
self._generate_model_signatures(dataset)
|
295
291
|
return self
|
296
292
|
|
297
293
|
def _batch_inference_validate_snowpark(
|
@@ -367,7 +363,9 @@ class LGBMRegressor(BaseTransformer):
|
|
367
363
|
# when it is classifier, infer the datatype from label columns
|
368
364
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
369
365
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
370
|
-
label_cols_signatures = [
|
366
|
+
label_cols_signatures = [
|
367
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
368
|
+
]
|
371
369
|
if len(label_cols_signatures) == 0:
|
372
370
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
373
371
|
raise exceptions.SnowflakeMLException(
|
@@ -375,25 +373,22 @@ class LGBMRegressor(BaseTransformer):
|
|
375
373
|
original_exception=ValueError(error_str),
|
376
374
|
)
|
377
375
|
|
378
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
379
|
-
label_cols_signatures[0].as_snowpark_type()
|
380
|
-
)
|
376
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
381
377
|
|
382
378
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
383
|
-
assert isinstance(
|
379
|
+
assert isinstance(
|
380
|
+
dataset._session, Session
|
381
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
384
382
|
|
385
383
|
transform_kwargs = dict(
|
386
|
-
session
|
387
|
-
dependencies
|
388
|
-
drop_input_cols
|
389
|
-
expected_output_cols_type
|
384
|
+
session=dataset._session,
|
385
|
+
dependencies=self._deps,
|
386
|
+
drop_input_cols=self._drop_input_cols,
|
387
|
+
expected_output_cols_type=expected_type_inferred,
|
390
388
|
)
|
391
389
|
|
392
390
|
elif isinstance(dataset, pd.DataFrame):
|
393
|
-
transform_kwargs = dict(
|
394
|
-
snowpark_input_cols = self._snowpark_cols,
|
395
|
-
drop_input_cols = self._drop_input_cols
|
396
|
-
)
|
391
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
397
392
|
|
398
393
|
transform_handlers = ModelTransformerBuilder.build(
|
399
394
|
dataset=dataset,
|
@@ -433,7 +428,7 @@ class LGBMRegressor(BaseTransformer):
|
|
433
428
|
Transformed dataset.
|
434
429
|
"""
|
435
430
|
super()._check_dataset_type(dataset)
|
436
|
-
inference_method="transform"
|
431
|
+
inference_method = "transform"
|
437
432
|
|
438
433
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
439
434
|
# are specific to the type of dataset used.
|
@@ -470,17 +465,14 @@ class LGBMRegressor(BaseTransformer):
|
|
470
465
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
471
466
|
|
472
467
|
transform_kwargs = dict(
|
473
|
-
session
|
474
|
-
dependencies
|
475
|
-
drop_input_cols
|
476
|
-
expected_output_cols_type
|
468
|
+
session=dataset._session,
|
469
|
+
dependencies=self._deps,
|
470
|
+
drop_input_cols=self._drop_input_cols,
|
471
|
+
expected_output_cols_type=expected_dtype,
|
477
472
|
)
|
478
473
|
|
479
474
|
elif isinstance(dataset, pd.DataFrame):
|
480
|
-
transform_kwargs = dict(
|
481
|
-
snowpark_input_cols = self._snowpark_cols,
|
482
|
-
drop_input_cols = self._drop_input_cols
|
483
|
-
)
|
475
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
484
476
|
|
485
477
|
transform_handlers = ModelTransformerBuilder.build(
|
486
478
|
dataset=dataset,
|
@@ -499,7 +491,11 @@ class LGBMRegressor(BaseTransformer):
|
|
499
491
|
return output_df
|
500
492
|
|
501
493
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
502
|
-
def fit_predict(
|
494
|
+
def fit_predict(
|
495
|
+
self,
|
496
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
497
|
+
output_cols_prefix: str = "fit_predict_",
|
498
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
503
499
|
""" Method not supported for this class.
|
504
500
|
|
505
501
|
|
@@ -524,7 +520,9 @@ class LGBMRegressor(BaseTransformer):
|
|
524
520
|
)
|
525
521
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
526
522
|
drop_input_cols=self._drop_input_cols,
|
527
|
-
expected_output_cols_list=
|
523
|
+
expected_output_cols_list=(
|
524
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
525
|
+
),
|
528
526
|
)
|
529
527
|
self._sklearn_object = fitted_estimator
|
530
528
|
self._is_fitted = True
|
@@ -541,6 +539,62 @@ class LGBMRegressor(BaseTransformer):
|
|
541
539
|
assert self._sklearn_object is not None
|
542
540
|
return self._sklearn_object.embedding_
|
543
541
|
|
542
|
+
|
543
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
544
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
545
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
546
|
+
"""
|
547
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
548
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
549
|
+
if output_cols:
|
550
|
+
output_cols = [
|
551
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
552
|
+
for c in output_cols
|
553
|
+
]
|
554
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
555
|
+
output_cols = [output_cols_prefix]
|
556
|
+
elif self._sklearn_object is not None:
|
557
|
+
classes = self._sklearn_object.classes_
|
558
|
+
if isinstance(classes, numpy.ndarray):
|
559
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
560
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
561
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
562
|
+
output_cols = []
|
563
|
+
for i, cl in enumerate(classes):
|
564
|
+
# For binary classification, there is only one output column for each class
|
565
|
+
# ndarray as the two classes are complementary.
|
566
|
+
if len(cl) == 2:
|
567
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
568
|
+
else:
|
569
|
+
output_cols.extend([
|
570
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
571
|
+
])
|
572
|
+
else:
|
573
|
+
output_cols = []
|
574
|
+
|
575
|
+
# Make sure column names are valid snowflake identifiers.
|
576
|
+
assert output_cols is not None # Make MyPy happy
|
577
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
578
|
+
|
579
|
+
return rv
|
580
|
+
|
581
|
+
def _align_expected_output_names(
|
582
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
583
|
+
) -> List[str]:
|
584
|
+
# in case the inferred output column names dimension is different
|
585
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
586
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
587
|
+
output_df_columns = list(output_df_pd.columns)
|
588
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
589
|
+
if self.sample_weight_col:
|
590
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
591
|
+
# if the dimension of inferred output column names is correct; use it
|
592
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
593
|
+
return expected_output_cols_list
|
594
|
+
# otherwise, use the sklearn estimator's output
|
595
|
+
else:
|
596
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
597
|
+
|
544
598
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
545
599
|
@telemetry.send_api_usage_telemetry(
|
546
600
|
project=_PROJECT,
|
@@ -571,24 +625,28 @@ class LGBMRegressor(BaseTransformer):
|
|
571
625
|
# are specific to the type of dataset used.
|
572
626
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
573
627
|
|
628
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
629
|
+
|
574
630
|
if isinstance(dataset, DataFrame):
|
575
631
|
self._deps = self._batch_inference_validate_snowpark(
|
576
632
|
dataset=dataset,
|
577
633
|
inference_method=inference_method,
|
578
634
|
)
|
579
|
-
assert isinstance(
|
635
|
+
assert isinstance(
|
636
|
+
dataset._session, Session
|
637
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
580
638
|
transform_kwargs = dict(
|
581
639
|
session=dataset._session,
|
582
640
|
dependencies=self._deps,
|
583
|
-
drop_input_cols
|
641
|
+
drop_input_cols=self._drop_input_cols,
|
584
642
|
expected_output_cols_type="float",
|
585
643
|
)
|
644
|
+
expected_output_cols = self._align_expected_output_names(
|
645
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
646
|
+
)
|
586
647
|
|
587
648
|
elif isinstance(dataset, pd.DataFrame):
|
588
|
-
transform_kwargs = dict(
|
589
|
-
snowpark_input_cols = self._snowpark_cols,
|
590
|
-
drop_input_cols = self._drop_input_cols
|
591
|
-
)
|
649
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
592
650
|
|
593
651
|
transform_handlers = ModelTransformerBuilder.build(
|
594
652
|
dataset=dataset,
|
@@ -600,7 +658,7 @@ class LGBMRegressor(BaseTransformer):
|
|
600
658
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
601
659
|
inference_method=inference_method,
|
602
660
|
input_cols=self.input_cols,
|
603
|
-
expected_output_cols=
|
661
|
+
expected_output_cols=expected_output_cols,
|
604
662
|
**transform_kwargs
|
605
663
|
)
|
606
664
|
return output_df
|
@@ -630,7 +688,8 @@ class LGBMRegressor(BaseTransformer):
|
|
630
688
|
Output dataset with log probability of the sample for each class in the model.
|
631
689
|
"""
|
632
690
|
super()._check_dataset_type(dataset)
|
633
|
-
inference_method="predict_log_proba"
|
691
|
+
inference_method = "predict_log_proba"
|
692
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
634
693
|
|
635
694
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
636
695
|
# are specific to the type of dataset used.
|
@@ -641,18 +700,20 @@ class LGBMRegressor(BaseTransformer):
|
|
641
700
|
dataset=dataset,
|
642
701
|
inference_method=inference_method,
|
643
702
|
)
|
644
|
-
assert isinstance(
|
703
|
+
assert isinstance(
|
704
|
+
dataset._session, Session
|
705
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
645
706
|
transform_kwargs = dict(
|
646
707
|
session=dataset._session,
|
647
708
|
dependencies=self._deps,
|
648
|
-
drop_input_cols
|
709
|
+
drop_input_cols=self._drop_input_cols,
|
649
710
|
expected_output_cols_type="float",
|
650
711
|
)
|
712
|
+
expected_output_cols = self._align_expected_output_names(
|
713
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
714
|
+
)
|
651
715
|
elif isinstance(dataset, pd.DataFrame):
|
652
|
-
transform_kwargs = dict(
|
653
|
-
snowpark_input_cols = self._snowpark_cols,
|
654
|
-
drop_input_cols = self._drop_input_cols
|
655
|
-
)
|
716
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
656
717
|
|
657
718
|
transform_handlers = ModelTransformerBuilder.build(
|
658
719
|
dataset=dataset,
|
@@ -665,7 +726,7 @@ class LGBMRegressor(BaseTransformer):
|
|
665
726
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
666
727
|
inference_method=inference_method,
|
667
728
|
input_cols=self.input_cols,
|
668
|
-
expected_output_cols=
|
729
|
+
expected_output_cols=expected_output_cols,
|
669
730
|
**transform_kwargs
|
670
731
|
)
|
671
732
|
return output_df
|
@@ -691,30 +752,34 @@ class LGBMRegressor(BaseTransformer):
|
|
691
752
|
Output dataset with results of the decision function for the samples in input dataset.
|
692
753
|
"""
|
693
754
|
super()._check_dataset_type(dataset)
|
694
|
-
inference_method="decision_function"
|
755
|
+
inference_method = "decision_function"
|
695
756
|
|
696
757
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
697
758
|
# are specific to the type of dataset used.
|
698
759
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
699
760
|
|
761
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
762
|
+
|
700
763
|
if isinstance(dataset, DataFrame):
|
701
764
|
self._deps = self._batch_inference_validate_snowpark(
|
702
765
|
dataset=dataset,
|
703
766
|
inference_method=inference_method,
|
704
767
|
)
|
705
|
-
assert isinstance(
|
768
|
+
assert isinstance(
|
769
|
+
dataset._session, Session
|
770
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
706
771
|
transform_kwargs = dict(
|
707
772
|
session=dataset._session,
|
708
773
|
dependencies=self._deps,
|
709
|
-
drop_input_cols
|
774
|
+
drop_input_cols=self._drop_input_cols,
|
710
775
|
expected_output_cols_type="float",
|
711
776
|
)
|
777
|
+
expected_output_cols = self._align_expected_output_names(
|
778
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
779
|
+
)
|
712
780
|
|
713
781
|
elif isinstance(dataset, pd.DataFrame):
|
714
|
-
transform_kwargs = dict(
|
715
|
-
snowpark_input_cols = self._snowpark_cols,
|
716
|
-
drop_input_cols = self._drop_input_cols
|
717
|
-
)
|
782
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
718
783
|
|
719
784
|
transform_handlers = ModelTransformerBuilder.build(
|
720
785
|
dataset=dataset,
|
@@ -727,7 +792,7 @@ class LGBMRegressor(BaseTransformer):
|
|
727
792
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
728
793
|
inference_method=inference_method,
|
729
794
|
input_cols=self.input_cols,
|
730
|
-
expected_output_cols=
|
795
|
+
expected_output_cols=expected_output_cols,
|
731
796
|
**transform_kwargs
|
732
797
|
)
|
733
798
|
return output_df
|
@@ -756,12 +821,14 @@ class LGBMRegressor(BaseTransformer):
|
|
756
821
|
Output dataset with probability of the sample for each class in the model.
|
757
822
|
"""
|
758
823
|
super()._check_dataset_type(dataset)
|
759
|
-
inference_method="score_samples"
|
824
|
+
inference_method = "score_samples"
|
760
825
|
|
761
826
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
762
827
|
# are specific to the type of dataset used.
|
763
828
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
764
829
|
|
830
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
831
|
+
|
765
832
|
if isinstance(dataset, DataFrame):
|
766
833
|
self._deps = self._batch_inference_validate_snowpark(
|
767
834
|
dataset=dataset,
|
@@ -774,6 +841,9 @@ class LGBMRegressor(BaseTransformer):
|
|
774
841
|
drop_input_cols = self._drop_input_cols,
|
775
842
|
expected_output_cols_type="float",
|
776
843
|
)
|
844
|
+
expected_output_cols = self._align_expected_output_names(
|
845
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
846
|
+
)
|
777
847
|
|
778
848
|
elif isinstance(dataset, pd.DataFrame):
|
779
849
|
transform_kwargs = dict(
|
@@ -792,7 +862,7 @@ class LGBMRegressor(BaseTransformer):
|
|
792
862
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
793
863
|
inference_method=inference_method,
|
794
864
|
input_cols=self.input_cols,
|
795
|
-
expected_output_cols=
|
865
|
+
expected_output_cols=expected_output_cols,
|
796
866
|
**transform_kwargs
|
797
867
|
)
|
798
868
|
return output_df
|
@@ -939,50 +1009,84 @@ class LGBMRegressor(BaseTransformer):
|
|
939
1009
|
)
|
940
1010
|
return output_df
|
941
1011
|
|
1012
|
+
|
1013
|
+
|
1014
|
+
def to_lightgbm(self) -> Any:
|
1015
|
+
"""Get lightgbm.LGBMRegressor object.
|
1016
|
+
"""
|
1017
|
+
if self._sklearn_object is None:
|
1018
|
+
self._sklearn_object = self._create_sklearn_object()
|
1019
|
+
return self._sklearn_object
|
1020
|
+
|
1021
|
+
def to_sklearn(self) -> Any:
|
1022
|
+
raise exceptions.SnowflakeMLException(
|
1023
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1024
|
+
original_exception=AttributeError(
|
1025
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1026
|
+
"to_sklearn()",
|
1027
|
+
"to_lightgbm()"
|
1028
|
+
)
|
1029
|
+
),
|
1030
|
+
)
|
1031
|
+
|
1032
|
+
def to_xgboost(self) -> Any:
|
1033
|
+
raise exceptions.SnowflakeMLException(
|
1034
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1035
|
+
original_exception=AttributeError(
|
1036
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1037
|
+
"to_xgboost()",
|
1038
|
+
"to_lightgbm()"
|
1039
|
+
)
|
1040
|
+
),
|
1041
|
+
)
|
942
1042
|
|
943
|
-
def
|
1043
|
+
def _get_dependencies(self) -> List[str]:
|
1044
|
+
return self._deps
|
1045
|
+
|
1046
|
+
|
1047
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
944
1048
|
self._model_signature_dict = dict()
|
945
1049
|
|
946
1050
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
947
1051
|
|
948
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1052
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
949
1053
|
outputs: List[BaseFeatureSpec] = []
|
950
1054
|
if hasattr(self, "predict"):
|
951
1055
|
# keep mypy happy
|
952
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1056
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
953
1057
|
# For classifier, the type of predict is the same as the type of label
|
954
|
-
if self._sklearn_object._estimator_type ==
|
955
|
-
|
1058
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1059
|
+
# label columns is the desired type for output
|
956
1060
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
957
1061
|
# rename the output columns
|
958
1062
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
959
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
960
|
-
|
961
|
-
|
1063
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1064
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1065
|
+
)
|
962
1066
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
963
1067
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
964
|
-
# Clusterer returns int64 cluster labels.
|
1068
|
+
# Clusterer returns int64 cluster labels.
|
965
1069
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
966
1070
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
967
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
968
|
-
|
969
|
-
|
970
|
-
|
1071
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1072
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1073
|
+
)
|
1074
|
+
|
971
1075
|
# For regressor, the type of predict is float64
|
972
|
-
elif self._sklearn_object._estimator_type ==
|
1076
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
973
1077
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
974
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
975
|
-
|
976
|
-
|
977
|
-
|
1078
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1079
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1080
|
+
)
|
1081
|
+
|
978
1082
|
for prob_func in PROB_FUNCTIONS:
|
979
1083
|
if hasattr(self, prob_func):
|
980
1084
|
output_cols_prefix: str = f"{prob_func}_"
|
981
1085
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
982
1086
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
983
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
984
|
-
|
985
|
-
|
1087
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1088
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1089
|
+
)
|
986
1090
|
|
987
1091
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
988
1092
|
items = list(self._model_signature_dict.items())
|
@@ -995,10 +1099,10 @@ class LGBMRegressor(BaseTransformer):
|
|
995
1099
|
"""Returns model signature of current class.
|
996
1100
|
|
997
1101
|
Raises:
|
998
|
-
|
1102
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
999
1103
|
|
1000
1104
|
Returns:
|
1001
|
-
Dict
|
1105
|
+
Dict with each method and its input output signature
|
1002
1106
|
"""
|
1003
1107
|
if self._model_signature_dict is None:
|
1004
1108
|
raise exceptions.SnowflakeMLException(
|
@@ -1006,35 +1110,3 @@ class LGBMRegressor(BaseTransformer):
|
|
1006
1110
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1007
1111
|
)
|
1008
1112
|
return self._model_signature_dict
|
1009
|
-
|
1010
|
-
def to_lightgbm(self) -> Any:
|
1011
|
-
"""Get lightgbm.LGBMRegressor object.
|
1012
|
-
"""
|
1013
|
-
if self._sklearn_object is None:
|
1014
|
-
self._sklearn_object = self._create_sklearn_object()
|
1015
|
-
return self._sklearn_object
|
1016
|
-
|
1017
|
-
def to_sklearn(self) -> Any:
|
1018
|
-
raise exceptions.SnowflakeMLException(
|
1019
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1020
|
-
original_exception=AttributeError(
|
1021
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1022
|
-
"to_sklearn()",
|
1023
|
-
"to_lightgbm()"
|
1024
|
-
)
|
1025
|
-
),
|
1026
|
-
)
|
1027
|
-
|
1028
|
-
def to_xgboost(self) -> Any:
|
1029
|
-
raise exceptions.SnowflakeMLException(
|
1030
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1031
|
-
original_exception=AttributeError(
|
1032
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1033
|
-
"to_xgboost()",
|
1034
|
-
"to_lightgbm()"
|
1035
|
-
)
|
1036
|
-
),
|
1037
|
-
)
|
1038
|
-
|
1039
|
-
def _get_dependencies(self) -> List[str]:
|
1040
|
-
return self._deps
|