snowflake-ml-python 1.3.1__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/human_readable_id/adjectives.txt +128 -0
- snowflake/ml/_internal/human_readable_id/animals.txt +128 -0
- snowflake/ml/_internal/human_readable_id/hrid_generator.py +40 -0
- snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +135 -0
- snowflake/ml/_internal/utils/formatting.py +1 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +166 -184
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +6 -49
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +1 -3
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +5 -2
- snowflake/ml/model/_model_composer/model_composer.py +7 -5
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +1 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +13 -1
- snowflake/ml/model/_packager/model_handlers/xgboost.py +1 -1
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/custom_model.py +3 -1
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/model_specifications.py +3 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +545 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +8 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +24 -6
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/impute/simple_imputer.py +4 -15
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +198 -125
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +198 -125
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/_manager/model_manager.py +5 -1
- snowflake/ml/registry/model_registry.py +99 -26
- snowflake/ml/registry/registry.py +3 -2
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.3.1.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +94 -55
- {snowflake_ml_python-1.3.1.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +218 -212
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.3.1.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.3.1.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.3.1.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -196,12 +195,7 @@ class VarianceThreshold(BaseTransformer):
|
|
196
195
|
)
|
197
196
|
return selected_cols
|
198
197
|
|
199
|
-
|
200
|
-
project=_PROJECT,
|
201
|
-
subproject=_SUBPROJECT,
|
202
|
-
custom_tags=dict([("autogen", True)]),
|
203
|
-
)
|
204
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "VarianceThreshold":
|
198
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "VarianceThreshold":
|
205
199
|
"""Learn empirical variances from X
|
206
200
|
For more details on this function, see [sklearn.feature_selection.VarianceThreshold.fit]
|
207
201
|
(https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html#sklearn.feature_selection.VarianceThreshold.fit)
|
@@ -228,12 +222,14 @@ class VarianceThreshold(BaseTransformer):
|
|
228
222
|
|
229
223
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
230
224
|
|
231
|
-
|
225
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
232
226
|
if SNOWML_SPROC_ENV in os.environ:
|
233
227
|
statement_params = telemetry.get_function_usage_statement_params(
|
234
228
|
project=_PROJECT,
|
235
229
|
subproject=_SUBPROJECT,
|
236
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
230
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
231
|
+
inspect.currentframe(), VarianceThreshold.__class__.__name__
|
232
|
+
),
|
237
233
|
api_calls=[Session.call],
|
238
234
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
239
235
|
)
|
@@ -254,7 +250,7 @@ class VarianceThreshold(BaseTransformer):
|
|
254
250
|
)
|
255
251
|
self._sklearn_object = model_trainer.train()
|
256
252
|
self._is_fitted = True
|
257
|
-
self.
|
253
|
+
self._generate_model_signatures(dataset)
|
258
254
|
return self
|
259
255
|
|
260
256
|
def _batch_inference_validate_snowpark(
|
@@ -328,7 +324,9 @@ class VarianceThreshold(BaseTransformer):
|
|
328
324
|
# when it is classifier, infer the datatype from label columns
|
329
325
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
330
326
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
331
|
-
label_cols_signatures = [
|
327
|
+
label_cols_signatures = [
|
328
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
329
|
+
]
|
332
330
|
if len(label_cols_signatures) == 0:
|
333
331
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
334
332
|
raise exceptions.SnowflakeMLException(
|
@@ -336,25 +334,22 @@ class VarianceThreshold(BaseTransformer):
|
|
336
334
|
original_exception=ValueError(error_str),
|
337
335
|
)
|
338
336
|
|
339
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
340
|
-
label_cols_signatures[0].as_snowpark_type()
|
341
|
-
)
|
337
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
342
338
|
|
343
339
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
344
|
-
assert isinstance(
|
340
|
+
assert isinstance(
|
341
|
+
dataset._session, Session
|
342
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
345
343
|
|
346
344
|
transform_kwargs = dict(
|
347
|
-
session
|
348
|
-
dependencies
|
349
|
-
drop_input_cols
|
350
|
-
expected_output_cols_type
|
345
|
+
session=dataset._session,
|
346
|
+
dependencies=self._deps,
|
347
|
+
drop_input_cols=self._drop_input_cols,
|
348
|
+
expected_output_cols_type=expected_type_inferred,
|
351
349
|
)
|
352
350
|
|
353
351
|
elif isinstance(dataset, pd.DataFrame):
|
354
|
-
transform_kwargs = dict(
|
355
|
-
snowpark_input_cols = self._snowpark_cols,
|
356
|
-
drop_input_cols = self._drop_input_cols
|
357
|
-
)
|
352
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
358
353
|
|
359
354
|
transform_handlers = ModelTransformerBuilder.build(
|
360
355
|
dataset=dataset,
|
@@ -396,7 +391,7 @@ class VarianceThreshold(BaseTransformer):
|
|
396
391
|
Transformed dataset.
|
397
392
|
"""
|
398
393
|
super()._check_dataset_type(dataset)
|
399
|
-
inference_method="transform"
|
394
|
+
inference_method = "transform"
|
400
395
|
|
401
396
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
402
397
|
# are specific to the type of dataset used.
|
@@ -433,17 +428,14 @@ class VarianceThreshold(BaseTransformer):
|
|
433
428
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
434
429
|
|
435
430
|
transform_kwargs = dict(
|
436
|
-
session
|
437
|
-
dependencies
|
438
|
-
drop_input_cols
|
439
|
-
expected_output_cols_type
|
431
|
+
session=dataset._session,
|
432
|
+
dependencies=self._deps,
|
433
|
+
drop_input_cols=self._drop_input_cols,
|
434
|
+
expected_output_cols_type=expected_dtype,
|
440
435
|
)
|
441
436
|
|
442
437
|
elif isinstance(dataset, pd.DataFrame):
|
443
|
-
transform_kwargs = dict(
|
444
|
-
snowpark_input_cols = self._snowpark_cols,
|
445
|
-
drop_input_cols = self._drop_input_cols
|
446
|
-
)
|
438
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
447
439
|
|
448
440
|
transform_handlers = ModelTransformerBuilder.build(
|
449
441
|
dataset=dataset,
|
@@ -462,7 +454,11 @@ class VarianceThreshold(BaseTransformer):
|
|
462
454
|
return output_df
|
463
455
|
|
464
456
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
465
|
-
def fit_predict(
|
457
|
+
def fit_predict(
|
458
|
+
self,
|
459
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
460
|
+
output_cols_prefix: str = "fit_predict_",
|
461
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
466
462
|
""" Method not supported for this class.
|
467
463
|
|
468
464
|
|
@@ -487,7 +483,9 @@ class VarianceThreshold(BaseTransformer):
|
|
487
483
|
)
|
488
484
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
489
485
|
drop_input_cols=self._drop_input_cols,
|
490
|
-
expected_output_cols_list=
|
486
|
+
expected_output_cols_list=(
|
487
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
488
|
+
),
|
491
489
|
)
|
492
490
|
self._sklearn_object = fitted_estimator
|
493
491
|
self._is_fitted = True
|
@@ -504,6 +502,62 @@ class VarianceThreshold(BaseTransformer):
|
|
504
502
|
assert self._sklearn_object is not None
|
505
503
|
return self._sklearn_object.embedding_
|
506
504
|
|
505
|
+
|
506
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
507
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
508
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
509
|
+
"""
|
510
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
511
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
512
|
+
if output_cols:
|
513
|
+
output_cols = [
|
514
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
515
|
+
for c in output_cols
|
516
|
+
]
|
517
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
518
|
+
output_cols = [output_cols_prefix]
|
519
|
+
elif self._sklearn_object is not None:
|
520
|
+
classes = self._sklearn_object.classes_
|
521
|
+
if isinstance(classes, numpy.ndarray):
|
522
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
523
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
524
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
525
|
+
output_cols = []
|
526
|
+
for i, cl in enumerate(classes):
|
527
|
+
# For binary classification, there is only one output column for each class
|
528
|
+
# ndarray as the two classes are complementary.
|
529
|
+
if len(cl) == 2:
|
530
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
531
|
+
else:
|
532
|
+
output_cols.extend([
|
533
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
534
|
+
])
|
535
|
+
else:
|
536
|
+
output_cols = []
|
537
|
+
|
538
|
+
# Make sure column names are valid snowflake identifiers.
|
539
|
+
assert output_cols is not None # Make MyPy happy
|
540
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
541
|
+
|
542
|
+
return rv
|
543
|
+
|
544
|
+
def _align_expected_output_names(
|
545
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
546
|
+
) -> List[str]:
|
547
|
+
# in case the inferred output column names dimension is different
|
548
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
549
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
550
|
+
output_df_columns = list(output_df_pd.columns)
|
551
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
552
|
+
if self.sample_weight_col:
|
553
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
554
|
+
# if the dimension of inferred output column names is correct; use it
|
555
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
556
|
+
return expected_output_cols_list
|
557
|
+
# otherwise, use the sklearn estimator's output
|
558
|
+
else:
|
559
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
560
|
+
|
507
561
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
508
562
|
@telemetry.send_api_usage_telemetry(
|
509
563
|
project=_PROJECT,
|
@@ -534,24 +588,28 @@ class VarianceThreshold(BaseTransformer):
|
|
534
588
|
# are specific to the type of dataset used.
|
535
589
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
536
590
|
|
591
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
592
|
+
|
537
593
|
if isinstance(dataset, DataFrame):
|
538
594
|
self._deps = self._batch_inference_validate_snowpark(
|
539
595
|
dataset=dataset,
|
540
596
|
inference_method=inference_method,
|
541
597
|
)
|
542
|
-
assert isinstance(
|
598
|
+
assert isinstance(
|
599
|
+
dataset._session, Session
|
600
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
543
601
|
transform_kwargs = dict(
|
544
602
|
session=dataset._session,
|
545
603
|
dependencies=self._deps,
|
546
|
-
drop_input_cols
|
604
|
+
drop_input_cols=self._drop_input_cols,
|
547
605
|
expected_output_cols_type="float",
|
548
606
|
)
|
607
|
+
expected_output_cols = self._align_expected_output_names(
|
608
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
609
|
+
)
|
549
610
|
|
550
611
|
elif isinstance(dataset, pd.DataFrame):
|
551
|
-
transform_kwargs = dict(
|
552
|
-
snowpark_input_cols = self._snowpark_cols,
|
553
|
-
drop_input_cols = self._drop_input_cols
|
554
|
-
)
|
612
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
555
613
|
|
556
614
|
transform_handlers = ModelTransformerBuilder.build(
|
557
615
|
dataset=dataset,
|
@@ -563,7 +621,7 @@ class VarianceThreshold(BaseTransformer):
|
|
563
621
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
564
622
|
inference_method=inference_method,
|
565
623
|
input_cols=self.input_cols,
|
566
|
-
expected_output_cols=
|
624
|
+
expected_output_cols=expected_output_cols,
|
567
625
|
**transform_kwargs
|
568
626
|
)
|
569
627
|
return output_df
|
@@ -593,7 +651,8 @@ class VarianceThreshold(BaseTransformer):
|
|
593
651
|
Output dataset with log probability of the sample for each class in the model.
|
594
652
|
"""
|
595
653
|
super()._check_dataset_type(dataset)
|
596
|
-
inference_method="predict_log_proba"
|
654
|
+
inference_method = "predict_log_proba"
|
655
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
597
656
|
|
598
657
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
599
658
|
# are specific to the type of dataset used.
|
@@ -604,18 +663,20 @@ class VarianceThreshold(BaseTransformer):
|
|
604
663
|
dataset=dataset,
|
605
664
|
inference_method=inference_method,
|
606
665
|
)
|
607
|
-
assert isinstance(
|
666
|
+
assert isinstance(
|
667
|
+
dataset._session, Session
|
668
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
608
669
|
transform_kwargs = dict(
|
609
670
|
session=dataset._session,
|
610
671
|
dependencies=self._deps,
|
611
|
-
drop_input_cols
|
672
|
+
drop_input_cols=self._drop_input_cols,
|
612
673
|
expected_output_cols_type="float",
|
613
674
|
)
|
675
|
+
expected_output_cols = self._align_expected_output_names(
|
676
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
677
|
+
)
|
614
678
|
elif isinstance(dataset, pd.DataFrame):
|
615
|
-
transform_kwargs = dict(
|
616
|
-
snowpark_input_cols = self._snowpark_cols,
|
617
|
-
drop_input_cols = self._drop_input_cols
|
618
|
-
)
|
679
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
619
680
|
|
620
681
|
transform_handlers = ModelTransformerBuilder.build(
|
621
682
|
dataset=dataset,
|
@@ -628,7 +689,7 @@ class VarianceThreshold(BaseTransformer):
|
|
628
689
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
629
690
|
inference_method=inference_method,
|
630
691
|
input_cols=self.input_cols,
|
631
|
-
expected_output_cols=
|
692
|
+
expected_output_cols=expected_output_cols,
|
632
693
|
**transform_kwargs
|
633
694
|
)
|
634
695
|
return output_df
|
@@ -654,30 +715,34 @@ class VarianceThreshold(BaseTransformer):
|
|
654
715
|
Output dataset with results of the decision function for the samples in input dataset.
|
655
716
|
"""
|
656
717
|
super()._check_dataset_type(dataset)
|
657
|
-
inference_method="decision_function"
|
718
|
+
inference_method = "decision_function"
|
658
719
|
|
659
720
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
660
721
|
# are specific to the type of dataset used.
|
661
722
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
662
723
|
|
724
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
725
|
+
|
663
726
|
if isinstance(dataset, DataFrame):
|
664
727
|
self._deps = self._batch_inference_validate_snowpark(
|
665
728
|
dataset=dataset,
|
666
729
|
inference_method=inference_method,
|
667
730
|
)
|
668
|
-
assert isinstance(
|
731
|
+
assert isinstance(
|
732
|
+
dataset._session, Session
|
733
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
669
734
|
transform_kwargs = dict(
|
670
735
|
session=dataset._session,
|
671
736
|
dependencies=self._deps,
|
672
|
-
drop_input_cols
|
737
|
+
drop_input_cols=self._drop_input_cols,
|
673
738
|
expected_output_cols_type="float",
|
674
739
|
)
|
740
|
+
expected_output_cols = self._align_expected_output_names(
|
741
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
742
|
+
)
|
675
743
|
|
676
744
|
elif isinstance(dataset, pd.DataFrame):
|
677
|
-
transform_kwargs = dict(
|
678
|
-
snowpark_input_cols = self._snowpark_cols,
|
679
|
-
drop_input_cols = self._drop_input_cols
|
680
|
-
)
|
745
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
681
746
|
|
682
747
|
transform_handlers = ModelTransformerBuilder.build(
|
683
748
|
dataset=dataset,
|
@@ -690,7 +755,7 @@ class VarianceThreshold(BaseTransformer):
|
|
690
755
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
691
756
|
inference_method=inference_method,
|
692
757
|
input_cols=self.input_cols,
|
693
|
-
expected_output_cols=
|
758
|
+
expected_output_cols=expected_output_cols,
|
694
759
|
**transform_kwargs
|
695
760
|
)
|
696
761
|
return output_df
|
@@ -719,12 +784,14 @@ class VarianceThreshold(BaseTransformer):
|
|
719
784
|
Output dataset with probability of the sample for each class in the model.
|
720
785
|
"""
|
721
786
|
super()._check_dataset_type(dataset)
|
722
|
-
inference_method="score_samples"
|
787
|
+
inference_method = "score_samples"
|
723
788
|
|
724
789
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
725
790
|
# are specific to the type of dataset used.
|
726
791
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
727
792
|
|
793
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
794
|
+
|
728
795
|
if isinstance(dataset, DataFrame):
|
729
796
|
self._deps = self._batch_inference_validate_snowpark(
|
730
797
|
dataset=dataset,
|
@@ -737,6 +804,9 @@ class VarianceThreshold(BaseTransformer):
|
|
737
804
|
drop_input_cols = self._drop_input_cols,
|
738
805
|
expected_output_cols_type="float",
|
739
806
|
)
|
807
|
+
expected_output_cols = self._align_expected_output_names(
|
808
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
809
|
+
)
|
740
810
|
|
741
811
|
elif isinstance(dataset, pd.DataFrame):
|
742
812
|
transform_kwargs = dict(
|
@@ -755,7 +825,7 @@ class VarianceThreshold(BaseTransformer):
|
|
755
825
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
756
826
|
inference_method=inference_method,
|
757
827
|
input_cols=self.input_cols,
|
758
|
-
expected_output_cols=
|
828
|
+
expected_output_cols=expected_output_cols,
|
759
829
|
**transform_kwargs
|
760
830
|
)
|
761
831
|
return output_df
|
@@ -900,50 +970,84 @@ class VarianceThreshold(BaseTransformer):
|
|
900
970
|
)
|
901
971
|
return output_df
|
902
972
|
|
973
|
+
|
974
|
+
|
975
|
+
def to_sklearn(self) -> Any:
|
976
|
+
"""Get sklearn.feature_selection.VarianceThreshold object.
|
977
|
+
"""
|
978
|
+
if self._sklearn_object is None:
|
979
|
+
self._sklearn_object = self._create_sklearn_object()
|
980
|
+
return self._sklearn_object
|
981
|
+
|
982
|
+
def to_xgboost(self) -> Any:
|
983
|
+
raise exceptions.SnowflakeMLException(
|
984
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
985
|
+
original_exception=AttributeError(
|
986
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
987
|
+
"to_xgboost()",
|
988
|
+
"to_sklearn()"
|
989
|
+
)
|
990
|
+
),
|
991
|
+
)
|
992
|
+
|
993
|
+
def to_lightgbm(self) -> Any:
|
994
|
+
raise exceptions.SnowflakeMLException(
|
995
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
996
|
+
original_exception=AttributeError(
|
997
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
998
|
+
"to_lightgbm()",
|
999
|
+
"to_sklearn()"
|
1000
|
+
)
|
1001
|
+
),
|
1002
|
+
)
|
903
1003
|
|
904
|
-
def
|
1004
|
+
def _get_dependencies(self) -> List[str]:
|
1005
|
+
return self._deps
|
1006
|
+
|
1007
|
+
|
1008
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
905
1009
|
self._model_signature_dict = dict()
|
906
1010
|
|
907
1011
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
908
1012
|
|
909
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1013
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
910
1014
|
outputs: List[BaseFeatureSpec] = []
|
911
1015
|
if hasattr(self, "predict"):
|
912
1016
|
# keep mypy happy
|
913
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1017
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
914
1018
|
# For classifier, the type of predict is the same as the type of label
|
915
|
-
if self._sklearn_object._estimator_type ==
|
916
|
-
|
1019
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1020
|
+
# label columns is the desired type for output
|
917
1021
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
918
1022
|
# rename the output columns
|
919
1023
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
920
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
921
|
-
|
922
|
-
|
1024
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1025
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1026
|
+
)
|
923
1027
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
924
1028
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
925
|
-
# Clusterer returns int64 cluster labels.
|
1029
|
+
# Clusterer returns int64 cluster labels.
|
926
1030
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
927
1031
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
928
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
929
|
-
|
930
|
-
|
931
|
-
|
1032
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1033
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1034
|
+
)
|
1035
|
+
|
932
1036
|
# For regressor, the type of predict is float64
|
933
|
-
elif self._sklearn_object._estimator_type ==
|
1037
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
934
1038
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
935
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
936
|
-
|
937
|
-
|
938
|
-
|
1039
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1040
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1041
|
+
)
|
1042
|
+
|
939
1043
|
for prob_func in PROB_FUNCTIONS:
|
940
1044
|
if hasattr(self, prob_func):
|
941
1045
|
output_cols_prefix: str = f"{prob_func}_"
|
942
1046
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
943
1047
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
944
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
945
|
-
|
946
|
-
|
1048
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1049
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1050
|
+
)
|
947
1051
|
|
948
1052
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
949
1053
|
items = list(self._model_signature_dict.items())
|
@@ -956,10 +1060,10 @@ class VarianceThreshold(BaseTransformer):
|
|
956
1060
|
"""Returns model signature of current class.
|
957
1061
|
|
958
1062
|
Raises:
|
959
|
-
|
1063
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
960
1064
|
|
961
1065
|
Returns:
|
962
|
-
Dict
|
1066
|
+
Dict with each method and its input output signature
|
963
1067
|
"""
|
964
1068
|
if self._model_signature_dict is None:
|
965
1069
|
raise exceptions.SnowflakeMLException(
|
@@ -967,35 +1071,3 @@ class VarianceThreshold(BaseTransformer):
|
|
967
1071
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
968
1072
|
)
|
969
1073
|
return self._model_signature_dict
|
970
|
-
|
971
|
-
def to_sklearn(self) -> Any:
|
972
|
-
"""Get sklearn.feature_selection.VarianceThreshold object.
|
973
|
-
"""
|
974
|
-
if self._sklearn_object is None:
|
975
|
-
self._sklearn_object = self._create_sklearn_object()
|
976
|
-
return self._sklearn_object
|
977
|
-
|
978
|
-
def to_xgboost(self) -> Any:
|
979
|
-
raise exceptions.SnowflakeMLException(
|
980
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
981
|
-
original_exception=AttributeError(
|
982
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
983
|
-
"to_xgboost()",
|
984
|
-
"to_sklearn()"
|
985
|
-
)
|
986
|
-
),
|
987
|
-
)
|
988
|
-
|
989
|
-
def to_lightgbm(self) -> Any:
|
990
|
-
raise exceptions.SnowflakeMLException(
|
991
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
992
|
-
original_exception=AttributeError(
|
993
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
994
|
-
"to_lightgbm()",
|
995
|
-
"to_sklearn()"
|
996
|
-
)
|
997
|
-
),
|
998
|
-
)
|
999
|
-
|
1000
|
-
def _get_dependencies(self) -> List[str]:
|
1001
|
-
return self._deps
|
@@ -200,7 +200,7 @@ def get_filtered_valid_sklearn_args(
|
|
200
200
|
):
|
201
201
|
deprecated_version = sklearn_deprecated_keyword_to_version_dict[key]
|
202
202
|
msg = f"Incompatible scikit-learn version: '{key}' deprecated since scikit-learn={deprecated_version}.."
|
203
|
-
warnings.warn(msg, DeprecationWarning)
|
203
|
+
warnings.warn(msg, DeprecationWarning, stacklevel=2)
|
204
204
|
|
205
205
|
# removed sklearn keyword
|
206
206
|
if (
|
@@ -247,3 +247,10 @@ def table_exists(session: snowpark.Session, table_name: str, statement_params: D
|
|
247
247
|
return True
|
248
248
|
except snowpark_exceptions.SnowparkSQLException:
|
249
249
|
return False
|
250
|
+
|
251
|
+
|
252
|
+
def to_float_if_valid(val: Any, col: str, stat: str) -> float:
|
253
|
+
try:
|
254
|
+
return float(val)
|
255
|
+
except TypeError:
|
256
|
+
raise TypeError(f"Invalid stat: {stat}[{col}]: {val} cannot be converted to float.")
|
@@ -51,8 +51,8 @@ class Base:
|
|
51
51
|
input_cols: Input columns.
|
52
52
|
output_cols: Output columns.
|
53
53
|
label_cols: Label column(s).
|
54
|
-
passthrough_cols: List columns not to be used or modified by the estimator/
|
55
|
-
These columns will be passed through all the estimator/
|
54
|
+
passthrough_cols: List columns not to be used or modified by the estimator/transformers.
|
55
|
+
These columns will be passed through all the estimator/transformer operations without any modifications.
|
56
56
|
"""
|
57
57
|
self.input_cols: List[str] = []
|
58
58
|
self.output_cols: List[str] = []
|
@@ -185,7 +185,10 @@ class Base:
|
|
185
185
|
error_code=error_codes.INVALID_ATTRIBUTE,
|
186
186
|
original_exception=RuntimeError(
|
187
187
|
modeling_error_messages.SIZE_MISMATCH.format(
|
188
|
-
"input_cols",
|
188
|
+
"input_cols",
|
189
|
+
len(self.input_cols),
|
190
|
+
"output_cols",
|
191
|
+
len(self.output_cols),
|
189
192
|
)
|
190
193
|
),
|
191
194
|
)
|
@@ -416,8 +419,16 @@ class BaseEstimator(Base):
|
|
416
419
|
"""
|
417
420
|
return []
|
418
421
|
|
422
|
+
@telemetry.send_api_usage_telemetry(
|
423
|
+
project=PROJECT,
|
424
|
+
subproject=SUBPROJECT,
|
425
|
+
)
|
426
|
+
def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "BaseEstimator":
|
427
|
+
"""Runs universal logics for all fit implementations."""
|
428
|
+
return self._fit(dataset)
|
429
|
+
|
419
430
|
@abstractmethod
|
420
|
-
def
|
431
|
+
def _fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "BaseEstimator":
|
421
432
|
raise NotImplementedError()
|
422
433
|
|
423
434
|
def _use_input_cols_only(self, dataset: pd.DataFrame) -> pd.DataFrame:
|
@@ -498,7 +509,11 @@ class BaseTransformer(BaseEstimator):
|
|
498
509
|
sample_weight_col: Optional[str] = None,
|
499
510
|
) -> None:
|
500
511
|
"""Base class for all transformers."""
|
501
|
-
super().__init__(
|
512
|
+
super().__init__(
|
513
|
+
file_names=file_names,
|
514
|
+
custom_states=custom_states,
|
515
|
+
sample_weight_col=sample_weight_col,
|
516
|
+
)
|
502
517
|
self._sklearn_object = None
|
503
518
|
self._is_fitted = False
|
504
519
|
self._drop_input_cols = drop_input_cols
|
@@ -705,7 +720,10 @@ class BaseTransformer(BaseEstimator):
|
|
705
720
|
error_code=error_codes.INVALID_ATTRIBUTE,
|
706
721
|
original_exception=RuntimeError(
|
707
722
|
modeling_error_messages.SIZE_MISMATCH.format(
|
708
|
-
"output_cols",
|
723
|
+
"output_cols",
|
724
|
+
len(self.output_cols),
|
725
|
+
"transformed array shape",
|
726
|
+
shape,
|
709
727
|
)
|
710
728
|
),
|
711
729
|
)
|