snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/decomposition/incremental_pca.py

@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
     BatchInferenceKwargsTypedDict,
     ScoreKwargsTypedDict
 )
+from snowflake.ml.model._signatures import utils as model_signature_utils
+from snowflake.ml.model.model_signature import (
+    BaseFeatureSpec,
+    DataType,
+    FeatureSpec,
+    ModelSignature,
+    _infer_signature,
+    _rename_signature_with_snowflake_identifiers,
+)
 
 from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
 
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     validate_sklearn_args,
 )
 
-from snowflake.ml.model.model_signature import (
-    DataType,
-    FeatureSpec,
-    ModelSignature,
-    _infer_signature,
-    _rename_signature_with_snowflake_identifiers,
-    BaseFeatureSpec,
-)
-from snowflake.ml.model._signatures import utils as model_signature_utils
-
 _PROJECT = "ModelDevelopment"
 # Derive subproject from module name by removing "sklearn"
 # and converting module name from underscore to CamelCase
@@ -221,12 +220,7 @@ class IncrementalPCA(BaseTransformer):
         )
         return selected_cols
 
-    @telemetry.send_api_usage_telemetry(
-        project=_PROJECT,
-        subproject=_SUBPROJECT,
-        custom_tags=dict([("autogen", True)]),
-    )
-    def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "IncrementalPCA":
+    def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "IncrementalPCA":
         """Fit the model with X, using minibatches of size batch_size
         For more details on this function, see [sklearn.decomposition.IncrementalPCA.fit]
         (https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA.fit)
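A schematic reading of this rename, offered as an assumption rather than taken from the package source: with the telemetry decorator gone and `fit` renamed to `_fit`, the public `fit()` plausibly moves to the shared `BaseTransformer` (consistent with the `snowflake/ml/modeling/framework/base.py` change in the file list above), leaving generated estimators to override only `_fit()`. A minimal sketch of that split; both classes below are illustrative, not the real API:

```python
from typing import Any


class BaseTransformerSketch:
    """Hypothetical base class: shared bookkeeping runs once in fit()."""

    def fit(self, dataset: Any) -> "BaseTransformerSketch":
        # Telemetry/validation would be applied here, in one place.
        return self._fit(dataset)

    def _fit(self, dataset: Any) -> "BaseTransformerSketch":
        raise NotImplementedError


class IncrementalPCASketch(BaseTransformerSketch):
    def _fit(self, dataset: Any) -> "IncrementalPCASketch":
        # Subclasses now implement only the training step.
        print(f"fitting on {len(dataset)} rows")
        return self


IncrementalPCASketch().fit([[1.0, 2.0], [3.0, 4.0]])  # prints: fitting on 2 rows
```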
@@ -253,12 +247,14 @@
 
         self._snowpark_cols = dataset.select(self.input_cols).columns
 
-
+        # If we are already in a stored procedure, no need to kick off another one.
         if SNOWML_SPROC_ENV in os.environ:
             statement_params = telemetry.get_function_usage_statement_params(
                 project=_PROJECT,
                 subproject=_SUBPROJECT,
-                function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), IncrementalPCA.__class__.__name__),
+                function_name=telemetry.get_statement_params_full_func_name(
+                    inspect.currentframe(), IncrementalPCA.__class__.__name__
+                ),
                 api_calls=[Session.call],
                 custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
             )
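A minimal sketch of the guard documented by the new comment above, assuming only that `SNOWML_SPROC_ENV` names an environment variable (its value is not shown in this diff); `run_inline` and `launch_sproc` are stand-ins for the real trainer paths:

```python
import os


def train(env_var: str, run_inline, launch_sproc):
    # Illustrative dispatch: train in-process when already inside a stored
    # procedure, otherwise kick one off.
    if env_var in os.environ:
        return run_inline()
    return launch_sproc()
```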
@@ -279,7 +275,7 @@ class IncrementalPCA(BaseTransformer):
             )
         self._sklearn_object = model_trainer.train()
         self._is_fitted = True
-        self._get_model_signatures(dataset)
+        self._generate_model_signatures(dataset)
         return self
 
     def _batch_inference_validate_snowpark(
@@ -353,7 +349,9 @@ class IncrementalPCA(BaseTransformer):
             # when it is classifier, infer the datatype from label columns
             if expected_type_inferred == "" and 'predict' in self.model_signatures:
                 # Batch inference takes a single expected output column type. Use the first columns type for now.
-                label_cols_signatures = [row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols]
+                label_cols_signatures = [
+                    row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
+                ]
                 if len(label_cols_signatures) == 0:
                     error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
                     raise exceptions.SnowflakeMLException(
@@ -361,25 +359,22 @@ class IncrementalPCA(BaseTransformer):
                         original_exception=ValueError(error_str),
                     )
 
-                expected_type_inferred = convert_sp_to_sf_type(
-                    label_cols_signatures[0].as_snowpark_type()
-                )
+                expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
 
             self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
 
             transform_kwargs = dict(
-                session = dataset._session,
-                dependencies = self._deps,
-                drop_input_cols = self._drop_input_cols,
-                expected_output_cols_type = expected_type_inferred,
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_type_inferred,
             )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
|
|
421
416
|
Transformed dataset.
|
422
417
|
"""
|
423
418
|
super()._check_dataset_type(dataset)
|
424
|
-
inference_method="transform"
|
419
|
+
inference_method = "transform"
|
425
420
|
|
426
421
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
427
422
|
# are specific to the type of dataset used.
|
@@ -458,17 +453,14 @@ class IncrementalPCA(BaseTransformer):
             assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
 
             transform_kwargs = dict(
-                session = dataset._session,
-                dependencies = self._deps,
-                drop_input_cols = self._drop_input_cols,
-                expected_output_cols_type = expected_dtype,
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_dtype,
             )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -487,7 +479,11 @@ class IncrementalPCA(BaseTransformer):
         return output_df
 
     @available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
-    def fit_predict(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_predict_") -> Union[DataFrame, pd.DataFrame]:
+    def fit_predict(
+        self,
+        dataset: Union[DataFrame, pd.DataFrame],
+        output_cols_prefix: str = "fit_predict_",
+    ) -> Union[DataFrame, pd.DataFrame]:
         """ Method not supported for this class.
 
 
@@ -512,7 +508,9 @@ class IncrementalPCA(BaseTransformer):
             )
             output_result, fitted_estimator = model_trainer.train_fit_predict(
                 drop_input_cols=self._drop_input_cols,
-                expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
+                expected_output_cols_list=(
+                    self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
+                ),
             )
             self._sklearn_object = fitted_estimator
             self._is_fitted = True
@@ -529,6 +527,62 @@ class IncrementalPCA(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
+
+    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
+        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
+        """
+        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
+        # The following condition is introduced for kneighbors methods, and not used in other methods
+        if output_cols:
+            output_cols = [
+                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
+                for c in output_cols
+            ]
+        elif getattr(self._sklearn_object, "classes_", None) is None:
+            output_cols = [output_cols_prefix]
+        elif self._sklearn_object is not None:
+            classes = self._sklearn_object.classes_
+            if isinstance(classes, numpy.ndarray):
+                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
+            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
+                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
+                output_cols = []
+                for i, cl in enumerate(classes):
+                    # For binary classification, there is only one output column for each class
+                    # ndarray as the two classes are complementary.
+                    if len(cl) == 2:
+                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
+                    else:
+                        output_cols.extend([
+                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
+                        ])
+            else:
+                output_cols = []
+
+        # Make sure column names are valid snowflake identifiers.
+        assert output_cols is not None # Make MyPy happy
+        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
+
+        return rv
+
+    def _align_expected_output_names(
+        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
+    ) -> List[str]:
+        # in case the inferred output column names dimension is different
+        # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
+        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        output_df_columns = list(output_df_pd.columns)
+        output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
+        if self.sample_weight_col:
+            output_df_columns_set -= set(self.sample_weight_col)
+        # if the dimension of inferred output column names is correct; use it
+        if len(expected_output_cols_list) == len(output_df_columns_set):
+            return expected_output_cols_list
+        # otherwise, use the sklearn estimator's output
+        else:
+            return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
+
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
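The naming scheme `_get_output_column_names()` implements can be mirrored in a standalone sketch; the helper below is hypothetical and omits the `identifier.resolve_identifier()` / `rename_to_valid_snowflake_identifier()` normalization the real method performs:

```python
from typing import List, Optional

import numpy as np


def sketch_output_column_names(prefix: str, classes: Optional[object]) -> List[str]:
    if classes is None:
        return [prefix]  # non-classifier: a single column named after the prefix
    if isinstance(classes, np.ndarray):
        return [f"{prefix}{c}" for c in classes.tolist()]  # one column per class
    if isinstance(classes, list) and classes and isinstance(classes[0], np.ndarray):
        # Multioutput estimator: classes_ is a list of ndarrays.
        cols: List[str] = []
        for i, cl in enumerate(classes):
            if len(cl) == 2:
                cols.append(f"{prefix}{i}_{cl[0]}")  # binary: classes are complementary
            else:
                cols.extend(f"{prefix}{i}_{c}" for c in cl.tolist())
        return cols
    return []


print(sketch_output_column_names("predict_proba_", np.array([0, 1, 2])))
# ['predict_proba_0', 'predict_proba_1', 'predict_proba_2']
```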
@@ -559,24 +613,28 @@ class IncrementalPCA(BaseTransformer):
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -588,7 +646,7 @@ class IncrementalPCA(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
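The fallback rule `_align_expected_output_names()` applies in the hunks above reduces to a small decision, sketched here with made-up column names (the real method probes the estimator with one row via `dataset.limit(1).to_pandas()` and also excludes `sample_weight_col`):

```python
from typing import List


def align(expected: List[str], produced_df_cols: List[str], input_cols: List[str]) -> List[str]:
    # Columns the estimator actually produced, beyond the inputs it was given.
    produced = [c for c in produced_df_cols if c not in set(input_cols)]
    if len(expected) == len(produced):
        return expected  # inferred names have the right dimension; keep them
    return produced      # otherwise trust the estimator's own column order


print(align(["PP_0", "PP_1"], ["A", "B", "PP_0", "PP_1", "PP_2"], ["A", "B"]))
# ['PP_0', 'PP_1', 'PP_2'] -- dimension mismatch, so the estimator's names win
```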
@@ -618,7 +676,8 @@ class IncrementalPCA(BaseTransformer):
             Output dataset with log probability of the sample for each class in the model.
         """
         super()._check_dataset_type(dataset)
-        inference_method="predict_log_proba"
+        inference_method = "predict_log_proba"
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
@@ -629,18 +688,20 @@ class IncrementalPCA(BaseTransformer):
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -653,7 +714,7 @@ class IncrementalPCA(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -679,30 +740,34 @@ class IncrementalPCA(BaseTransformer):
             Output dataset with results of the decision function for the samples in input dataset.
         """
         super()._check_dataset_type(dataset)
-        inference_method="decision_function"
+        inference_method = "decision_function"
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -715,7 +780,7 @@ class IncrementalPCA(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -744,12 +809,14 @@
             Output dataset with probability of the sample for each class in the model.
         """
         super()._check_dataset_type(dataset)
-        inference_method="score_samples"
+        inference_method = "score_samples"
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
@@ -762,6 +829,9 @@
                 drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
             transform_kwargs = dict(
@@ -780,7 +850,7 @@
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -925,50 +995,84 @@
         )
         return output_df
 
+
+    def to_sklearn(self) -> Any:
+        """Get sklearn.decomposition.IncrementalPCA object.
+        """
+        if self._sklearn_object is None:
+            self._sklearn_object = self._create_sklearn_object()
+        return self._sklearn_object
+
+    def to_xgboost(self) -> Any:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.METHOD_NOT_ALLOWED,
+            original_exception=AttributeError(
+                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
+                    "to_xgboost()",
+                    "to_sklearn()"
+                )
+            ),
+        )
+
+    def to_lightgbm(self) -> Any:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.METHOD_NOT_ALLOWED,
+            original_exception=AttributeError(
+                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
+                    "to_lightgbm()",
+                    "to_sklearn()"
+                )
+            ),
+        )
 
-    def _get_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
+    def _get_dependencies(self) -> List[str]:
+        return self._deps
+
+
+    def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         self._model_signature_dict = dict()
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input"))
+        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
-            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
+            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
             # For classifier, the type of predict is the same as the type of label
-            if self._sklearn_object._estimator_type == 'classifier':
-                # label columns is the desired type for output
+            if self._sklearn_object._estimator_type == "classifier":
+                # label columns is the desired type for output
                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
-                self._model_signature_dict["predict"] = ModelSignature(inputs,
-                                                                       ([] if self._drop_input_cols else inputs)
-                                                                       + outputs)
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
             # For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
             # For outlier models, returns -1 for outliers and 1 for inliers.
-            # Clusterer returns int64 cluster labels.
+            # Clusterer returns int64 cluster labels.
             elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
                 outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(inputs,
-                                                                       ([] if self._drop_input_cols else inputs)
-                                                                       + outputs)
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
             # For regressor, the type of predict is float64
-            elif self._sklearn_object._estimator_type == 'regressor':
+            elif self._sklearn_object._estimator_type == "regressor":
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(inputs,
-                                                                       ([] if self._drop_input_cols else inputs)
-                                                                       + outputs)
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs,
-                                                                       ([] if self._drop_input_cols else inputs)
-                                                                       + outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
 
         # Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
         items = list(self._model_signature_dict.items())
@@ -981,10 +1085,10 @@
         """Returns model signature of current class.
 
         Raises:
-
+            SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
 
         Returns:
-            Dict
+            Dict with each method and its input output signature
         """
         if self._model_signature_dict is None:
             raise exceptions.SnowflakeMLException(
@@ -992,35 +1096,3 @@
                 original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
             )
         return self._model_signature_dict
-
-    def to_sklearn(self) -> Any:
-        """Get sklearn.decomposition.IncrementalPCA object.
-        """
-        if self._sklearn_object is None:
-            self._sklearn_object = self._create_sklearn_object()
-        return self._sklearn_object
-
-    def to_xgboost(self) -> Any:
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.METHOD_NOT_ALLOWED,
-            original_exception=AttributeError(
-                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
-                    "to_xgboost()",
-                    "to_sklearn()"
-                )
-            ),
-        )
-
-    def to_lightgbm(self) -> Any:
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.METHOD_NOT_ALLOWED,
-            original_exception=AttributeError(
-                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
-                    "to_lightgbm()",
-                    "to_sklearn()"
-                )
-            ),
-        )
-
-    def _get_dependencies(self) -> List[str]:
-        return self._deps