snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -286,12 +285,7 @@ class PCA(BaseTransformer):
|
|
286
285
|
)
|
287
286
|
return selected_cols
|
288
287
|
|
289
|
-
|
290
|
-
project=_PROJECT,
|
291
|
-
subproject=_SUBPROJECT,
|
292
|
-
custom_tags=dict([("autogen", True)]),
|
293
|
-
)
|
294
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "PCA":
|
288
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "PCA":
|
295
289
|
"""Fit the model with X
|
296
290
|
For more details on this function, see [sklearn.decomposition.PCA.fit]
|
297
291
|
(https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA.fit)
|
@@ -318,12 +312,14 @@ class PCA(BaseTransformer):
|
|
318
312
|
|
319
313
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
320
314
|
|
321
|
-
|
315
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
322
316
|
if SNOWML_SPROC_ENV in os.environ:
|
323
317
|
statement_params = telemetry.get_function_usage_statement_params(
|
324
318
|
project=_PROJECT,
|
325
319
|
subproject=_SUBPROJECT,
|
326
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
320
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
321
|
+
inspect.currentframe(), PCA.__class__.__name__
|
322
|
+
),
|
327
323
|
api_calls=[Session.call],
|
328
324
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
329
325
|
)
|
@@ -344,7 +340,7 @@ class PCA(BaseTransformer):
|
|
344
340
|
)
|
345
341
|
self._sklearn_object = model_trainer.train()
|
346
342
|
self._is_fitted = True
|
347
|
-
self.
|
343
|
+
self._generate_model_signatures(dataset)
|
348
344
|
return self
|
349
345
|
|
350
346
|
def _batch_inference_validate_snowpark(
|
@@ -418,7 +414,9 @@ class PCA(BaseTransformer):
|
|
418
414
|
# when it is classifier, infer the datatype from label columns
|
419
415
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
420
416
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
421
|
-
label_cols_signatures = [
|
417
|
+
label_cols_signatures = [
|
418
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
419
|
+
]
|
422
420
|
if len(label_cols_signatures) == 0:
|
423
421
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
424
422
|
raise exceptions.SnowflakeMLException(
|
@@ -426,25 +424,22 @@ class PCA(BaseTransformer):
|
|
426
424
|
original_exception=ValueError(error_str),
|
427
425
|
)
|
428
426
|
|
429
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
430
|
-
label_cols_signatures[0].as_snowpark_type()
|
431
|
-
)
|
427
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
432
428
|
|
433
429
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
434
|
-
assert isinstance(
|
430
|
+
assert isinstance(
|
431
|
+
dataset._session, Session
|
432
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
435
433
|
|
436
434
|
transform_kwargs = dict(
|
437
|
-
session
|
438
|
-
dependencies
|
439
|
-
drop_input_cols
|
440
|
-
expected_output_cols_type
|
435
|
+
session=dataset._session,
|
436
|
+
dependencies=self._deps,
|
437
|
+
drop_input_cols=self._drop_input_cols,
|
438
|
+
expected_output_cols_type=expected_type_inferred,
|
441
439
|
)
|
442
440
|
|
443
441
|
elif isinstance(dataset, pd.DataFrame):
|
444
|
-
transform_kwargs = dict(
|
445
|
-
snowpark_input_cols = self._snowpark_cols,
|
446
|
-
drop_input_cols = self._drop_input_cols
|
447
|
-
)
|
442
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
448
443
|
|
449
444
|
transform_handlers = ModelTransformerBuilder.build(
|
450
445
|
dataset=dataset,
|
@@ -486,7 +481,7 @@ class PCA(BaseTransformer):
|
|
486
481
|
Transformed dataset.
|
487
482
|
"""
|
488
483
|
super()._check_dataset_type(dataset)
|
489
|
-
inference_method="transform"
|
484
|
+
inference_method = "transform"
|
490
485
|
|
491
486
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
492
487
|
# are specific to the type of dataset used.
|
@@ -523,17 +518,14 @@ class PCA(BaseTransformer):
|
|
523
518
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
524
519
|
|
525
520
|
transform_kwargs = dict(
|
526
|
-
session
|
527
|
-
dependencies
|
528
|
-
drop_input_cols
|
529
|
-
expected_output_cols_type
|
521
|
+
session=dataset._session,
|
522
|
+
dependencies=self._deps,
|
523
|
+
drop_input_cols=self._drop_input_cols,
|
524
|
+
expected_output_cols_type=expected_dtype,
|
530
525
|
)
|
531
526
|
|
532
527
|
elif isinstance(dataset, pd.DataFrame):
|
533
|
-
transform_kwargs = dict(
|
534
|
-
snowpark_input_cols = self._snowpark_cols,
|
535
|
-
drop_input_cols = self._drop_input_cols
|
536
|
-
)
|
528
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
537
529
|
|
538
530
|
transform_handlers = ModelTransformerBuilder.build(
|
539
531
|
dataset=dataset,
|
@@ -552,7 +544,11 @@ class PCA(BaseTransformer):
|
|
552
544
|
return output_df
|
553
545
|
|
554
546
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
555
|
-
def fit_predict(
|
547
|
+
def fit_predict(
|
548
|
+
self,
|
549
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
550
|
+
output_cols_prefix: str = "fit_predict_",
|
551
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
556
552
|
""" Method not supported for this class.
|
557
553
|
|
558
554
|
|
@@ -577,7 +573,9 @@ class PCA(BaseTransformer):
|
|
577
573
|
)
|
578
574
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
579
575
|
drop_input_cols=self._drop_input_cols,
|
580
|
-
expected_output_cols_list=
|
576
|
+
expected_output_cols_list=(
|
577
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
578
|
+
),
|
581
579
|
)
|
582
580
|
self._sklearn_object = fitted_estimator
|
583
581
|
self._is_fitted = True
|
@@ -594,6 +592,62 @@ class PCA(BaseTransformer):
|
|
594
592
|
assert self._sklearn_object is not None
|
595
593
|
return self._sklearn_object.embedding_
|
596
594
|
|
595
|
+
|
596
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
597
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
598
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
599
|
+
"""
|
600
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
601
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
602
|
+
if output_cols:
|
603
|
+
output_cols = [
|
604
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
605
|
+
for c in output_cols
|
606
|
+
]
|
607
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
608
|
+
output_cols = [output_cols_prefix]
|
609
|
+
elif self._sklearn_object is not None:
|
610
|
+
classes = self._sklearn_object.classes_
|
611
|
+
if isinstance(classes, numpy.ndarray):
|
612
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
613
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
614
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
615
|
+
output_cols = []
|
616
|
+
for i, cl in enumerate(classes):
|
617
|
+
# For binary classification, there is only one output column for each class
|
618
|
+
# ndarray as the two classes are complementary.
|
619
|
+
if len(cl) == 2:
|
620
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
621
|
+
else:
|
622
|
+
output_cols.extend([
|
623
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
624
|
+
])
|
625
|
+
else:
|
626
|
+
output_cols = []
|
627
|
+
|
628
|
+
# Make sure column names are valid snowflake identifiers.
|
629
|
+
assert output_cols is not None # Make MyPy happy
|
630
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
631
|
+
|
632
|
+
return rv
|
633
|
+
|
634
|
+
def _align_expected_output_names(
|
635
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
636
|
+
) -> List[str]:
|
637
|
+
# in case the inferred output column names dimension is different
|
638
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
639
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
640
|
+
output_df_columns = list(output_df_pd.columns)
|
641
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
642
|
+
if self.sample_weight_col:
|
643
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
644
|
+
# if the dimension of inferred output column names is correct; use it
|
645
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
646
|
+
return expected_output_cols_list
|
647
|
+
# otherwise, use the sklearn estimator's output
|
648
|
+
else:
|
649
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
650
|
+
|
597
651
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
598
652
|
@telemetry.send_api_usage_telemetry(
|
599
653
|
project=_PROJECT,
|
@@ -624,24 +678,28 @@ class PCA(BaseTransformer):
|
|
624
678
|
# are specific to the type of dataset used.
|
625
679
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
626
680
|
|
681
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
682
|
+
|
627
683
|
if isinstance(dataset, DataFrame):
|
628
684
|
self._deps = self._batch_inference_validate_snowpark(
|
629
685
|
dataset=dataset,
|
630
686
|
inference_method=inference_method,
|
631
687
|
)
|
632
|
-
assert isinstance(
|
688
|
+
assert isinstance(
|
689
|
+
dataset._session, Session
|
690
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
633
691
|
transform_kwargs = dict(
|
634
692
|
session=dataset._session,
|
635
693
|
dependencies=self._deps,
|
636
|
-
drop_input_cols
|
694
|
+
drop_input_cols=self._drop_input_cols,
|
637
695
|
expected_output_cols_type="float",
|
638
696
|
)
|
697
|
+
expected_output_cols = self._align_expected_output_names(
|
698
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
699
|
+
)
|
639
700
|
|
640
701
|
elif isinstance(dataset, pd.DataFrame):
|
641
|
-
transform_kwargs = dict(
|
642
|
-
snowpark_input_cols = self._snowpark_cols,
|
643
|
-
drop_input_cols = self._drop_input_cols
|
644
|
-
)
|
702
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
645
703
|
|
646
704
|
transform_handlers = ModelTransformerBuilder.build(
|
647
705
|
dataset=dataset,
|
@@ -653,7 +711,7 @@ class PCA(BaseTransformer):
|
|
653
711
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
654
712
|
inference_method=inference_method,
|
655
713
|
input_cols=self.input_cols,
|
656
|
-
expected_output_cols=
|
714
|
+
expected_output_cols=expected_output_cols,
|
657
715
|
**transform_kwargs
|
658
716
|
)
|
659
717
|
return output_df
|
@@ -683,7 +741,8 @@ class PCA(BaseTransformer):
|
|
683
741
|
Output dataset with log probability of the sample for each class in the model.
|
684
742
|
"""
|
685
743
|
super()._check_dataset_type(dataset)
|
686
|
-
inference_method="predict_log_proba"
|
744
|
+
inference_method = "predict_log_proba"
|
745
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
687
746
|
|
688
747
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
689
748
|
# are specific to the type of dataset used.
|
@@ -694,18 +753,20 @@ class PCA(BaseTransformer):
|
|
694
753
|
dataset=dataset,
|
695
754
|
inference_method=inference_method,
|
696
755
|
)
|
697
|
-
assert isinstance(
|
756
|
+
assert isinstance(
|
757
|
+
dataset._session, Session
|
758
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
698
759
|
transform_kwargs = dict(
|
699
760
|
session=dataset._session,
|
700
761
|
dependencies=self._deps,
|
701
|
-
drop_input_cols
|
762
|
+
drop_input_cols=self._drop_input_cols,
|
702
763
|
expected_output_cols_type="float",
|
703
764
|
)
|
765
|
+
expected_output_cols = self._align_expected_output_names(
|
766
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
767
|
+
)
|
704
768
|
elif isinstance(dataset, pd.DataFrame):
|
705
|
-
transform_kwargs = dict(
|
706
|
-
snowpark_input_cols = self._snowpark_cols,
|
707
|
-
drop_input_cols = self._drop_input_cols
|
708
|
-
)
|
769
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
709
770
|
|
710
771
|
transform_handlers = ModelTransformerBuilder.build(
|
711
772
|
dataset=dataset,
|
@@ -718,7 +779,7 @@ class PCA(BaseTransformer):
|
|
718
779
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
719
780
|
inference_method=inference_method,
|
720
781
|
input_cols=self.input_cols,
|
721
|
-
expected_output_cols=
|
782
|
+
expected_output_cols=expected_output_cols,
|
722
783
|
**transform_kwargs
|
723
784
|
)
|
724
785
|
return output_df
|
@@ -744,30 +805,34 @@ class PCA(BaseTransformer):
|
|
744
805
|
Output dataset with results of the decision function for the samples in input dataset.
|
745
806
|
"""
|
746
807
|
super()._check_dataset_type(dataset)
|
747
|
-
inference_method="decision_function"
|
808
|
+
inference_method = "decision_function"
|
748
809
|
|
749
810
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
750
811
|
# are specific to the type of dataset used.
|
751
812
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
752
813
|
|
814
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
815
|
+
|
753
816
|
if isinstance(dataset, DataFrame):
|
754
817
|
self._deps = self._batch_inference_validate_snowpark(
|
755
818
|
dataset=dataset,
|
756
819
|
inference_method=inference_method,
|
757
820
|
)
|
758
|
-
assert isinstance(
|
821
|
+
assert isinstance(
|
822
|
+
dataset._session, Session
|
823
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
759
824
|
transform_kwargs = dict(
|
760
825
|
session=dataset._session,
|
761
826
|
dependencies=self._deps,
|
762
|
-
drop_input_cols
|
827
|
+
drop_input_cols=self._drop_input_cols,
|
763
828
|
expected_output_cols_type="float",
|
764
829
|
)
|
830
|
+
expected_output_cols = self._align_expected_output_names(
|
831
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
832
|
+
)
|
765
833
|
|
766
834
|
elif isinstance(dataset, pd.DataFrame):
|
767
|
-
transform_kwargs = dict(
|
768
|
-
snowpark_input_cols = self._snowpark_cols,
|
769
|
-
drop_input_cols = self._drop_input_cols
|
770
|
-
)
|
835
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
771
836
|
|
772
837
|
transform_handlers = ModelTransformerBuilder.build(
|
773
838
|
dataset=dataset,
|
@@ -780,7 +845,7 @@ class PCA(BaseTransformer):
|
|
780
845
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
781
846
|
inference_method=inference_method,
|
782
847
|
input_cols=self.input_cols,
|
783
|
-
expected_output_cols=
|
848
|
+
expected_output_cols=expected_output_cols,
|
784
849
|
**transform_kwargs
|
785
850
|
)
|
786
851
|
return output_df
|
@@ -811,12 +876,14 @@ class PCA(BaseTransformer):
|
|
811
876
|
Output dataset with probability of the sample for each class in the model.
|
812
877
|
"""
|
813
878
|
super()._check_dataset_type(dataset)
|
814
|
-
inference_method="score_samples"
|
879
|
+
inference_method = "score_samples"
|
815
880
|
|
816
881
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
817
882
|
# are specific to the type of dataset used.
|
818
883
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
819
884
|
|
885
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
886
|
+
|
820
887
|
if isinstance(dataset, DataFrame):
|
821
888
|
self._deps = self._batch_inference_validate_snowpark(
|
822
889
|
dataset=dataset,
|
@@ -829,6 +896,9 @@ class PCA(BaseTransformer):
|
|
829
896
|
drop_input_cols = self._drop_input_cols,
|
830
897
|
expected_output_cols_type="float",
|
831
898
|
)
|
899
|
+
expected_output_cols = self._align_expected_output_names(
|
900
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
901
|
+
)
|
832
902
|
|
833
903
|
elif isinstance(dataset, pd.DataFrame):
|
834
904
|
transform_kwargs = dict(
|
@@ -847,7 +917,7 @@ class PCA(BaseTransformer):
|
|
847
917
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
848
918
|
inference_method=inference_method,
|
849
919
|
input_cols=self.input_cols,
|
850
|
-
expected_output_cols=
|
920
|
+
expected_output_cols=expected_output_cols,
|
851
921
|
**transform_kwargs
|
852
922
|
)
|
853
923
|
return output_df
|
@@ -994,50 +1064,84 @@ class PCA(BaseTransformer):
|
|
994
1064
|
)
|
995
1065
|
return output_df
|
996
1066
|
|
1067
|
+
|
1068
|
+
|
1069
|
+
def to_sklearn(self) -> Any:
|
1070
|
+
"""Get sklearn.decomposition.PCA object.
|
1071
|
+
"""
|
1072
|
+
if self._sklearn_object is None:
|
1073
|
+
self._sklearn_object = self._create_sklearn_object()
|
1074
|
+
return self._sklearn_object
|
1075
|
+
|
1076
|
+
def to_xgboost(self) -> Any:
|
1077
|
+
raise exceptions.SnowflakeMLException(
|
1078
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1079
|
+
original_exception=AttributeError(
|
1080
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1081
|
+
"to_xgboost()",
|
1082
|
+
"to_sklearn()"
|
1083
|
+
)
|
1084
|
+
),
|
1085
|
+
)
|
1086
|
+
|
1087
|
+
def to_lightgbm(self) -> Any:
|
1088
|
+
raise exceptions.SnowflakeMLException(
|
1089
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1090
|
+
original_exception=AttributeError(
|
1091
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1092
|
+
"to_lightgbm()",
|
1093
|
+
"to_sklearn()"
|
1094
|
+
)
|
1095
|
+
),
|
1096
|
+
)
|
997
1097
|
|
998
|
-
def
|
1098
|
+
def _get_dependencies(self) -> List[str]:
|
1099
|
+
return self._deps
|
1100
|
+
|
1101
|
+
|
1102
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
999
1103
|
self._model_signature_dict = dict()
|
1000
1104
|
|
1001
1105
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1002
1106
|
|
1003
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1107
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1004
1108
|
outputs: List[BaseFeatureSpec] = []
|
1005
1109
|
if hasattr(self, "predict"):
|
1006
1110
|
# keep mypy happy
|
1007
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1111
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1008
1112
|
# For classifier, the type of predict is the same as the type of label
|
1009
|
-
if self._sklearn_object._estimator_type ==
|
1010
|
-
|
1113
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1114
|
+
# label columns is the desired type for output
|
1011
1115
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1012
1116
|
# rename the output columns
|
1013
1117
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1014
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1015
|
-
|
1016
|
-
|
1118
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1119
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1120
|
+
)
|
1017
1121
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1018
1122
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1019
|
-
# Clusterer returns int64 cluster labels.
|
1123
|
+
# Clusterer returns int64 cluster labels.
|
1020
1124
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1021
1125
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1022
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1023
|
-
|
1024
|
-
|
1025
|
-
|
1126
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1127
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1128
|
+
)
|
1129
|
+
|
1026
1130
|
# For regressor, the type of predict is float64
|
1027
|
-
elif self._sklearn_object._estimator_type ==
|
1131
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1028
1132
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1029
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1133
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1134
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1135
|
+
)
|
1136
|
+
|
1033
1137
|
for prob_func in PROB_FUNCTIONS:
|
1034
1138
|
if hasattr(self, prob_func):
|
1035
1139
|
output_cols_prefix: str = f"{prob_func}_"
|
1036
1140
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1037
1141
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1038
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1039
|
-
|
1040
|
-
|
1142
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1143
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1144
|
+
)
|
1041
1145
|
|
1042
1146
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1043
1147
|
items = list(self._model_signature_dict.items())
|
@@ -1050,10 +1154,10 @@ class PCA(BaseTransformer):
|
|
1050
1154
|
"""Returns model signature of current class.
|
1051
1155
|
|
1052
1156
|
Raises:
|
1053
|
-
|
1157
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1054
1158
|
|
1055
1159
|
Returns:
|
1056
|
-
Dict
|
1160
|
+
Dict with each method and its input output signature
|
1057
1161
|
"""
|
1058
1162
|
if self._model_signature_dict is None:
|
1059
1163
|
raise exceptions.SnowflakeMLException(
|
@@ -1061,35 +1165,3 @@ class PCA(BaseTransformer):
|
|
1061
1165
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1062
1166
|
)
|
1063
1167
|
return self._model_signature_dict
|
1064
|
-
|
1065
|
-
def to_sklearn(self) -> Any:
|
1066
|
-
"""Get sklearn.decomposition.PCA object.
|
1067
|
-
"""
|
1068
|
-
if self._sklearn_object is None:
|
1069
|
-
self._sklearn_object = self._create_sklearn_object()
|
1070
|
-
return self._sklearn_object
|
1071
|
-
|
1072
|
-
def to_xgboost(self) -> Any:
|
1073
|
-
raise exceptions.SnowflakeMLException(
|
1074
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1075
|
-
original_exception=AttributeError(
|
1076
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1077
|
-
"to_xgboost()",
|
1078
|
-
"to_sklearn()"
|
1079
|
-
)
|
1080
|
-
),
|
1081
|
-
)
|
1082
|
-
|
1083
|
-
def to_lightgbm(self) -> Any:
|
1084
|
-
raise exceptions.SnowflakeMLException(
|
1085
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1086
|
-
original_exception=AttributeError(
|
1087
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1088
|
-
"to_lightgbm()",
|
1089
|
-
"to_sklearn()"
|
1090
|
-
)
|
1091
|
-
),
|
1092
|
-
)
|
1093
|
-
|
1094
|
-
def _get_dependencies(self) -> List[str]:
|
1095
|
-
return self._deps
|