snowflake-ml-python 1.6.1__py3-none-any.whl → 1.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/__init__.py +4 -0
- snowflake/cortex/_classify_text.py +2 -2
- snowflake/cortex/_embed_text_1024.py +37 -0
- snowflake/cortex/_embed_text_768.py +37 -0
- snowflake/cortex/_extract_answer.py +2 -2
- snowflake/cortex/_sentiment.py +2 -2
- snowflake/cortex/_summarize.py +2 -2
- snowflake/cortex/_translate.py +2 -2
- snowflake/cortex/_util.py +4 -4
- snowflake/ml/_internal/env_utils.py +5 -5
- snowflake/ml/_internal/exceptions/error_codes.py +2 -0
- snowflake/ml/_internal/telemetry.py +142 -20
- snowflake/ml/_internal/utils/db_utils.py +50 -0
- snowflake/ml/_internal/utils/identifier.py +48 -11
- snowflake/ml/_internal/utils/service_logger.py +63 -0
- snowflake/ml/_internal/utils/snowflake_env.py +23 -13
- snowflake/ml/_internal/utils/sql_identifier.py +26 -2
- snowflake/ml/_internal/utils/table_manager.py +19 -1
- snowflake/ml/data/_internal/arrow_ingestor.py +1 -11
- snowflake/ml/data/data_connector.py +33 -7
- snowflake/ml/data/ingestor_utils.py +20 -10
- snowflake/ml/data/torch_utils.py +68 -0
- snowflake/ml/dataset/dataset.py +1 -3
- snowflake/ml/feature_store/access_manager.py +3 -3
- snowflake/ml/feature_store/feature_store.py +60 -19
- snowflake/ml/feature_store/feature_view.py +84 -30
- snowflake/ml/fileset/embedded_stage_fs.py +1 -1
- snowflake/ml/fileset/fileset.py +1 -1
- snowflake/ml/fileset/sfcfs.py +9 -3
- snowflake/ml/fileset/stage_fs.py +2 -1
- snowflake/ml/lineage/lineage_node.py +7 -2
- snowflake/ml/model/__init__.py +1 -2
- snowflake/ml/model/_client/model/model_version_impl.py +96 -12
- snowflake/ml/model/_client/ops/model_ops.py +124 -6
- snowflake/ml/model/_client/ops/service_ops.py +309 -9
- snowflake/ml/model/_client/service/model_deployment_spec.py +8 -5
- snowflake/ml/model/_client/service/model_deployment_spec_schema.py +2 -2
- snowflake/ml/model/_client/sql/_base.py +5 -0
- snowflake/ml/model/_client/sql/model.py +1 -0
- snowflake/ml/model/_client/sql/model_version.py +9 -5
- snowflake/ml/model/_client/sql/service.py +121 -20
- snowflake/ml/model/_model_composer/model_composer.py +11 -39
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +31 -11
- snowflake/ml/model/_packager/model_env/model_env.py +4 -38
- snowflake/ml/model/_packager/model_handlers/_utils.py +134 -28
- snowflake/ml/model/_packager/model_handlers/catboost.py +31 -30
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +26 -18
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +31 -58
- snowflake/ml/model/_packager/model_handlers/mlflow.py +3 -5
- snowflake/ml/model/_packager/model_handlers/model_objective_utils.py +169 -0
- snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +15 -8
- snowflake/ml/model/_packager/model_handlers/sklearn.py +56 -60
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +141 -9
- snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
- snowflake/ml/model/_packager/model_handlers/xgboost.py +63 -48
- snowflake/ml/model/_packager/model_meta/model_meta.py +16 -42
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +1 -14
- snowflake/ml/model/_packager/model_packager.py +14 -8
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +11 -0
- snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
- snowflake/ml/model/_signatures/snowpark_handler.py +3 -2
- snowflake/ml/model/_signatures/utils.py +9 -0
- snowflake/ml/model/type_hints.py +12 -145
- snowflake/ml/modeling/_internal/constants.py +1 -0
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +5 -5
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +9 -6
- snowflake/ml/modeling/_internal/model_specifications.py +2 -0
- snowflake/ml/modeling/_internal/model_trainer.py +1 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +2 -4
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +130 -166
- snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +0 -1
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +61 -21
- snowflake/ml/modeling/cluster/affinity_propagation.py +61 -21
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +61 -21
- snowflake/ml/modeling/cluster/birch.py +61 -21
- snowflake/ml/modeling/cluster/bisecting_k_means.py +61 -21
- snowflake/ml/modeling/cluster/dbscan.py +61 -21
- snowflake/ml/modeling/cluster/feature_agglomeration.py +61 -21
- snowflake/ml/modeling/cluster/k_means.py +61 -21
- snowflake/ml/modeling/cluster/mean_shift.py +61 -21
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +61 -21
- snowflake/ml/modeling/cluster/optics.py +61 -21
- snowflake/ml/modeling/cluster/spectral_biclustering.py +61 -21
- snowflake/ml/modeling/cluster/spectral_clustering.py +61 -21
- snowflake/ml/modeling/cluster/spectral_coclustering.py +61 -21
- snowflake/ml/modeling/compose/column_transformer.py +61 -21
- snowflake/ml/modeling/compose/transformed_target_regressor.py +61 -21
- snowflake/ml/modeling/covariance/elliptic_envelope.py +61 -21
- snowflake/ml/modeling/covariance/empirical_covariance.py +61 -21
- snowflake/ml/modeling/covariance/graphical_lasso.py +61 -21
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +61 -21
- snowflake/ml/modeling/covariance/ledoit_wolf.py +61 -21
- snowflake/ml/modeling/covariance/min_cov_det.py +61 -21
- snowflake/ml/modeling/covariance/oas.py +61 -21
- snowflake/ml/modeling/covariance/shrunk_covariance.py +61 -21
- snowflake/ml/modeling/decomposition/dictionary_learning.py +61 -21
- snowflake/ml/modeling/decomposition/factor_analysis.py +61 -21
- snowflake/ml/modeling/decomposition/fast_ica.py +61 -21
- snowflake/ml/modeling/decomposition/incremental_pca.py +61 -21
- snowflake/ml/modeling/decomposition/kernel_pca.py +61 -21
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +61 -21
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +61 -21
- snowflake/ml/modeling/decomposition/pca.py +61 -21
- snowflake/ml/modeling/decomposition/sparse_pca.py +61 -21
- snowflake/ml/modeling/decomposition/truncated_svd.py +61 -21
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +61 -21
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +61 -21
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/bagging_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/bagging_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/isolation_forest.py +61 -21
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/stacking_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/voting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/voting_regressor.py +61 -21
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fdr.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fpr.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fwe.py +61 -21
- snowflake/ml/modeling/feature_selection/select_k_best.py +61 -21
- snowflake/ml/modeling/feature_selection/select_percentile.py +61 -21
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +61 -21
- snowflake/ml/modeling/feature_selection/variance_threshold.py +61 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +61 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +61 -21
- snowflake/ml/modeling/impute/iterative_imputer.py +61 -21
- snowflake/ml/modeling/impute/knn_imputer.py +61 -21
- snowflake/ml/modeling/impute/missing_indicator.py +61 -21
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +61 -21
- snowflake/ml/modeling/kernel_approximation/nystroem.py +61 -21
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +61 -21
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +61 -21
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +61 -21
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +61 -21
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +61 -21
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ard_regression.py +61 -21
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +61 -21
- snowflake/ml/modeling/linear_model/elastic_net.py +61 -21
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +61 -21
- snowflake/ml/modeling/linear_model/gamma_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/huber_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/lars.py +61 -21
- snowflake/ml/modeling/linear_model/lars_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +61 -21
- snowflake/ml/modeling/linear_model/linear_regression.py +61 -21
- snowflake/ml/modeling/linear_model/logistic_regression.py +61 -21
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +61 -21
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +61 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/perceptron.py +61 -21
- snowflake/ml/modeling/linear_model/poisson_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ransac_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ridge.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_cv.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +61 -21
- snowflake/ml/modeling/manifold/isomap.py +61 -21
- snowflake/ml/modeling/manifold/mds.py +61 -21
- snowflake/ml/modeling/manifold/spectral_embedding.py +61 -21
- snowflake/ml/modeling/manifold/tsne.py +61 -21
- snowflake/ml/modeling/metrics/metrics_utils.py +2 -2
- snowflake/ml/modeling/metrics/ranking.py +0 -3
- snowflake/ml/modeling/metrics/regression.py +0 -3
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +61 -21
- snowflake/ml/modeling/mixture/gaussian_mixture.py +61 -21
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +61 -21
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +61 -21
- snowflake/ml/modeling/multiclass/output_code_classifier.py +61 -21
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/complement_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +61 -21
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +61 -21
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +61 -21
- snowflake/ml/modeling/neighbors/kernel_density.py +61 -21
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +61 -21
- snowflake/ml/modeling/neighbors/nearest_centroid.py +61 -21
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +61 -21
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +61 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +61 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +61 -21
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +61 -21
- snowflake/ml/modeling/neural_network/mlp_classifier.py +61 -21
- snowflake/ml/modeling/neural_network/mlp_regressor.py +61 -21
- snowflake/ml/modeling/parameters/disable_model_tracer.py +5 -0
- snowflake/ml/modeling/pipeline/pipeline.py +1 -13
- snowflake/ml/modeling/preprocessing/polynomial_features.py +61 -21
- snowflake/ml/modeling/semi_supervised/label_propagation.py +61 -21
- snowflake/ml/modeling/semi_supervised/label_spreading.py +61 -21
- snowflake/ml/modeling/svm/linear_svc.py +61 -21
- snowflake/ml/modeling/svm/linear_svr.py +61 -21
- snowflake/ml/modeling/svm/nu_svc.py +61 -21
- snowflake/ml/modeling/svm/nu_svr.py +61 -21
- snowflake/ml/modeling/svm/svc.py +61 -21
- snowflake/ml/modeling/svm/svr.py +61 -21
- snowflake/ml/modeling/tree/decision_tree_classifier.py +61 -21
- snowflake/ml/modeling/tree/decision_tree_regressor.py +61 -21
- snowflake/ml/modeling/tree/extra_tree_classifier.py +61 -21
- snowflake/ml/modeling/tree/extra_tree_regressor.py +61 -21
- snowflake/ml/modeling/xgboost/xgb_classifier.py +64 -23
- snowflake/ml/modeling/xgboost/xgb_regressor.py +64 -23
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +64 -23
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +64 -23
- snowflake/ml/monitoring/_client/model_monitor.py +126 -0
- snowflake/ml/monitoring/_client/model_monitor_manager.py +361 -0
- snowflake/ml/monitoring/_client/model_monitor_version.py +1 -0
- snowflake/ml/monitoring/_client/monitor_sql_client.py +1335 -0
- snowflake/ml/monitoring/_client/queries/record_count.ssql +14 -0
- snowflake/ml/monitoring/_client/queries/rmse.ssql +28 -0
- snowflake/ml/monitoring/entities/model_monitor_config.py +28 -0
- snowflake/ml/monitoring/entities/model_monitor_interval.py +46 -0
- snowflake/ml/monitoring/entities/output_score_type.py +90 -0
- snowflake/ml/registry/_manager/model_manager.py +4 -0
- snowflake/ml/registry/registry.py +166 -8
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/METADATA +43 -9
- snowflake_ml_python-1.6.3.dist-info/RECORD +400 -0
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/WHEEL +1 -1
- snowflake/ml/_internal/container_services/image_registry/credential.py +0 -84
- snowflake/ml/_internal/container_services/image_registry/http_client.py +0 -127
- snowflake/ml/_internal/container_services/image_registry/imagelib.py +0 -400
- snowflake/ml/_internal/container_services/image_registry/registry_client.py +0 -212
- snowflake/ml/_internal/utils/log_stream_processor.py +0 -30
- snowflake/ml/_internal/utils/session_token_manager.py +0 -46
- snowflake/ml/_internal/utils/spcs_attribution_utils.py +0 -122
- snowflake/ml/_internal/utils/uri.py +0 -77
- snowflake/ml/data/torch_dataset.py +0 -33
- snowflake/ml/model/_api.py +0 -568
- snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +0 -12
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +0 -249
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +0 -130
- snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +0 -36
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +0 -268
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +0 -215
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +0 -53
- snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template +0 -38
- snowflake/ml/model/_deploy_client/image_builds/templates/kaniko_shell_script_template +0 -105
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +0 -611
- snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +0 -116
- snowflake/ml/model/_deploy_client/snowservice/instance_types.py +0 -10
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +0 -28
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template_with_model +0 -21
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -48
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +0 -280
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +0 -202
- snowflake/ml/model/_deploy_client/warehouse/infer_template.py +0 -99
- snowflake/ml/model/_packager/model_handlers/llm.py +0 -267
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +0 -11
- snowflake/ml/model/deploy_platforms.py +0 -6
- snowflake/ml/model/models/llm.py +0 -104
- snowflake/ml/monitoring/monitor.py +0 -203
- snowflake/ml/registry/_initial_schema.py +0 -142
- snowflake/ml/registry/_schema.py +0 -82
- snowflake/ml/registry/_schema_upgrade_plans.py +0 -116
- snowflake/ml/registry/_schema_version_manager.py +0 -163
- snowflake/ml/registry/model_registry.py +0 -2048
- snowflake_ml_python-1.6.1.dist-info/RECORD +0 -422
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/top_level.txt +0 -0
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -589,12 +586,23 @@ class DictionaryLearning(BaseTransformer):
|
|
589
586
|
autogenerated=self._autogenerated,
|
590
587
|
subproject=_SUBPROJECT,
|
591
588
|
)
|
592
|
-
|
593
|
-
|
594
|
-
expected_output_cols_list=(
|
595
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
596
|
-
),
|
589
|
+
expected_output_cols = (
|
590
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
597
591
|
)
|
592
|
+
if isinstance(dataset, DataFrame):
|
593
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
594
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
595
|
+
)
|
596
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
597
|
+
drop_input_cols=self._drop_input_cols,
|
598
|
+
expected_output_cols_list=expected_output_cols,
|
599
|
+
example_output_pd_df=example_output_pd_df,
|
600
|
+
)
|
601
|
+
else:
|
602
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
603
|
+
drop_input_cols=self._drop_input_cols,
|
604
|
+
expected_output_cols_list=expected_output_cols,
|
605
|
+
)
|
598
606
|
self._sklearn_object = fitted_estimator
|
599
607
|
self._is_fitted = True
|
600
608
|
return output_result
|
@@ -619,6 +627,7 @@ class DictionaryLearning(BaseTransformer):
|
|
619
627
|
"""
|
620
628
|
self._infer_input_output_cols(dataset)
|
621
629
|
super()._check_dataset_type(dataset)
|
630
|
+
|
622
631
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
623
632
|
estimator=self._sklearn_object,
|
624
633
|
dataset=dataset,
|
@@ -675,12 +684,41 @@ class DictionaryLearning(BaseTransformer):
|
|
675
684
|
|
676
685
|
return rv
|
677
686
|
|
678
|
-
def
|
679
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
680
|
-
) -> List[str]:
|
687
|
+
def _align_expected_output(
|
688
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
689
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
690
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
691
|
+
and output dataframe with 1 line.
|
692
|
+
If the method is fit_predict, run 2 lines of data.
|
693
|
+
"""
|
681
694
|
# in case the inferred output column names dimension is different
|
682
695
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
683
|
-
|
696
|
+
|
697
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
698
|
+
# so change the minimum of number of rows to 2
|
699
|
+
num_examples = 2
|
700
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
701
|
+
project=_PROJECT,
|
702
|
+
subproject=_SUBPROJECT,
|
703
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
704
|
+
inspect.currentframe(), DictionaryLearning.__class__.__name__
|
705
|
+
),
|
706
|
+
api_calls=[Session.call],
|
707
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
708
|
+
)
|
709
|
+
if output_cols_prefix == "fit_predict_":
|
710
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
711
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
712
|
+
num_examples = self._sklearn_object.n_clusters
|
713
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
714
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
715
|
+
num_examples = self._sklearn_object.min_samples
|
716
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
717
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
718
|
+
num_examples = self._sklearn_object.n_neighbors
|
719
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
720
|
+
else:
|
721
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
684
722
|
|
685
723
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
686
724
|
# seen during the fit.
|
@@ -692,12 +730,14 @@ class DictionaryLearning(BaseTransformer):
|
|
692
730
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
693
731
|
if self.sample_weight_col:
|
694
732
|
output_df_columns_set -= set(self.sample_weight_col)
|
733
|
+
|
695
734
|
# if the dimension of inferred output column names is correct; use it
|
696
735
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
697
|
-
return expected_output_cols_list
|
736
|
+
return expected_output_cols_list, output_df_pd
|
698
737
|
# otherwise, use the sklearn estimator's output
|
699
738
|
else:
|
700
|
-
|
739
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
740
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
701
741
|
|
702
742
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
703
743
|
@telemetry.send_api_usage_telemetry(
|
@@ -743,7 +783,7 @@ class DictionaryLearning(BaseTransformer):
|
|
743
783
|
drop_input_cols=self._drop_input_cols,
|
744
784
|
expected_output_cols_type="float",
|
745
785
|
)
|
746
|
-
expected_output_cols = self.
|
786
|
+
expected_output_cols, _ = self._align_expected_output(
|
747
787
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
748
788
|
)
|
749
789
|
|
@@ -809,7 +849,7 @@ class DictionaryLearning(BaseTransformer):
|
|
809
849
|
drop_input_cols=self._drop_input_cols,
|
810
850
|
expected_output_cols_type="float",
|
811
851
|
)
|
812
|
-
expected_output_cols = self.
|
852
|
+
expected_output_cols, _ = self._align_expected_output(
|
813
853
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
814
854
|
)
|
815
855
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -872,7 +912,7 @@ class DictionaryLearning(BaseTransformer):
|
|
872
912
|
drop_input_cols=self._drop_input_cols,
|
873
913
|
expected_output_cols_type="float",
|
874
914
|
)
|
875
|
-
expected_output_cols = self.
|
915
|
+
expected_output_cols, _ = self._align_expected_output(
|
876
916
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
877
917
|
)
|
878
918
|
|
@@ -937,7 +977,7 @@ class DictionaryLearning(BaseTransformer):
|
|
937
977
|
drop_input_cols = self._drop_input_cols,
|
938
978
|
expected_output_cols_type="float",
|
939
979
|
)
|
940
|
-
expected_output_cols = self.
|
980
|
+
expected_output_cols, _ = self._align_expected_output(
|
941
981
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
942
982
|
)
|
943
983
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -526,12 +523,23 @@ class FactorAnalysis(BaseTransformer):
|
|
526
523
|
autogenerated=self._autogenerated,
|
527
524
|
subproject=_SUBPROJECT,
|
528
525
|
)
|
529
|
-
|
530
|
-
|
531
|
-
expected_output_cols_list=(
|
532
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
533
|
-
),
|
526
|
+
expected_output_cols = (
|
527
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
534
528
|
)
|
529
|
+
if isinstance(dataset, DataFrame):
|
530
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
531
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
532
|
+
)
|
533
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
534
|
+
drop_input_cols=self._drop_input_cols,
|
535
|
+
expected_output_cols_list=expected_output_cols,
|
536
|
+
example_output_pd_df=example_output_pd_df,
|
537
|
+
)
|
538
|
+
else:
|
539
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
540
|
+
drop_input_cols=self._drop_input_cols,
|
541
|
+
expected_output_cols_list=expected_output_cols,
|
542
|
+
)
|
535
543
|
self._sklearn_object = fitted_estimator
|
536
544
|
self._is_fitted = True
|
537
545
|
return output_result
|
@@ -556,6 +564,7 @@ class FactorAnalysis(BaseTransformer):
|
|
556
564
|
"""
|
557
565
|
self._infer_input_output_cols(dataset)
|
558
566
|
super()._check_dataset_type(dataset)
|
567
|
+
|
559
568
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
560
569
|
estimator=self._sklearn_object,
|
561
570
|
dataset=dataset,
|
@@ -612,12 +621,41 @@ class FactorAnalysis(BaseTransformer):
|
|
612
621
|
|
613
622
|
return rv
|
614
623
|
|
615
|
-
def
|
616
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
617
|
-
) -> List[str]:
|
624
|
+
def _align_expected_output(
|
625
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
626
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
627
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
628
|
+
and output dataframe with 1 line.
|
629
|
+
If the method is fit_predict, run 2 lines of data.
|
630
|
+
"""
|
618
631
|
# in case the inferred output column names dimension is different
|
619
632
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
620
|
-
|
633
|
+
|
634
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
635
|
+
# so change the minimum of number of rows to 2
|
636
|
+
num_examples = 2
|
637
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
638
|
+
project=_PROJECT,
|
639
|
+
subproject=_SUBPROJECT,
|
640
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
641
|
+
inspect.currentframe(), FactorAnalysis.__class__.__name__
|
642
|
+
),
|
643
|
+
api_calls=[Session.call],
|
644
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
645
|
+
)
|
646
|
+
if output_cols_prefix == "fit_predict_":
|
647
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
648
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
649
|
+
num_examples = self._sklearn_object.n_clusters
|
650
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
651
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
652
|
+
num_examples = self._sklearn_object.min_samples
|
653
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
654
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
655
|
+
num_examples = self._sklearn_object.n_neighbors
|
656
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
657
|
+
else:
|
658
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
621
659
|
|
622
660
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
623
661
|
# seen during the fit.
|
@@ -629,12 +667,14 @@ class FactorAnalysis(BaseTransformer):
|
|
629
667
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
630
668
|
if self.sample_weight_col:
|
631
669
|
output_df_columns_set -= set(self.sample_weight_col)
|
670
|
+
|
632
671
|
# if the dimension of inferred output column names is correct; use it
|
633
672
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
634
|
-
return expected_output_cols_list
|
673
|
+
return expected_output_cols_list, output_df_pd
|
635
674
|
# otherwise, use the sklearn estimator's output
|
636
675
|
else:
|
637
|
-
|
676
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
677
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
638
678
|
|
639
679
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
640
680
|
@telemetry.send_api_usage_telemetry(
|
@@ -680,7 +720,7 @@ class FactorAnalysis(BaseTransformer):
|
|
680
720
|
drop_input_cols=self._drop_input_cols,
|
681
721
|
expected_output_cols_type="float",
|
682
722
|
)
|
683
|
-
expected_output_cols = self.
|
723
|
+
expected_output_cols, _ = self._align_expected_output(
|
684
724
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
685
725
|
)
|
686
726
|
|
@@ -746,7 +786,7 @@ class FactorAnalysis(BaseTransformer):
|
|
746
786
|
drop_input_cols=self._drop_input_cols,
|
747
787
|
expected_output_cols_type="float",
|
748
788
|
)
|
749
|
-
expected_output_cols = self.
|
789
|
+
expected_output_cols, _ = self._align_expected_output(
|
750
790
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
751
791
|
)
|
752
792
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -809,7 +849,7 @@ class FactorAnalysis(BaseTransformer):
|
|
809
849
|
drop_input_cols=self._drop_input_cols,
|
810
850
|
expected_output_cols_type="float",
|
811
851
|
)
|
812
|
-
expected_output_cols = self.
|
852
|
+
expected_output_cols, _ = self._align_expected_output(
|
813
853
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
814
854
|
)
|
815
855
|
|
@@ -876,7 +916,7 @@ class FactorAnalysis(BaseTransformer):
|
|
876
916
|
drop_input_cols = self._drop_input_cols,
|
877
917
|
expected_output_cols_type="float",
|
878
918
|
)
|
879
|
-
expected_output_cols = self.
|
919
|
+
expected_output_cols, _ = self._align_expected_output(
|
880
920
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
881
921
|
)
|
882
922
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -544,12 +541,23 @@ class FastICA(BaseTransformer):
|
|
544
541
|
autogenerated=self._autogenerated,
|
545
542
|
subproject=_SUBPROJECT,
|
546
543
|
)
|
547
|
-
|
548
|
-
|
549
|
-
expected_output_cols_list=(
|
550
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
551
|
-
),
|
544
|
+
expected_output_cols = (
|
545
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
552
546
|
)
|
547
|
+
if isinstance(dataset, DataFrame):
|
548
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
549
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
550
|
+
)
|
551
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
552
|
+
drop_input_cols=self._drop_input_cols,
|
553
|
+
expected_output_cols_list=expected_output_cols,
|
554
|
+
example_output_pd_df=example_output_pd_df,
|
555
|
+
)
|
556
|
+
else:
|
557
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
558
|
+
drop_input_cols=self._drop_input_cols,
|
559
|
+
expected_output_cols_list=expected_output_cols,
|
560
|
+
)
|
553
561
|
self._sklearn_object = fitted_estimator
|
554
562
|
self._is_fitted = True
|
555
563
|
return output_result
|
@@ -574,6 +582,7 @@ class FastICA(BaseTransformer):
|
|
574
582
|
"""
|
575
583
|
self._infer_input_output_cols(dataset)
|
576
584
|
super()._check_dataset_type(dataset)
|
585
|
+
|
577
586
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
578
587
|
estimator=self._sklearn_object,
|
579
588
|
dataset=dataset,
|
@@ -630,12 +639,41 @@ class FastICA(BaseTransformer):
|
|
630
639
|
|
631
640
|
return rv
|
632
641
|
|
633
|
-
def
|
634
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
635
|
-
) -> List[str]:
|
642
|
+
def _align_expected_output(
|
643
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
644
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
645
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
646
|
+
and output dataframe with 1 line.
|
647
|
+
If the method is fit_predict, run 2 lines of data.
|
648
|
+
"""
|
636
649
|
# in case the inferred output column names dimension is different
|
637
650
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
638
|
-
|
651
|
+
|
652
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
653
|
+
# so change the minimum of number of rows to 2
|
654
|
+
num_examples = 2
|
655
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
656
|
+
project=_PROJECT,
|
657
|
+
subproject=_SUBPROJECT,
|
658
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
659
|
+
inspect.currentframe(), FastICA.__class__.__name__
|
660
|
+
),
|
661
|
+
api_calls=[Session.call],
|
662
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
663
|
+
)
|
664
|
+
if output_cols_prefix == "fit_predict_":
|
665
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
666
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
667
|
+
num_examples = self._sklearn_object.n_clusters
|
668
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
669
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
670
|
+
num_examples = self._sklearn_object.min_samples
|
671
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
672
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
673
|
+
num_examples = self._sklearn_object.n_neighbors
|
674
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
675
|
+
else:
|
676
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
639
677
|
|
640
678
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
641
679
|
# seen during the fit.
|
@@ -647,12 +685,14 @@ class FastICA(BaseTransformer):
|
|
647
685
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
648
686
|
if self.sample_weight_col:
|
649
687
|
output_df_columns_set -= set(self.sample_weight_col)
|
688
|
+
|
650
689
|
# if the dimension of inferred output column names is correct; use it
|
651
690
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
652
|
-
return expected_output_cols_list
|
691
|
+
return expected_output_cols_list, output_df_pd
|
653
692
|
# otherwise, use the sklearn estimator's output
|
654
693
|
else:
|
655
|
-
|
694
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
695
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
656
696
|
|
657
697
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
658
698
|
@telemetry.send_api_usage_telemetry(
|
@@ -698,7 +738,7 @@ class FastICA(BaseTransformer):
|
|
698
738
|
drop_input_cols=self._drop_input_cols,
|
699
739
|
expected_output_cols_type="float",
|
700
740
|
)
|
701
|
-
expected_output_cols = self.
|
741
|
+
expected_output_cols, _ = self._align_expected_output(
|
702
742
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
703
743
|
)
|
704
744
|
|
@@ -764,7 +804,7 @@ class FastICA(BaseTransformer):
|
|
764
804
|
drop_input_cols=self._drop_input_cols,
|
765
805
|
expected_output_cols_type="float",
|
766
806
|
)
|
767
|
-
expected_output_cols = self.
|
807
|
+
expected_output_cols, _ = self._align_expected_output(
|
768
808
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
769
809
|
)
|
770
810
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -827,7 +867,7 @@ class FastICA(BaseTransformer):
|
|
827
867
|
drop_input_cols=self._drop_input_cols,
|
828
868
|
expected_output_cols_type="float",
|
829
869
|
)
|
830
|
-
expected_output_cols = self.
|
870
|
+
expected_output_cols, _ = self._align_expected_output(
|
831
871
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
832
872
|
)
|
833
873
|
|
@@ -892,7 +932,7 @@ class FastICA(BaseTransformer):
|
|
892
932
|
drop_input_cols = self._drop_input_cols,
|
893
933
|
expected_output_cols_type="float",
|
894
934
|
)
|
895
|
-
expected_output_cols = self.
|
935
|
+
expected_output_cols, _ = self._align_expected_output(
|
896
936
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
897
937
|
)
|
898
938
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -496,12 +493,23 @@ class IncrementalPCA(BaseTransformer):
|
|
496
493
|
autogenerated=self._autogenerated,
|
497
494
|
subproject=_SUBPROJECT,
|
498
495
|
)
|
499
|
-
|
500
|
-
|
501
|
-
expected_output_cols_list=(
|
502
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
503
|
-
),
|
496
|
+
expected_output_cols = (
|
497
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
504
498
|
)
|
499
|
+
if isinstance(dataset, DataFrame):
|
500
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
501
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
502
|
+
)
|
503
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
504
|
+
drop_input_cols=self._drop_input_cols,
|
505
|
+
expected_output_cols_list=expected_output_cols,
|
506
|
+
example_output_pd_df=example_output_pd_df,
|
507
|
+
)
|
508
|
+
else:
|
509
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
510
|
+
drop_input_cols=self._drop_input_cols,
|
511
|
+
expected_output_cols_list=expected_output_cols,
|
512
|
+
)
|
505
513
|
self._sklearn_object = fitted_estimator
|
506
514
|
self._is_fitted = True
|
507
515
|
return output_result
|
@@ -526,6 +534,7 @@ class IncrementalPCA(BaseTransformer):
|
|
526
534
|
"""
|
527
535
|
self._infer_input_output_cols(dataset)
|
528
536
|
super()._check_dataset_type(dataset)
|
537
|
+
|
529
538
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
530
539
|
estimator=self._sklearn_object,
|
531
540
|
dataset=dataset,
|
@@ -582,12 +591,41 @@ class IncrementalPCA(BaseTransformer):
|
|
582
591
|
|
583
592
|
return rv
|
584
593
|
|
585
|
-
def
|
586
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
587
|
-
) -> List[str]:
|
594
|
+
def _align_expected_output(
|
595
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
596
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
597
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
598
|
+
and output dataframe with 1 line.
|
599
|
+
If the method is fit_predict, run 2 lines of data.
|
600
|
+
"""
|
588
601
|
# in case the inferred output column names dimension is different
|
589
602
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
590
|
-
|
603
|
+
|
604
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
605
|
+
# so change the minimum of number of rows to 2
|
606
|
+
num_examples = 2
|
607
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
608
|
+
project=_PROJECT,
|
609
|
+
subproject=_SUBPROJECT,
|
610
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
611
|
+
inspect.currentframe(), IncrementalPCA.__class__.__name__
|
612
|
+
),
|
613
|
+
api_calls=[Session.call],
|
614
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
615
|
+
)
|
616
|
+
if output_cols_prefix == "fit_predict_":
|
617
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
618
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
619
|
+
num_examples = self._sklearn_object.n_clusters
|
620
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
621
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
622
|
+
num_examples = self._sklearn_object.min_samples
|
623
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
624
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
625
|
+
num_examples = self._sklearn_object.n_neighbors
|
626
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
627
|
+
else:
|
628
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
591
629
|
|
592
630
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
593
631
|
# seen during the fit.
|
@@ -599,12 +637,14 @@ class IncrementalPCA(BaseTransformer):
|
|
599
637
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
600
638
|
if self.sample_weight_col:
|
601
639
|
output_df_columns_set -= set(self.sample_weight_col)
|
640
|
+
|
602
641
|
# if the dimension of inferred output column names is correct; use it
|
603
642
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
604
|
-
return expected_output_cols_list
|
643
|
+
return expected_output_cols_list, output_df_pd
|
605
644
|
# otherwise, use the sklearn estimator's output
|
606
645
|
else:
|
607
|
-
|
646
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
647
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
608
648
|
|
609
649
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
610
650
|
@telemetry.send_api_usage_telemetry(
|
@@ -650,7 +690,7 @@ class IncrementalPCA(BaseTransformer):
|
|
650
690
|
drop_input_cols=self._drop_input_cols,
|
651
691
|
expected_output_cols_type="float",
|
652
692
|
)
|
653
|
-
expected_output_cols = self.
|
693
|
+
expected_output_cols, _ = self._align_expected_output(
|
654
694
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
655
695
|
)
|
656
696
|
|
@@ -716,7 +756,7 @@ class IncrementalPCA(BaseTransformer):
|
|
716
756
|
drop_input_cols=self._drop_input_cols,
|
717
757
|
expected_output_cols_type="float",
|
718
758
|
)
|
719
|
-
expected_output_cols = self.
|
759
|
+
expected_output_cols, _ = self._align_expected_output(
|
720
760
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
721
761
|
)
|
722
762
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -779,7 +819,7 @@ class IncrementalPCA(BaseTransformer):
|
|
779
819
|
drop_input_cols=self._drop_input_cols,
|
780
820
|
expected_output_cols_type="float",
|
781
821
|
)
|
782
|
-
expected_output_cols = self.
|
822
|
+
expected_output_cols, _ = self._align_expected_output(
|
783
823
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
784
824
|
)
|
785
825
|
|
@@ -844,7 +884,7 @@ class IncrementalPCA(BaseTransformer):
|
|
844
884
|
drop_input_cols = self._drop_input_cols,
|
845
885
|
expected_output_cols_type="float",
|
846
886
|
)
|
847
|
-
expected_output_cols = self.
|
887
|
+
expected_output_cols, _ = self._align_expected_output(
|
848
888
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
849
889
|
)
|
850
890
|
|