snowflake-ml-python 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- snowflake/ml/_internal/file_utils.py +3 -3
- snowflake/ml/_internal/human_readable_id/adjectives.txt +128 -0
- snowflake/ml/_internal/human_readable_id/animals.txt +128 -0
- snowflake/ml/_internal/human_readable_id/hrid_generator.py +40 -0
- snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +135 -0
- snowflake/ml/_internal/telemetry.py +11 -2
- snowflake/ml/_internal/utils/formatting.py +1 -1
- snowflake/ml/feature_store/feature_store.py +15 -106
- snowflake/ml/fileset/sfcfs.py +4 -3
- snowflake/ml/fileset/stage_fs.py +18 -0
- snowflake/ml/model/_api.py +9 -9
- snowflake/ml/model/_client/model/model_version_impl.py +20 -15
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +3 -9
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -5
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +7 -6
- snowflake/ml/model/_model_composer/model_composer.py +10 -8
- snowflake/ml/model/_model_composer/model_method/function_generator.py +1 -1
- snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +2 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +2 -2
- snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +1 -1
- snowflake/ml/model/_packager/model_handlers/_base.py +2 -2
- snowflake/ml/model/_packager/model_handlers/_utils.py +5 -5
- snowflake/ml/model/_packager/model_handlers/custom.py +7 -7
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +2 -2
- snowflake/ml/model/_packager/model_handlers/llm.py +1 -1
- snowflake/ml/model/_packager/model_handlers/mlflow.py +1 -1
- snowflake/ml/model/_packager/model_handlers/pytorch.py +13 -10
- snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +214 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +6 -6
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +15 -3
- snowflake/ml/model/_packager/model_handlers/tensorflow.py +8 -8
- snowflake/ml/model/_packager/model_handlers/torchscript.py +7 -7
- snowflake/ml/model/_packager/model_handlers/xgboost.py +8 -8
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_packager.py +8 -6
- snowflake/ml/model/custom_model.py +3 -1
- snowflake/ml/model/type_hints.py +13 -0
- snowflake/ml/modeling/_internal/estimator_utils.py +61 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -43
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +4 -4
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +21 -17
- snowflake/ml/modeling/_internal/model_specifications.py +3 -1
- snowflake/ml/modeling/_internal/model_trainer.py +2 -2
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +547 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +67 -114
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -9
- snowflake/ml/modeling/_internal/transformer_protocols.py +2 -3
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +33 -61
- snowflake/ml/modeling/cluster/affinity_propagation.py +33 -61
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +33 -61
- snowflake/ml/modeling/cluster/birch.py +33 -61
- snowflake/ml/modeling/cluster/bisecting_k_means.py +33 -61
- snowflake/ml/modeling/cluster/dbscan.py +33 -61
- snowflake/ml/modeling/cluster/feature_agglomeration.py +33 -61
- snowflake/ml/modeling/cluster/k_means.py +33 -61
- snowflake/ml/modeling/cluster/mean_shift.py +33 -61
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +33 -61
- snowflake/ml/modeling/cluster/optics.py +33 -61
- snowflake/ml/modeling/cluster/spectral_biclustering.py +33 -61
- snowflake/ml/modeling/cluster/spectral_clustering.py +33 -61
- snowflake/ml/modeling/cluster/spectral_coclustering.py +33 -61
- snowflake/ml/modeling/compose/column_transformer.py +33 -61
- snowflake/ml/modeling/compose/transformed_target_regressor.py +33 -61
- snowflake/ml/modeling/covariance/elliptic_envelope.py +33 -61
- snowflake/ml/modeling/covariance/empirical_covariance.py +33 -61
- snowflake/ml/modeling/covariance/graphical_lasso.py +33 -61
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +33 -61
- snowflake/ml/modeling/covariance/ledoit_wolf.py +33 -61
- snowflake/ml/modeling/covariance/min_cov_det.py +33 -61
- snowflake/ml/modeling/covariance/oas.py +33 -61
- snowflake/ml/modeling/covariance/shrunk_covariance.py +33 -61
- snowflake/ml/modeling/decomposition/dictionary_learning.py +33 -61
- snowflake/ml/modeling/decomposition/factor_analysis.py +33 -61
- snowflake/ml/modeling/decomposition/fast_ica.py +33 -61
- snowflake/ml/modeling/decomposition/incremental_pca.py +33 -61
- snowflake/ml/modeling/decomposition/kernel_pca.py +33 -61
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +33 -61
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +33 -61
- snowflake/ml/modeling/decomposition/pca.py +33 -61
- snowflake/ml/modeling/decomposition/sparse_pca.py +33 -61
- snowflake/ml/modeling/decomposition/truncated_svd.py +33 -61
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +33 -61
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +33 -61
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +33 -61
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +33 -61
- snowflake/ml/modeling/ensemble/bagging_classifier.py +33 -61
- snowflake/ml/modeling/ensemble/bagging_regressor.py +33 -61
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +33 -61
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +33 -61
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +33 -61
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +33 -61
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +33 -61
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +33 -61
- snowflake/ml/modeling/ensemble/isolation_forest.py +33 -61
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +33 -61
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +33 -61
- snowflake/ml/modeling/ensemble/stacking_regressor.py +33 -61
- snowflake/ml/modeling/ensemble/voting_classifier.py +33 -61
- snowflake/ml/modeling/ensemble/voting_regressor.py +33 -61
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +33 -61
- snowflake/ml/modeling/feature_selection/select_fdr.py +33 -61
- snowflake/ml/modeling/feature_selection/select_fpr.py +33 -61
- snowflake/ml/modeling/feature_selection/select_fwe.py +33 -61
- snowflake/ml/modeling/feature_selection/select_k_best.py +33 -61
- snowflake/ml/modeling/feature_selection/select_percentile.py +33 -61
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +33 -61
- snowflake/ml/modeling/feature_selection/variance_threshold.py +33 -61
- snowflake/ml/modeling/framework/base.py +55 -5
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +33 -61
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +33 -61
- snowflake/ml/modeling/impute/iterative_imputer.py +33 -61
- snowflake/ml/modeling/impute/knn_imputer.py +33 -61
- snowflake/ml/modeling/impute/missing_indicator.py +33 -61
- snowflake/ml/modeling/impute/simple_imputer.py +4 -15
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +33 -61
- snowflake/ml/modeling/kernel_approximation/nystroem.py +33 -61
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +33 -61
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +33 -61
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +33 -61
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +33 -61
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +36 -63
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +36 -63
- snowflake/ml/modeling/linear_model/ard_regression.py +33 -61
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +33 -61
- snowflake/ml/modeling/linear_model/elastic_net.py +33 -61
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +33 -61
- snowflake/ml/modeling/linear_model/gamma_regressor.py +33 -61
- snowflake/ml/modeling/linear_model/huber_regressor.py +33 -61
- snowflake/ml/modeling/linear_model/lars.py +33 -61
- snowflake/ml/modeling/linear_model/lars_cv.py +33 -61
- snowflake/ml/modeling/linear_model/lasso.py +33 -61
- snowflake/ml/modeling/linear_model/lasso_cv.py +33 -61
- snowflake/ml/modeling/linear_model/lasso_lars.py +33 -61
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +33 -61
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +33 -61
- snowflake/ml/modeling/linear_model/linear_regression.py +33 -61
- snowflake/ml/modeling/linear_model/logistic_regression.py +33 -61
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +33 -61
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +33 -61
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +33 -61
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +33 -61
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +33 -61
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +33 -61
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +33 -61
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +33 -61
- snowflake/ml/modeling/linear_model/perceptron.py +33 -61
- snowflake/ml/modeling/linear_model/poisson_regressor.py +33 -61
- snowflake/ml/modeling/linear_model/ransac_regressor.py +33 -61
- snowflake/ml/modeling/linear_model/ridge.py +33 -61
- snowflake/ml/modeling/linear_model/ridge_classifier.py +33 -61
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +33 -61
- snowflake/ml/modeling/linear_model/ridge_cv.py +33 -61
- snowflake/ml/modeling/linear_model/sgd_classifier.py +33 -61
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +33 -61
- snowflake/ml/modeling/linear_model/sgd_regressor.py +33 -61
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +33 -61
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +33 -61
- snowflake/ml/modeling/manifold/isomap.py +33 -61
- snowflake/ml/modeling/manifold/mds.py +33 -61
- snowflake/ml/modeling/manifold/spectral_embedding.py +33 -61
- snowflake/ml/modeling/manifold/tsne.py +33 -61
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +33 -61
- snowflake/ml/modeling/mixture/gaussian_mixture.py +33 -61
- snowflake/ml/modeling/model_selection/grid_search_cv.py +39 -57
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +26 -57
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +33 -61
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +33 -61
- snowflake/ml/modeling/multiclass/output_code_classifier.py +33 -61
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +33 -61
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +33 -61
- snowflake/ml/modeling/naive_bayes/complement_nb.py +33 -61
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +33 -61
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +33 -61
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +33 -61
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +33 -61
- snowflake/ml/modeling/neighbors/kernel_density.py +33 -61
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +33 -61
- snowflake/ml/modeling/neighbors/nearest_centroid.py +33 -61
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +33 -61
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +33 -61
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +33 -61
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +33 -61
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +33 -61
- snowflake/ml/modeling/neural_network/mlp_classifier.py +33 -61
- snowflake/ml/modeling/neural_network/mlp_regressor.py +33 -61
- snowflake/ml/modeling/preprocessing/polynomial_features.py +33 -61
- snowflake/ml/modeling/semi_supervised/label_propagation.py +33 -61
- snowflake/ml/modeling/semi_supervised/label_spreading.py +33 -61
- snowflake/ml/modeling/svm/linear_svc.py +33 -61
- snowflake/ml/modeling/svm/linear_svr.py +33 -61
- snowflake/ml/modeling/svm/nu_svc.py +33 -61
- snowflake/ml/modeling/svm/nu_svr.py +33 -61
- snowflake/ml/modeling/svm/svc.py +33 -61
- snowflake/ml/modeling/svm/svr.py +33 -61
- snowflake/ml/modeling/tree/decision_tree_classifier.py +33 -61
- snowflake/ml/modeling/tree/decision_tree_regressor.py +33 -61
- snowflake/ml/modeling/tree/extra_tree_classifier.py +33 -61
- snowflake/ml/modeling/tree/extra_tree_regressor.py +33 -61
- snowflake/ml/modeling/xgboost/xgb_classifier.py +33 -61
- snowflake/ml/modeling/xgboost/xgb_regressor.py +33 -61
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +33 -61
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +33 -61
- snowflake/ml/registry/_manager/model_manager.py +6 -2
- snowflake/ml/registry/model_registry.py +100 -27
- snowflake/ml/registry/registry.py +6 -2
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/METADATA +43 -7
- {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/RECORD +211 -206
- {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py

@@ -15,16 +15,16 @@ from snowflake.ml._internal.utils.temp_file_utils import (
     cleanup_temp_files,
     get_temp_file_path,
 )
-from snowflake.
+from snowflake.ml.modeling._internal.estimator_utils import handle_inference_result
+from snowflake.snowpark import DataFrame, Session, functions as F, types as T
 from snowflake.snowpark._internal.utils import (
     TempObjectType,
     random_name_for_temp_object,
 )
-from snowflake.snowpark.functions import pandas_udf, sproc
-from snowflake.snowpark.types import PandasSeries

 cp.register_pickle_by_value(inspect.getmodule(get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
+cp.register_pickle_by_value(inspect.getmodule(handle_inference_result))

 _PROJECT = "ModelDevelopment"

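Why the new registration matters: cloudpickle's register_pickle_by_value makes the module that defines handle_inference_result serialize by value, so the helper's code travels inside the pickled payload and is usable in the UDF/sproc sandbox without being installed there. A minimal, self-contained sketch of the mechanism (the stub below is illustrative, not the library's helper):

import inspect

import cloudpickle as cp

def handle_inference_result_stub(res):
    # Stand-in for snowflake.ml.modeling._internal.estimator_utils.handle_inference_result.
    return res

# Ship the defining module's code inside the pickle rather than by reference.
cp.register_pickle_by_value(inspect.getmodule(handle_inference_result_stub))
restored = cp.loads(cp.dumps(handle_inference_result_stub))
assert restored([1, 2]) == [1, 2]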
@@ -67,9 +67,9 @@ class SnowparkTransformHandlers:
         inference_method: str,
         input_cols: List[str],
         expected_output_cols: List[str],
-        pass_through_cols: List[str],
         session: Session,
         dependencies: List[str],
+        drop_input_cols: Optional[bool] = False,
         expected_output_cols_type: Optional[str] = "",
         *args: Any,
         **kwargs: Any,
@@ -81,8 +81,8 @@ class SnowparkTransformHandlers:
             dependencies: List of dependencies for the transformer.
             inference_method: the name of the method used by `estimator` to run inference.
             input_cols: List of feature columns for inference.
-            pass_through_cols: columns in the dataset not used in inference.
             expected_output_cols: column names (in order) of the output dataset.
+            drop_input_cols: Boolean to determine whether to drop the input columns from the output dataset.
             expected_output_cols_type: Expected type of the output columns.
             args: additional positional arguments.
             kwargs: additional keyword args.
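Taken together, the two hunks above swap the caller-computed pass_through_cols list for a single drop_input_cols flag. A hypothetical call site under the new signature (the method name batch_inference and the handler object are assumptions inferred from BatchInferenceKwargsTypedDict below, not confirmed by this diff):

from typing import Any

from snowflake.snowpark import Session

def run_batch_inference(handlers: Any, session: Session) -> Any:
    # Callers no longer enumerate the columns to carry through; they only say
    # whether the feature columns should be dropped from the result.
    return handlers.batch_inference(
        inference_method="predict",
        input_cols=["SEPAL_LENGTH", "SEPAL_WIDTH"],
        expected_output_cols=["OUTPUT_PREDICTION"],
        session=session,
        dependencies=["scikit-learn", "pandas"],
        drop_input_cols=False,  # replaces pass_through_cols=[...]
        expected_output_cols_type="float",
    )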
@@ -95,141 +95,94 @@ class SnowparkTransformHandlers:
         estimator = self.estimator
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = random_name_for_temp_object(TempObjectType.FUNCTION)
-
+
         dataset = snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
+        # Align the input_cols with snowpark dataframe's column name
+        # This step also makes sure that the every col in input_cols exists in the current dataset
+        snowpark_cols = dataset.select(input_cols).columns
+
+        # Infer the datatype from input dataset's schema for batch inference
+        # This is required before registering the UDTF
+        fields = dataset.select(input_cols).schema.fields
+        input_datatypes = []
+        for field in fields:
+            input_datatypes.append(field.datatype)

         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=self._subproject,
             function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), self._class_name),
-            api_calls=[pandas_udf],
+            api_calls=[F.pandas_udf],
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )

-        @pandas_udf(  # type: ignore[arg-type, misc]
+        @F.pandas_udf(  # type: ignore[arg-type, misc]
             is_permanent=False,
             name=batch_inference_udf_name,
             packages=dependencies,  # type: ignore[arg-type]
             replace=True,
             session=session,
             statement_params=statement_params,
+            input_types=[T.PandasDataFrameType(input_datatypes)],
         )
-        def vec_batch_infer(
-            import
-
-            import numpy as np
+        def vec_batch_infer(input_df: pd.DataFrame) -> T.PandasSeries[dict]:  # type: ignore[type-arg]
+            import numpy as np  # noqa: F401
             import pandas as pd

-            input_df =
-
-
-
-
-            # Model expects exact same columns names in the input df for predict call.
-
-            input_df = input_df[input_cols]  # Select input columns with quoted column names.
-            if hasattr(estimator, "feature_names_in_"):
-                missing_features = []
-                for i, f in enumerate(getattr(estimator, "feature_names_in_", {})):
-                    if i >= len(input_cols) or (input_cols[i] != f and snowpark_cols[i] != f):
-                        missing_features.append(f)
-
-                if len(missing_features) > 0:
-                    raise ValueError(
-                        "The feature names should match with those that were passed during fit.\n"
-                        f"Features seen during fit call but not present in the input: {missing_features}\n"
-                        f"Features in the input dataframe : {input_cols}\n"
-                    )
-                input_df.columns = getattr(estimator, "feature_names_in_", {})
-            else:
-                # Just rename the column names to unquoted identifiers.
-                input_df.columns = snowpark_cols  # Replace the quoted columns identifier with unquoted column ids.
+            input_df.columns = snowpark_cols
+
+            if hasattr(estimator, "n_jobs"):
+                # Vectorized UDF cannot handle joblib multiprocessing right now, deactivate the n_jobs
+                estimator.n_jobs = 1
             inference_res = getattr(estimator, inference_method)(input_df, *args, **kwargs)
-            if isinstance(inference_res, list) and len(inference_res) > 0 and isinstance(inference_res[0], np.ndarray):
-                # In case of multioutput estimators, predict_proba, decision_function etc., functions return a list of
-                # ndarrays. We need to concatenate them.
-                transformed_numpy_array = np.concatenate(inference_res, axis=1)
-            elif (
-                isinstance(inference_res, tuple) and len(inference_res) > 0 and isinstance(inference_res[0], np.ndarray)
-            ):
-                # In case of kneighbors, functions return a tuple of ndarrays.
-                transformed_numpy_array = np.stack(inference_res, axis=1)
-            elif isinstance(inference_res, numbers.Number):
-                # In case of BernoulliRBM, functions return a float
-                transformed_numpy_array = np.array([inference_res])
-            else:
-                transformed_numpy_array = inference_res

-
-
-
-
-
+            transformed_numpy_array, output_cols = handle_inference_result(
+                inference_res=inference_res,
+                output_cols=expected_output_cols,
+                inference_method=inference_method,
+                within_udf=True,
+            )

             if len(transformed_numpy_array.shape) > 1:
-                if transformed_numpy_array.shape[1] != len(
-                # HeterogeneousEnsemble's transform method produce results with variying shapes
-                # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes).
-                # It is hard to predict the response shape without using fragile introspection logic.
-                # So, to avoid that we are packing the results into a dataframe of shape (n_samples, 1) with
-                # each element being a list.
-                if len(expected_output_cols) != 1:
-                    raise TypeError(
-                        "expected_output_cols must be same length as transformed array or " "should be of length 1"
-                    )
+                if transformed_numpy_array.shape[1] != len(output_cols):
                     series = pd.Series(transformed_numpy_array.tolist())
-                    transformed_pandas_df = pd.DataFrame(series, columns=
+                    transformed_pandas_df = pd.DataFrame(series, columns=output_cols)
                 else:
-                    transformed_pandas_df = pd.DataFrame(transformed_numpy_array.tolist(), columns=
+                    transformed_pandas_df = pd.DataFrame(transformed_numpy_array.tolist(), columns=output_cols)
             else:
-                transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=
+                transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=output_cols)

             return transformed_pandas_df.to_dict("records")  # type: ignore[no-any-return]

-
-
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-        ]
-
-
-
-
-
-
-
-
-
-
-
-
-            SELECT
-                {outer_select_stmt}
-            FROM (
-                SELECT
-                    {inner_select_stmt}
-                FROM {input_table_name}
-            )
-            """.format(
-                input_table_name=batch_inference_table_name,
-                query=query_from_df,
-                outer_select_stmt=", ".join(outer_select_list),
-                inner_select_stmt=", ".join(inner_select_list),
-            )
-
-        return session.sql(sql)
+        # Run Transform and get intermediate result
+        INTERMEDIATE_OBJ_NAME = "tmp_result"
+        # Use snowpark_cols can make sure the name ordering of the input dataframe
+        # and only select those columns to put into vectorized udf
+        output_obj = F.call_udf(batch_inference_udf_name, [F.col(col_name) for col_name in snowpark_cols])
+        df_res: DataFrame = dataset.with_column(INTERMEDIATE_OBJ_NAME, output_obj)
+
+        # Prepare the output
+        output_cols = []
+        output_col_names = []
+        # When there is no expected_output_cols_type, default set it as StringType
+        # snowpark cannot handle empty string, so this step give "string" value
+        if expected_output_cols_type == "":
+            expected_output_cols_type = "string"
+        assert expected_output_cols_type is not None
+        for output_feature in expected_output_cols:
+            output_cols.append(F.col(INTERMEDIATE_OBJ_NAME)[output_feature].astype(expected_output_cols_type))
+            output_col_names.append(identifier.get_inferred_name(output_feature))
+
+        # Extract output from INTERMEDIATE_OBJ_NAME and drop that column
+        df_res = df_res.with_columns(
+            output_col_names,
+            output_cols,
+        ).drop(INTERMEDIATE_OBJ_NAME)
+
+        if drop_input_cols:
+            df_res = df_res.drop(*input_cols)
+
+        return df_res

     def score(
         self,
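The rewritten body above is the heart of the 1.4.0 change: inference results are packed into one dict per row by a vectorized UDF whose input is explicitly typed with PandasDataFrameType, and typed output columns are then extracted from the intermediate dict column. A condensed sketch of that pattern, assuming a live Snowpark session (the echo "inference" and column names are placeholders, not the library's code):

from typing import List

import pandas as pd
from snowflake.snowpark import DataFrame, Session, functions as F, types as T

def demo_vectorized_inference(session: Session, dataset: DataFrame, input_cols: List[str]) -> DataFrame:
    # Infer input datatypes from the schema, as the new code does before registration.
    input_datatypes = [f.datatype for f in dataset.select(input_cols).schema.fields]

    @F.pandas_udf(
        is_permanent=False,
        replace=True,
        session=session,
        packages=["pandas"],
        input_types=[T.PandasDataFrameType(input_datatypes)],
    )
    def vec_infer(df: pd.DataFrame) -> T.PandasSeries[dict]:  # type: ignore[type-arg]
        # Stand-in inference: emit the first input column as the per-row prediction dict.
        return pd.Series(pd.DataFrame({"OUTPUT_PREDICTION": df.iloc[:, 0]}).to_dict("records"))

    # Intermediate dict column, then typed extraction and cleanup.
    tmp = dataset.with_column("TMP_RESULT", vec_infer(*[F.col(c) for c in input_cols]))
    return tmp.with_column(
        "OUTPUT_PREDICTION", F.col("TMP_RESULT")["OUTPUT_PREDICTION"].astype("float")
    ).drop("TMP_RESULT")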
@@ -287,7 +240,7 @@ class SnowparkTransformHandlers:
             function_name=telemetry.get_statement_params_full_func_name(
                 inspect.currentframe(), self.__class__.__name__
             ),
-            api_calls=[sproc],
+            api_calls=[F.sproc],
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
         # Put locally serialized score on stage.
@@ -299,7 +252,7 @@ class SnowparkTransformHandlers:
             statement_params=statement_params,
         )

-        @sproc(  # type: ignore[misc]
+        @F.sproc(  # type: ignore[misc]
             is_permanent=False,
             name=score_sproc_name,
             packages=dependencies,  # type: ignore[arg-type]
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py

@@ -279,7 +279,7 @@ class SnowparkModelTrainer:
     def _build_fit_predict_wrapper_sproc(
         self,
         model_spec: ModelSpecifications,
-    ) -> Callable[[Session, List[str], str, str, List[str], Dict[str, str],
+    ) -> Callable[[Session, List[str], str, str, List[str], Dict[str, str], bool, List[str], str], str]:
         """
         Constructs and returns a python stored procedure function to be used for training model.

@@ -299,7 +299,7 @@ class SnowparkModelTrainer:
             stage_result_file_name: str,
             input_cols: List[str],
             statement_params: Dict[str, str],
-
+            drop_input_cols: bool,
             expected_output_cols_list: List[str],
             fit_predict_result_name: str,
         ) -> str:
@@ -345,12 +345,12 @@ class SnowparkModelTrainer:
             )

             # store the predict output
-            if
-                df = df.copy()
+            if drop_input_cols:
                 fit_predict_result_pd = pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list)
-                fit_predict_result_pd = pd.concat([df, fit_predict_result_pd], axis=1)
             else:
+                df = df.copy()
                 fit_predict_result_pd = pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list)
+                fit_predict_result_pd = pd.concat([df, fit_predict_result_pd], axis=1)

             # write into a temp table in sproc and load the table from outside
             session.write_pandas(
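The corrected branch logic reads more clearly in isolation. A standalone pandas sketch of the two paths (df and fit_predict_result are stand-ins for the sproc's local variables):

import numpy as np
import pandas as pd

df = pd.DataFrame({"X1": [1.0, 2.0]})
fit_predict_result = np.array([[0], [1]])
expected_output_cols_list = ["OUTPUT_LABEL"]

for drop_input_cols in (True, False):
    if drop_input_cols:
        # Only the prediction columns survive.
        out = pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list)
    else:
        # Predictions are appended to a copy of the input frame.
        out = pd.concat(
            [df.copy(), pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list)],
            axis=1,
        )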
@@ -463,18 +463,18 @@ class SnowparkModelTrainer:

     def train_fit_predict(
         self,
-        pass_through_columns: List[str],
         expected_output_cols_list: List[str],
+        drop_input_cols: Optional[bool] = False,
     ) -> Tuple[Union[DataFrame, pd.DataFrame], object]:
         """Trains the model by pushing down the compute into Snowflake using stored procedures.
         This API is different from fit itself because it would also provide the predict
         output.

         Args:
-            pass_through_columns (List[str]): The column names that would
-                display in the returned dataset.
             expected_output_cols_list (List[str]): The output columns
                 name as a list. Defaults to None.
+            drop_input_cols (Optional[bool]): Boolean to determine drop
+                the input columns from the output dataset or not

         Returns:
             Tuple[Union[DataFrame, pd.DataFrame], object]: [predicted dataset, estimator]
@@ -508,7 +508,7 @@ class SnowparkModelTrainer:
             stage_result_file_name,
             self.input_cols,
             statement_params,
-
+            drop_input_cols,
             expected_output_cols_list,
             fit_predict_result_name,
         )
snowflake/ml/modeling/_internal/transformer_protocols.py

@@ -107,9 +107,9 @@ class RemoteModelTransformHandlers(Protocol):
         inference_method: str,
         input_cols: List[str],
         expected_output_cols: List[str],
-        pass_through_cols: List[str],
         session: snowpark.Session,
         dependencies: List[str],
+        drop_input_cols: Optional[bool] = False,
         expected_output_cols_type: Optional[str] = "",
         *args: Any,
         **kwargs: Any,
@@ -121,9 +121,9 @@ class RemoteModelTransformHandlers(Protocol):
             dependencies: List of dependencies for the transformer.
             inference_method: the name of the method used by `estimator` to run inference.
             input_cols: List of feature columns for inference.
-            pass_through_cols: columns in the dataset not used in inference.
             expected_output_cols: column names (in order) of the output dataset.
             expected_output_cols_type: Expected type of the output columns.
+            drop_input_cols: Boolean to determine drop the input columns from the output dataset or not
             args: additional positional arguments.
             kwargs: additional keyword args.

@@ -175,7 +175,6 @@ class BatchInferenceKwargsTypedDict(TypedDict, total=False):

     snowpark_input_cols: Optional[List[str]]
     drop_input_cols: Optional[bool]
-    pass_through_cols: List[str]
     session: snowpark.Session
     dependencies: List[str]
     expected_output_cols_type: str
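Because the TypedDict is declared with total=False, every key stays optional; dropping pass_through_cols means drop_input_cols alone now controls the output shape. A minimal sketch of the same declaration style (abbreviated to keys shown in this hunk; the class name is shortened for the sketch):

from typing import List, Optional, TypedDict

class BatchInferenceKwargs(TypedDict, total=False):
    snowpark_input_cols: Optional[List[str]]
    drop_input_cols: Optional[bool]
    dependencies: List[str]
    expected_output_cols_type: str

kwargs: BatchInferenceKwargs = {"drop_input_cols": True, "expected_output_cols_type": "float"}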
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py (the same +33/-61 change is repeated across the generated estimators listed above)

@@ -328,18 +328,24 @@ class CalibratedClassifierCV(BaseTransformer):
         self._get_model_signatures(dataset)
         return self

-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -413,7 +419,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )

@@ -473,16 +479,16 @@ class CalibratedClassifierCV(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "
+            expected_dtype = "array"

             # If we were unable to assign a type to this transform in the factory, infer the type here.
             if expected_dtype == "":
-                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
                 if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                    expected_dtype = "
-                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "
+                    expected_dtype = "array"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
                 elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                    expected_dtype = "
+                    expected_dtype = "array"
                 else:
                     output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                     # We can only infer the output types from the input types if the following two statemetns are true:
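The restored comments spell out the rule: when a clustering or decomposition transformer emits a different number of values per row than there are output columns, results are packed into a single "array" column rather than one column per value. A self-contained sketch of that check (sklearn's KMeans stands in for self._sklearn_object; the function name is illustrative):

from sklearn.cluster import KMeans

def infer_expected_dtype(sklearn_object, output_cols):
    # Mirrors the n_clusters / n_components comparison in the hunk above.
    if hasattr(sklearn_object, "n_clusters") and sklearn_object.n_clusters != len(output_cols):
        return "array"
    if hasattr(sklearn_object, "n_components") and getattr(sklearn_object, "n_components") != len(output_cols):
        return "array"
    return ""

assert infer_expected_dtype(KMeans(n_clusters=8), ["OUTPUT_0"]) == "array"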
@@ -500,7 +506,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )

@@ -551,7 +557,7 @@ class CalibratedClassifierCV(BaseTransformer):
             subproject=_SUBPROJECT,
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
-
+            drop_input_cols=self._drop_input_cols,
             expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
         )
         self._sklearn_object = fitted_estimator
@@ -569,44 +575,6 @@ class CalibratedClassifierCV(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_

-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-        else:
-            output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None  # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -648,7 +616,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )

@@ -715,7 +683,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -776,7 +744,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )

@@ -841,7 +809,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )

@@ -897,13 +865,17 @@ class CalibratedClassifierCV(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()

         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session)  # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self.
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
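score() now validates the Snowpark path up front and reuses the validation result: the returned list of Anaconda-channel packages becomes the dependency set for the score sproc, prepended with snowflake-snowpark-python. A sketch of that flow under assumed names (the estimator argument stands in for self; the function is illustrative, not the library's API):

from typing import Any, Dict, List

def build_score_kwargs(estimator: Any, dataset: Any) -> Dict[str, Any]:
    # Validation doubles as dependency resolution against the Snowflake Anaconda channel.
    deps: List[str] = estimator._batch_inference_validate_snowpark(
        dataset=dataset, inference_method="score"
    )
    return dict(
        session=dataset._session,
        dependencies=["snowflake-snowpark-python"] + deps,
        score_sproc_imports=["sklearn"],
    )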
@@ -977,9 +949,9 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-
-                expected_output_cols_type
-                n_neighbors =
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
|