snowflake-ml-python 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/_complete.py +1 -1
- snowflake/cortex/_extract_answer.py +1 -1
- snowflake/cortex/_sentiment.py +1 -1
- snowflake/cortex/_summarize.py +1 -1
- snowflake/cortex/_translate.py +1 -1
- snowflake/ml/_internal/env_utils.py +68 -6
- snowflake/ml/_internal/file_utils.py +34 -4
- snowflake/ml/_internal/telemetry.py +79 -91
- snowflake/ml/_internal/utils/identifier.py +78 -72
- snowflake/ml/_internal/utils/retryable_http.py +16 -4
- snowflake/ml/_internal/utils/spcs_attribution_utils.py +122 -0
- snowflake/ml/dataset/dataset.py +1 -1
- snowflake/ml/model/_api.py +21 -14
- snowflake/ml/model/_client/model/model_impl.py +176 -0
- snowflake/ml/model/_client/model/model_method_info.py +19 -0
- snowflake/ml/model/_client/model/model_version_impl.py +291 -0
- snowflake/ml/model/_client/ops/metadata_ops.py +107 -0
- snowflake/ml/model/_client/ops/model_ops.py +308 -0
- snowflake/ml/model/_client/sql/model.py +75 -0
- snowflake/ml/model/_client/sql/model_version.py +213 -0
- snowflake/ml/model/_client/sql/stage.py +40 -0
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -4
- snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template +24 -8
- snowflake/ml/model/_deploy_client/image_builds/templates/kaniko_shell_script_template +23 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +14 -2
- snowflake/ml/model/_deploy_client/utils/constants.py +1 -0
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +2 -2
- snowflake/ml/model/_model_composer/model_composer.py +31 -9
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +25 -10
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +2 -2
- snowflake/ml/model/_model_composer/model_method/infer_function.py_template +2 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +34 -3
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +1 -1
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +3 -1
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +10 -28
- snowflake/ml/model/_packager/model_meta/model_meta.py +18 -16
- snowflake/ml/model/_signatures/snowpark_handler.py +1 -1
- snowflake/ml/model/model_signature.py +108 -53
- snowflake/ml/model/type_hints.py +1 -0
- snowflake/ml/modeling/_internal/distributed_hpo_trainer.py +554 -0
- snowflake/ml/modeling/_internal/estimator_protocols.py +1 -60
- snowflake/ml/modeling/_internal/model_specifications.py +146 -0
- snowflake/ml/modeling/_internal/model_trainer.py +13 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +78 -0
- snowflake/ml/modeling/_internal/pandas_trainer.py +54 -0
- snowflake/ml/modeling/_internal/snowpark_handlers.py +6 -760
- snowflake/ml/modeling/_internal/snowpark_trainer.py +331 -0
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +108 -135
- snowflake/ml/modeling/cluster/affinity_propagation.py +106 -135
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +106 -135
- snowflake/ml/modeling/cluster/birch.py +106 -135
- snowflake/ml/modeling/cluster/bisecting_k_means.py +106 -135
- snowflake/ml/modeling/cluster/dbscan.py +106 -135
- snowflake/ml/modeling/cluster/feature_agglomeration.py +106 -135
- snowflake/ml/modeling/cluster/k_means.py +105 -135
- snowflake/ml/modeling/cluster/mean_shift.py +106 -135
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +105 -135
- snowflake/ml/modeling/cluster/optics.py +106 -135
- snowflake/ml/modeling/cluster/spectral_biclustering.py +106 -135
- snowflake/ml/modeling/cluster/spectral_clustering.py +106 -135
- snowflake/ml/modeling/cluster/spectral_coclustering.py +106 -135
- snowflake/ml/modeling/compose/column_transformer.py +106 -135
- snowflake/ml/modeling/compose/transformed_target_regressor.py +108 -135
- snowflake/ml/modeling/covariance/elliptic_envelope.py +106 -135
- snowflake/ml/modeling/covariance/empirical_covariance.py +99 -128
- snowflake/ml/modeling/covariance/graphical_lasso.py +106 -135
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +106 -135
- snowflake/ml/modeling/covariance/ledoit_wolf.py +104 -133
- snowflake/ml/modeling/covariance/min_cov_det.py +106 -135
- snowflake/ml/modeling/covariance/oas.py +99 -128
- snowflake/ml/modeling/covariance/shrunk_covariance.py +103 -132
- snowflake/ml/modeling/decomposition/dictionary_learning.py +106 -135
- snowflake/ml/modeling/decomposition/factor_analysis.py +106 -135
- snowflake/ml/modeling/decomposition/fast_ica.py +106 -135
- snowflake/ml/modeling/decomposition/incremental_pca.py +106 -135
- snowflake/ml/modeling/decomposition/kernel_pca.py +106 -135
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +106 -135
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +106 -135
- snowflake/ml/modeling/decomposition/pca.py +106 -135
- snowflake/ml/modeling/decomposition/sparse_pca.py +106 -135
- snowflake/ml/modeling/decomposition/truncated_svd.py +106 -135
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +108 -135
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +108 -135
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +108 -135
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +108 -135
- snowflake/ml/modeling/ensemble/bagging_classifier.py +108 -135
- snowflake/ml/modeling/ensemble/bagging_regressor.py +108 -135
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +108 -135
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +108 -135
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +108 -135
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +108 -135
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +108 -135
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +108 -135
- snowflake/ml/modeling/ensemble/isolation_forest.py +106 -135
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +108 -135
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +108 -135
- snowflake/ml/modeling/ensemble/stacking_regressor.py +108 -135
- snowflake/ml/modeling/ensemble/voting_classifier.py +108 -135
- snowflake/ml/modeling/ensemble/voting_regressor.py +108 -135
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +101 -128
- snowflake/ml/modeling/feature_selection/select_fdr.py +99 -126
- snowflake/ml/modeling/feature_selection/select_fpr.py +99 -126
- snowflake/ml/modeling/feature_selection/select_fwe.py +99 -126
- snowflake/ml/modeling/feature_selection/select_k_best.py +100 -127
- snowflake/ml/modeling/feature_selection/select_percentile.py +99 -126
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +106 -135
- snowflake/ml/modeling/feature_selection/variance_threshold.py +95 -124
- snowflake/ml/modeling/framework/base.py +83 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +108 -135
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +108 -135
- snowflake/ml/modeling/impute/iterative_imputer.py +106 -135
- snowflake/ml/modeling/impute/knn_imputer.py +106 -135
- snowflake/ml/modeling/impute/missing_indicator.py +106 -135
- snowflake/ml/modeling/impute/simple_imputer.py +9 -1
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +96 -125
- snowflake/ml/modeling/kernel_approximation/nystroem.py +106 -135
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +106 -135
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +105 -134
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +103 -132
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +108 -135
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +90 -118
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +90 -118
- snowflake/ml/modeling/linear_model/ard_regression.py +108 -135
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +108 -135
- snowflake/ml/modeling/linear_model/elastic_net.py +108 -135
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +108 -135
- snowflake/ml/modeling/linear_model/gamma_regressor.py +108 -135
- snowflake/ml/modeling/linear_model/huber_regressor.py +108 -135
- snowflake/ml/modeling/linear_model/lars.py +108 -135
- snowflake/ml/modeling/linear_model/lars_cv.py +108 -135
- snowflake/ml/modeling/linear_model/lasso.py +108 -135
- snowflake/ml/modeling/linear_model/lasso_cv.py +108 -135
- snowflake/ml/modeling/linear_model/lasso_lars.py +108 -135
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +108 -135
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +108 -135
- snowflake/ml/modeling/linear_model/linear_regression.py +108 -135
- snowflake/ml/modeling/linear_model/logistic_regression.py +108 -135
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +108 -135
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +108 -135
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +108 -135
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +108 -135
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +108 -135
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +108 -135
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +108 -135
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +107 -135
- snowflake/ml/modeling/linear_model/perceptron.py +107 -135
- snowflake/ml/modeling/linear_model/poisson_regressor.py +108 -135
- snowflake/ml/modeling/linear_model/ransac_regressor.py +108 -135
- snowflake/ml/modeling/linear_model/ridge.py +108 -135
- snowflake/ml/modeling/linear_model/ridge_classifier.py +108 -135
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +108 -135
- snowflake/ml/modeling/linear_model/ridge_cv.py +108 -135
- snowflake/ml/modeling/linear_model/sgd_classifier.py +108 -135
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +106 -135
- snowflake/ml/modeling/linear_model/sgd_regressor.py +108 -135
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +108 -135
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +108 -135
- snowflake/ml/modeling/manifold/isomap.py +106 -135
- snowflake/ml/modeling/manifold/mds.py +106 -135
- snowflake/ml/modeling/manifold/spectral_embedding.py +106 -135
- snowflake/ml/modeling/manifold/tsne.py +106 -135
- snowflake/ml/modeling/metrics/classification.py +196 -55
- snowflake/ml/modeling/metrics/correlation.py +4 -2
- snowflake/ml/modeling/metrics/covariance.py +7 -4
- snowflake/ml/modeling/metrics/ranking.py +32 -16
- snowflake/ml/modeling/metrics/regression.py +60 -32
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +106 -135
- snowflake/ml/modeling/mixture/gaussian_mixture.py +106 -135
- snowflake/ml/modeling/model_selection/grid_search_cv.py +91 -148
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +93 -154
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +105 -132
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +108 -135
- snowflake/ml/modeling/multiclass/output_code_classifier.py +108 -135
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +108 -135
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +108 -135
- snowflake/ml/modeling/naive_bayes/complement_nb.py +108 -135
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +98 -125
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +107 -134
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +108 -135
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +108 -135
- snowflake/ml/modeling/neighbors/kernel_density.py +106 -135
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +106 -135
- snowflake/ml/modeling/neighbors/nearest_centroid.py +108 -135
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +106 -135
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +108 -135
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +108 -135
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +108 -135
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +106 -135
- snowflake/ml/modeling/neural_network/mlp_classifier.py +108 -135
- snowflake/ml/modeling/neural_network/mlp_regressor.py +108 -135
- snowflake/ml/modeling/parameters/disable_distributed_hpo.py +2 -6
- snowflake/ml/modeling/preprocessing/binarizer.py +25 -8
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +9 -4
- snowflake/ml/modeling/preprocessing/label_encoder.py +31 -11
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +27 -9
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +42 -14
- snowflake/ml/modeling/preprocessing/normalizer.py +9 -4
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +26 -10
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +37 -13
- snowflake/ml/modeling/preprocessing/polynomial_features.py +106 -135
- snowflake/ml/modeling/preprocessing/robust_scaler.py +39 -13
- snowflake/ml/modeling/preprocessing/standard_scaler.py +36 -12
- snowflake/ml/modeling/semi_supervised/label_propagation.py +108 -135
- snowflake/ml/modeling/semi_supervised/label_spreading.py +108 -135
- snowflake/ml/modeling/svm/linear_svc.py +108 -135
- snowflake/ml/modeling/svm/linear_svr.py +108 -135
- snowflake/ml/modeling/svm/nu_svc.py +108 -135
- snowflake/ml/modeling/svm/nu_svr.py +108 -135
- snowflake/ml/modeling/svm/svc.py +108 -135
- snowflake/ml/modeling/svm/svr.py +108 -135
- snowflake/ml/modeling/tree/decision_tree_classifier.py +108 -135
- snowflake/ml/modeling/tree/decision_tree_regressor.py +108 -135
- snowflake/ml/modeling/tree/extra_tree_classifier.py +108 -135
- snowflake/ml/modeling/tree/extra_tree_regressor.py +108 -135
- snowflake/ml/modeling/xgboost/xgb_classifier.py +108 -136
- snowflake/ml/modeling/xgboost/xgb_regressor.py +108 -136
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +108 -136
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +108 -136
- snowflake/ml/registry/model_registry.py +2 -0
- snowflake/ml/registry/registry.py +215 -0
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.1.0.dist-info → snowflake_ml_python-1.1.2.dist-info}/METADATA +34 -1
- snowflake_ml_python-1.1.2.dist-info/RECORD +347 -0
- snowflake_ml_python-1.1.0.dist-info/RECORD +0 -331
- {snowflake_ml_python-1.1.0.dist-info → snowflake_ml_python-1.1.2.dist-info}/WHEEL +0 -0
@@ -40,10 +40,14 @@ def d2_absolute_error_score(
|
|
40
40
|
gets a :math:`D^2` score of 0.0.
|
41
41
|
|
42
42
|
Args:
|
43
|
-
df:
|
44
|
-
|
45
|
-
|
46
|
-
|
43
|
+
df: snowpark.DataFrame
|
44
|
+
Input dataframe.
|
45
|
+
y_true_col_names: string or list of strings
|
46
|
+
Column name(s) representing actual values.
|
47
|
+
y_pred_col_names: string or list of strings
|
48
|
+
Column name(s) representing predicted values.
|
49
|
+
sample_weight_col_name: string, default=None
|
50
|
+
Column name representing sample weights.
|
47
51
|
multioutput: {'raw_values', 'uniform_average'} or array-like of shape \
|
48
52
|
(n_outputs,), default='uniform_average'
|
49
53
|
Defines aggregating of multiple output values.
|
@@ -128,11 +132,16 @@ def d2_pinball_score(
|
|
128
132
|
gets a :math:`D^2` score of 0.0.
|
129
133
|
|
130
134
|
Args:
|
131
|
-
df:
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
135
|
+
df: snowpark.DataFrame
|
136
|
+
Input dataframe.
|
137
|
+
y_true_col_names: string or list of strings
|
138
|
+
Column name(s) representing actual values.
|
139
|
+
y_pred_col_names: string or list of strings
|
140
|
+
Column name(s) representing predicted values.
|
141
|
+
sample_weight_col_name: string, default=None
|
142
|
+
Column name representing sample weights.
|
143
|
+
alpha: float, default=0.5
|
144
|
+
Slope of the pinball deviance. It determines the quantile level
|
136
145
|
alpha for which the pinball deviance and also D2 are optimal.
|
137
146
|
The default `alpha=0.5` is equivalent to `d2_absolute_error_score`.
|
138
147
|
multioutput: {'raw_values', 'uniform_average'} or array-like of shape \
|
@@ -233,10 +242,14 @@ def explained_variance_score(
|
|
233
242
|
the :func:`R^2 score <r2_score>` should be preferred.
|
234
243
|
|
235
244
|
Args:
|
236
|
-
df:
|
237
|
-
|
238
|
-
|
239
|
-
|
245
|
+
df: snowpark.DataFrame
|
246
|
+
Input dataframe.
|
247
|
+
y_true_col_names: string or list of strings
|
248
|
+
Column name(s) representing actual values.
|
249
|
+
y_pred_col_names: string or list of strings
|
250
|
+
Column name(s) representing predicted values.
|
251
|
+
sample_weight_col_name: string, default=None
|
252
|
+
Column name representing sample weights.
|
240
253
|
multioutput: {'raw_values', 'uniform_average', 'variance_weighted'} or \
|
241
254
|
array-like of shape (n_outputs,), default='uniform_average'
|
242
255
|
Defines aggregating of multiple output values.
|
@@ -248,7 +261,8 @@ def explained_variance_score(
|
|
248
261
|
'variance_weighted':
|
249
262
|
Scores of all outputs are averaged, weighted by the variances
|
250
263
|
of each individual output.
|
251
|
-
force_finite:
|
264
|
+
force_finite: boolean, default=True
|
265
|
+
Flag indicating if ``NaN`` and ``-Inf`` scores resulting
|
252
266
|
from constant data should be replaced with real numbers (``1.0`` if
|
253
267
|
prediction is perfect, ``0.0`` otherwise). Default is ``True``, a
|
254
268
|
convenient setting for hyperparameters' search procedures (e.g. grid
|
@@ -323,10 +337,14 @@ def mean_absolute_error(
|
|
323
337
|
Mean absolute error regression loss.
|
324
338
|
|
325
339
|
Args:
|
326
|
-
df:
|
327
|
-
|
328
|
-
|
329
|
-
|
340
|
+
df: snowpark.DataFrame
|
341
|
+
Input dataframe.
|
342
|
+
y_true_col_names: string or list of strings
|
343
|
+
Column name(s) representing actual values.
|
344
|
+
y_pred_col_names: string or list of strings
|
345
|
+
Column name(s) representing predicted values.
|
346
|
+
sample_weight_col_name: string, default=None
|
347
|
+
Column name representing sample weights.
|
330
348
|
multioutput: {'raw_values', 'uniform_average'} or array-like of shape \
|
331
349
|
(n_outputs,), default='uniform_average'
|
332
350
|
Defines aggregating of multiple output values.
|
@@ -398,10 +416,14 @@ def mean_absolute_percentage_error(
|
|
398
416
|
regression metrics).
|
399
417
|
|
400
418
|
Args:
|
401
|
-
df:
|
402
|
-
|
403
|
-
|
404
|
-
|
419
|
+
df: snowpark.DataFrame
|
420
|
+
Input dataframe.
|
421
|
+
y_true_col_names: string or list of strings
|
422
|
+
Column name(s) representing actual values.
|
423
|
+
y_pred_col_names: string or list of strings
|
424
|
+
Column name(s) representing predicted values.
|
425
|
+
sample_weight_col_name: string, default=None
|
426
|
+
Column name representing sample weights.
|
405
427
|
multioutput: {'raw_values', 'uniform_average'} or array-like of shape \
|
406
428
|
(n_outputs,), default='uniform_average'
|
407
429
|
Defines aggregating of multiple output values.
|
@@ -472,10 +494,14 @@ def mean_squared_error(
|
|
472
494
|
Mean squared error regression loss.
|
473
495
|
|
474
496
|
Args:
|
475
|
-
df:
|
476
|
-
|
477
|
-
|
478
|
-
|
497
|
+
df: snowpark.DataFrame
|
498
|
+
Input dataframe.
|
499
|
+
y_true_col_names: string or list of strings
|
500
|
+
Column name(s) representing actual values.
|
501
|
+
y_pred_col_names: string or list of strings
|
502
|
+
Column name(s) representing predicted values.
|
503
|
+
sample_weight_col_name: string, default=None
|
504
|
+
Column name representing sample weights.
|
479
505
|
multioutput: {'raw_values', 'uniform_average'} or array-like of shape \
|
480
506
|
(n_outputs,), default='uniform_average'
|
481
507
|
Defines aggregating of multiple output values.
|
@@ -484,7 +510,8 @@ def mean_squared_error(
|
|
484
510
|
Returns a full set of errors in case of multioutput input.
|
485
511
|
'uniform_average':
|
486
512
|
Errors of all outputs are averaged with uniform weight.
|
487
|
-
squared:
|
513
|
+
squared: boolean, default=True
|
514
|
+
If True returns MSE value, if False returns RMSE value.
|
488
515
|
|
489
516
|
Returns:
|
490
517
|
loss: float or ndarray of floats
|
@@ -538,12 +565,13 @@ def r2_score(*, df: snowpark.DataFrame, y_true_col_name: str, y_pred_col_name: s
|
|
538
565
|
non-constant, a constant model that always predicts the average y
|
539
566
|
disregarding the input features would get a :math:`R^2` score of 0.0.
|
540
567
|
|
541
|
-
TODO(pdorairaj): Implement other params from sklearn - sample_weight, multi_output, force_finite.
|
542
|
-
|
543
568
|
Args:
|
544
|
-
df:
|
545
|
-
|
546
|
-
|
569
|
+
df: snowpark.DataFrame
|
570
|
+
Input dataframe.
|
571
|
+
y_true_col_name: string
|
572
|
+
Column name representing actual values.
|
573
|
+
y_pred_col_name: string
|
574
|
+
Column name representing predicted values.
|
547
575
|
|
548
576
|
Returns:
|
549
577
|
R squared metric.
|
@@ -22,17 +22,19 @@ from sklearn.utils.metaestimators import available_if
|
|
22
22
|
from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
23
23
|
from snowflake.ml._internal import telemetry
|
24
24
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
25
|
+
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
25
26
|
from snowflake.ml._internal.utils import pkg_version_utils, identifier
|
26
|
-
from snowflake.snowpark import DataFrame
|
27
|
+
from snowflake.snowpark import DataFrame, Session
|
27
28
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
28
29
|
from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
|
30
|
+
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
|
+
from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
|
29
32
|
from snowflake.ml.modeling._internal.estimator_utils import (
|
30
33
|
gather_dependencies,
|
31
34
|
original_estimator_has_callable,
|
32
35
|
transform_snowml_obj_to_sklearn_obj,
|
33
36
|
validate_sklearn_args,
|
34
37
|
)
|
35
|
-
from snowflake.ml.modeling._internal.snowpark_handlers import SklearnWrapperProvider
|
36
38
|
from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
|
37
39
|
|
38
40
|
from snowflake.ml.model.model_signature import (
|
@@ -52,7 +54,6 @@ _PROJECT = "ModelDevelopment"
|
|
52
54
|
_SUBPROJECT = "".join([s.capitalize() for s in "sklearn.mixture".replace("sklearn.", "").split("_")])
|
53
55
|
|
54
56
|
|
55
|
-
|
56
57
|
class BayesianGaussianMixture(BaseTransformer):
|
57
58
|
r"""Variational Bayesian estimation of a Gaussian mixture
|
58
59
|
For more details on this class, see [sklearn.mixture.BayesianGaussianMixture]
|
@@ -60,6 +61,49 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
60
61
|
|
61
62
|
Parameters
|
62
63
|
----------
|
64
|
+
|
65
|
+
input_cols: Optional[Union[str, List[str]]]
|
66
|
+
A string or list of strings representing column names that contain features.
|
67
|
+
If this parameter is not specified, all columns in the input DataFrame except
|
68
|
+
the columns specified by label_cols, sample_weight_col, and passthrough_cols
|
69
|
+
parameters are considered input columns. Input columns can also be set after
|
70
|
+
initialization with the `set_input_cols` method.
|
71
|
+
|
72
|
+
label_cols: Optional[Union[str, List[str]]]
|
73
|
+
This parameter is optional and will be ignored during fit. It is present here for API consistency by convention.
|
74
|
+
|
75
|
+
output_cols: Optional[Union[str, List[str]]]
|
76
|
+
A string or list of strings representing column names that will store the
|
77
|
+
output of predict and transform operations. The length of output_cols must
|
78
|
+
match the expected number of output columns from the specific predictor or
|
79
|
+
transformer class used.
|
80
|
+
If you omit this parameter, output column names are derived by adding an
|
81
|
+
OUTPUT_ prefix to the label column names for supervised estimators, or
|
82
|
+
OUTPUT_<IDX> for unsupervised estimators. These inferred output column names
|
83
|
+
work for predictors, but output_cols must be set explicitly for transformers.
|
84
|
+
In general, explicitly specifying output column names is clearer, especially
|
85
|
+
if you don’t specify the input column names.
|
86
|
+
To transform in place, pass the same names for input_cols and output_cols.
|
87
|
+
Output columns can also be set after
|
88
|
+
initialization with the `set_output_cols` method.
|
89
|
+
|
90
|
+
sample_weight_col: Optional[str]
|
91
|
+
A string representing the column name containing the sample weights.
|
92
|
+
This argument is only required when working with weighted datasets. Sample
|
93
|
+
weight column can also be set after initialization with the
|
94
|
+
`set_sample_weight_col` method.
|
95
|
+
|
96
|
+
passthrough_cols: Optional[Union[str, List[str]]]
|
97
|
+
A string or a list of strings indicating column names to be excluded from any
|
98
|
+
operations (such as train, transform, or inference). These specified column(s)
|
99
|
+
will remain untouched throughout the process. This option is helpful in scenarios
|
100
|
+
requiring automatic input_cols inference, but need to avoid using specific
|
101
|
+
columns, like index columns, during training or inference. Passthrough columns
|
102
|
+
can also be set after initialization with the `set_passthrough_cols` method.
|
103
|
+
|
104
|
+
drop_input_cols: Optional[bool], default=False
|
105
|
+
If set, the response of predict(), transform() methods will not contain input columns.
|
106
|
+
|
63
107
|
n_components: int, default=1
|
64
108
|
The number of mixture components. Depending on the data and the value
|
65
109
|
of the `weight_concentration_prior` the model can decide to not use
|
@@ -161,35 +205,6 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
161
205
|
|
162
206
|
verbose_interval: int, default=10
|
163
207
|
Number of iterations done before the next print.
|
164
|
-
|
165
|
-
input_cols: Optional[Union[str, List[str]]]
|
166
|
-
A string or list of strings representing column names that contain features.
|
167
|
-
If this parameter is not specified, all columns in the input DataFrame except
|
168
|
-
the columns specified by label_cols and sample_weight_col parameters are
|
169
|
-
considered input columns.
|
170
|
-
|
171
|
-
label_cols: Optional[Union[str, List[str]]]
|
172
|
-
A string or list of strings representing column names that contain labels.
|
173
|
-
This is a required param for estimators, as there is no way to infer these
|
174
|
-
columns. If this parameter is not specified, then object is fitted without
|
175
|
-
labels (like a transformer).
|
176
|
-
|
177
|
-
output_cols: Optional[Union[str, List[str]]]
|
178
|
-
A string or list of strings representing column names that will store the
|
179
|
-
output of predict and transform operations. The length of output_cols must
|
180
|
-
match the expected number of output columns from the specific estimator or
|
181
|
-
transformer class used.
|
182
|
-
If this parameter is not specified, output column names are derived by
|
183
|
-
adding an OUTPUT_ prefix to the label column names. These inferred output
|
184
|
-
column names work for estimator's predict() method, but output_cols must
|
185
|
-
be set explicitly for transformers.
|
186
|
-
|
187
|
-
sample_weight_col: Optional[str]
|
188
|
-
A string representing the column name containing the sample weights.
|
189
|
-
This argument is only required when working with weighted datasets.
|
190
|
-
|
191
|
-
drop_input_cols: Optional[bool], default=False
|
192
|
-
If set, the response of predict(), transform() methods will not contain input columns.
|
193
208
|
"""
|
194
209
|
|
195
210
|
def __init__( # type: ignore[no-untyped-def]
|
@@ -215,6 +230,7 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
215
230
|
input_cols: Optional[Union[str, Iterable[str]]] = None,
|
216
231
|
output_cols: Optional[Union[str, Iterable[str]]] = None,
|
217
232
|
label_cols: Optional[Union[str, Iterable[str]]] = None,
|
233
|
+
passthrough_cols: Optional[Union[str, Iterable[str]]] = None,
|
218
234
|
drop_input_cols: Optional[bool] = False,
|
219
235
|
sample_weight_col: Optional[str] = None,
|
220
236
|
) -> None:
|
@@ -223,9 +239,10 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
223
239
|
self.set_input_cols(input_cols)
|
224
240
|
self.set_output_cols(output_cols)
|
225
241
|
self.set_label_cols(label_cols)
|
242
|
+
self.set_passthrough_cols(passthrough_cols)
|
226
243
|
self.set_drop_input_cols(drop_input_cols)
|
227
244
|
self.set_sample_weight_col(sample_weight_col)
|
228
|
-
deps = set(
|
245
|
+
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
229
246
|
|
230
247
|
self._deps = list(deps)
|
231
248
|
|
@@ -250,13 +267,14 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
250
267
|
args=init_args,
|
251
268
|
klass=sklearn.mixture.BayesianGaussianMixture
|
252
269
|
)
|
253
|
-
self._sklearn_object = sklearn.mixture.BayesianGaussianMixture(
|
270
|
+
self._sklearn_object: Any = sklearn.mixture.BayesianGaussianMixture(
|
254
271
|
**cleaned_up_init_args,
|
255
272
|
)
|
256
273
|
self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
|
257
274
|
# If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
|
258
275
|
self._snowpark_cols: Optional[List[str]] = self.input_cols
|
259
|
-
self._handlers: FitPredictHandlers = HandlersImpl(class_name=BayesianGaussianMixture.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True
|
276
|
+
self._handlers: FitPredictHandlers = HandlersImpl(class_name=BayesianGaussianMixture.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
|
277
|
+
self._autogenerated = True
|
260
278
|
|
261
279
|
def _get_rand_id(self) -> str:
|
262
280
|
"""
|
@@ -267,24 +285,6 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
267
285
|
"""
|
268
286
|
return str(uuid4()).replace("-", "_").upper()
|
269
287
|
|
270
|
-
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
271
|
-
"""
|
272
|
-
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
273
|
-
|
274
|
-
Args:
|
275
|
-
dataset: Input dataset.
|
276
|
-
"""
|
277
|
-
if not self.input_cols:
|
278
|
-
cols = [
|
279
|
-
c for c in dataset.columns
|
280
|
-
if c not in self.get_label_cols() and c != self.sample_weight_col
|
281
|
-
]
|
282
|
-
self.set_input_cols(input_cols=cols)
|
283
|
-
|
284
|
-
if not self.output_cols:
|
285
|
-
cols = [identifier.concat_names(ids=['OUTPUT_', c]) for c in self.label_cols]
|
286
|
-
self.set_output_cols(output_cols=cols)
|
287
|
-
|
288
288
|
def set_input_cols(self, input_cols: Optional[Union[str, Iterable[str]]]) -> "BayesianGaussianMixture":
|
289
289
|
"""
|
290
290
|
Input columns setter.
|
@@ -330,54 +330,48 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
330
330
|
self
|
331
331
|
"""
|
332
332
|
self._infer_input_output_cols(dataset)
|
333
|
-
if isinstance(dataset,
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
self.
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
333
|
+
if isinstance(dataset, DataFrame):
|
334
|
+
session = dataset._session
|
335
|
+
assert session is not None # keep mypy happy
|
336
|
+
# Validate that key package version in user workspace are supported in snowflake conda channel
|
337
|
+
# If customer doesn't have package in conda channel, replace the ones have the closest versions
|
338
|
+
self._deps = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
|
339
|
+
pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
|
340
|
+
|
341
|
+
# Specify input columns so column pruning will be enforced
|
342
|
+
selected_cols = self._get_active_columns()
|
343
|
+
if len(selected_cols) > 0:
|
344
|
+
dataset = dataset.select(selected_cols)
|
345
|
+
|
346
|
+
self._snowpark_cols = dataset.select(self.input_cols).columns
|
347
|
+
|
348
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
349
|
+
if SNOWML_SPROC_ENV in os.environ:
|
350
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
351
|
+
project=_PROJECT,
|
352
|
+
subproject=_SUBPROJECT,
|
353
|
+
function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), BayesianGaussianMixture.__class__.__name__),
|
354
|
+
api_calls=[Session.call],
|
355
|
+
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
356
|
+
)
|
357
|
+
pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
|
358
|
+
pd_df.columns = dataset.columns
|
359
|
+
dataset = pd_df
|
360
|
+
|
361
|
+
model_trainer = ModelTrainerBuilder.build(
|
362
|
+
estimator=self._sklearn_object,
|
363
|
+
dataset=dataset,
|
364
|
+
input_cols=self.input_cols,
|
365
|
+
label_cols=self.label_cols,
|
366
|
+
sample_weight_col=self.sample_weight_col,
|
367
|
+
autogenerated=self._autogenerated,
|
368
|
+
subproject=_SUBPROJECT
|
369
|
+
)
|
370
|
+
self._sklearn_object = model_trainer.train()
|
349
371
|
self._is_fitted = True
|
350
372
|
self._get_model_signatures(dataset)
|
351
373
|
return self
|
352
374
|
|
353
|
-
def _fit_snowpark(self, dataset: DataFrame) -> None:
|
354
|
-
session = dataset._session
|
355
|
-
assert session is not None # keep mypy happy
|
356
|
-
# Validate that key package version in user workspace are supported in snowflake conda channel
|
357
|
-
# If customer doesn't have package in conda channel, replace the ones have the closest versions
|
358
|
-
self._deps = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
|
359
|
-
pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
|
360
|
-
|
361
|
-
# Specify input columns so column pruning will be enforced
|
362
|
-
selected_cols = self._get_active_columns()
|
363
|
-
if len(selected_cols) > 0:
|
364
|
-
dataset = dataset.select(selected_cols)
|
365
|
-
|
366
|
-
estimator = self._sklearn_object
|
367
|
-
assert estimator is not None # Keep mypy happy
|
368
|
-
|
369
|
-
self._snowpark_cols = dataset.select(self.input_cols).columns
|
370
|
-
|
371
|
-
self._sklearn_object = self._handlers.fit_snowpark(
|
372
|
-
dataset,
|
373
|
-
session,
|
374
|
-
estimator,
|
375
|
-
["snowflake-snowpark-python"] + self._get_dependencies(),
|
376
|
-
self.input_cols,
|
377
|
-
self.label_cols,
|
378
|
-
self.sample_weight_col,
|
379
|
-
)
|
380
|
-
|
381
375
|
def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
|
382
376
|
if self._drop_input_cols:
|
383
377
|
return []
|
@@ -565,11 +559,6 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
565
559
|
subproject=_SUBPROJECT,
|
566
560
|
custom_tags=dict([("autogen", True)]),
|
567
561
|
)
|
568
|
-
@telemetry.add_stmt_params_to_df(
|
569
|
-
project=_PROJECT,
|
570
|
-
subproject=_SUBPROJECT,
|
571
|
-
custom_tags=dict([("autogen", True)]),
|
572
|
-
)
|
573
562
|
def predict(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[DataFrame, pd.DataFrame]:
|
574
563
|
"""Predict the labels for the data samples in X using trained model
|
575
564
|
For more details on this function, see [sklearn.mixture.BayesianGaussianMixture.predict]
|
@@ -623,11 +612,6 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
623
612
|
subproject=_SUBPROJECT,
|
624
613
|
custom_tags=dict([("autogen", True)]),
|
625
614
|
)
|
626
|
-
@telemetry.add_stmt_params_to_df(
|
627
|
-
project=_PROJECT,
|
628
|
-
subproject=_SUBPROJECT,
|
629
|
-
custom_tags=dict([("autogen", True)]),
|
630
|
-
)
|
631
615
|
def transform(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[DataFrame, pd.DataFrame]:
|
632
616
|
"""Method not supported for this class.
|
633
617
|
|
@@ -686,7 +670,8 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
686
670
|
if False:
|
687
671
|
self.fit(dataset)
|
688
672
|
assert self._sklearn_object is not None
|
689
|
-
|
673
|
+
labels : npt.NDArray[Any] = self._sklearn_object.labels_
|
674
|
+
return labels
|
690
675
|
else:
|
691
676
|
raise NotImplementedError
|
692
677
|
|
@@ -722,6 +707,7 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
722
707
|
output_cols = []
|
723
708
|
|
724
709
|
# Make sure column names are valid snowflake identifiers.
|
710
|
+
assert output_cols is not None # Make MyPy happy
|
725
711
|
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
726
712
|
|
727
713
|
return rv
|
@@ -732,11 +718,6 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
732
718
|
subproject=_SUBPROJECT,
|
733
719
|
custom_tags=dict([("autogen", True)]),
|
734
720
|
)
|
735
|
-
@telemetry.add_stmt_params_to_df(
|
736
|
-
project=_PROJECT,
|
737
|
-
subproject=_SUBPROJECT,
|
738
|
-
custom_tags=dict([("autogen", True)]),
|
739
|
-
)
|
740
721
|
def predict_proba(
|
741
722
|
self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "predict_proba_"
|
742
723
|
) -> Union[DataFrame, pd.DataFrame]:
|
@@ -779,11 +760,6 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
779
760
|
subproject=_SUBPROJECT,
|
780
761
|
custom_tags=dict([("autogen", True)]),
|
781
762
|
)
|
782
|
-
@telemetry.add_stmt_params_to_df(
|
783
|
-
project=_PROJECT,
|
784
|
-
subproject=_SUBPROJECT,
|
785
|
-
custom_tags=dict([("autogen", True)]),
|
786
|
-
)
|
787
763
|
def predict_log_proba(
|
788
764
|
self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "predict_log_proba_"
|
789
765
|
) -> Union[DataFrame, pd.DataFrame]:
|
@@ -822,16 +798,6 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
822
798
|
return output_df
|
823
799
|
|
824
800
|
@available_if(original_estimator_has_callable("decision_function")) # type: ignore[misc]
|
825
|
-
@telemetry.send_api_usage_telemetry(
|
826
|
-
project=_PROJECT,
|
827
|
-
subproject=_SUBPROJECT,
|
828
|
-
custom_tags=dict([("autogen", True)]),
|
829
|
-
)
|
830
|
-
@telemetry.add_stmt_params_to_df(
|
831
|
-
project=_PROJECT,
|
832
|
-
subproject=_SUBPROJECT,
|
833
|
-
custom_tags=dict([("autogen", True)]),
|
834
|
-
)
|
835
801
|
def decision_function(
|
836
802
|
self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "decision_function_"
|
837
803
|
) -> Union[DataFrame, pd.DataFrame]:
|
@@ -932,11 +898,6 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
932
898
|
subproject=_SUBPROJECT,
|
933
899
|
custom_tags=dict([("autogen", True)]),
|
934
900
|
)
|
935
|
-
@telemetry.add_stmt_params_to_df(
|
936
|
-
project=_PROJECT,
|
937
|
-
subproject=_SUBPROJECT,
|
938
|
-
custom_tags=dict([("autogen", True)]),
|
939
|
-
)
|
940
901
|
def kneighbors(
|
941
902
|
self,
|
942
903
|
dataset: Union[DataFrame, pd.DataFrame],
|
@@ -996,18 +957,28 @@ class BayesianGaussianMixture(BaseTransformer):
|
|
996
957
|
# For classifier, the type of predict is the same as the type of label
|
997
958
|
if self._sklearn_object._estimator_type == 'classifier':
|
998
959
|
# label columns is the desired type for output
|
999
|
-
outputs = _infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True)
|
960
|
+
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1000
961
|
# rename the output columns
|
1001
|
-
outputs = model_signature_utils.rename_features(outputs, self.output_cols)
|
962
|
+
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
963
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
964
|
+
([] if self._drop_input_cols else inputs)
|
965
|
+
+ outputs)
|
966
|
+
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
967
|
+
# For outlier models, returns -1 for outliers and 1 for inliers.
|
968
|
+
# Clusterer returns int64 cluster labels.
|
969
|
+
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
970
|
+
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1002
971
|
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1003
972
|
([] if self._drop_input_cols else inputs)
|
1004
973
|
+ outputs)
|
974
|
+
|
1005
975
|
# For regressor, the type of predict is float64
|
1006
976
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1007
977
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1008
978
|
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1009
979
|
([] if self._drop_input_cols else inputs)
|
1010
980
|
+ outputs)
|
981
|
+
|
1011
982
|
for prob_func in PROB_FUNCTIONS:
|
1012
983
|
if hasattr(self, prob_func):
|
1013
984
|
output_cols_prefix: str = f"{prob_func}_"
|