PyPI - snowflake-ml-python - Versions diffs - 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl - Mend

snowflake-ml-python 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (225)

snowflake/cortex/_complete.py +1 -1
snowflake/cortex/_extract_answer.py +1 -1
snowflake/cortex/_sentiment.py +1 -1
snowflake/cortex/_summarize.py +1 -1
snowflake/cortex/_translate.py +1 -1
snowflake/ml/_internal/env_utils.py +68 -6
snowflake/ml/_internal/file_utils.py +34 -4
snowflake/ml/_internal/telemetry.py +79 -91
snowflake/ml/_internal/utils/identifier.py +78 -72
snowflake/ml/_internal/utils/retryable_http.py +16 -4
snowflake/ml/_internal/utils/spcs_attribution_utils.py +122 -0
snowflake/ml/dataset/dataset.py +1 -1
snowflake/ml/model/_api.py +21 -14
snowflake/ml/model/_client/model/model_impl.py +176 -0
snowflake/ml/model/_client/model/model_method_info.py +19 -0
snowflake/ml/model/_client/model/model_version_impl.py +291 -0
snowflake/ml/model/_client/ops/metadata_ops.py +107 -0
snowflake/ml/model/_client/ops/model_ops.py +308 -0
snowflake/ml/model/_client/sql/model.py +75 -0
snowflake/ml/model/_client/sql/model_version.py +213 -0
snowflake/ml/model/_client/sql/stage.py +40 -0
snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -4
snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template +24 -8
snowflake/ml/model/_deploy_client/image_builds/templates/kaniko_shell_script_template +23 -0
snowflake/ml/model/_deploy_client/snowservice/deploy.py +14 -2
snowflake/ml/model/_deploy_client/utils/constants.py +1 -0
snowflake/ml/model/_deploy_client/warehouse/deploy.py +2 -2
snowflake/ml/model/_model_composer/model_composer.py +31 -9
snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +25 -10
snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +2 -2
snowflake/ml/model/_model_composer/model_method/infer_function.py_template +2 -1
snowflake/ml/model/_model_composer/model_method/model_method.py +34 -3
snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +1 -1
snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +3 -1
snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +10 -28
snowflake/ml/model/_packager/model_meta/model_meta.py +18 -16
snowflake/ml/model/_signatures/snowpark_handler.py +1 -1
snowflake/ml/model/model_signature.py +108 -53
snowflake/ml/model/type_hints.py +1 -0
snowflake/ml/modeling/_internal/distributed_hpo_trainer.py +554 -0
snowflake/ml/modeling/_internal/estimator_protocols.py +1 -60
snowflake/ml/modeling/_internal/model_specifications.py +146 -0
snowflake/ml/modeling/_internal/model_trainer.py +13 -0
snowflake/ml/modeling/_internal/model_trainer_builder.py +78 -0
snowflake/ml/modeling/_internal/pandas_trainer.py +54 -0
snowflake/ml/modeling/_internal/snowpark_handlers.py +6 -760
snowflake/ml/modeling/_internal/snowpark_trainer.py +331 -0
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +108 -135
snowflake/ml/modeling/cluster/affinity_propagation.py +106 -135
snowflake/ml/modeling/cluster/agglomerative_clustering.py +106 -135
snowflake/ml/modeling/cluster/birch.py +106 -135
snowflake/ml/modeling/cluster/bisecting_k_means.py +106 -135
snowflake/ml/modeling/cluster/dbscan.py +106 -135
snowflake/ml/modeling/cluster/feature_agglomeration.py +106 -135
snowflake/ml/modeling/cluster/k_means.py +105 -135
snowflake/ml/modeling/cluster/mean_shift.py +106 -135
snowflake/ml/modeling/cluster/mini_batch_k_means.py +105 -135
snowflake/ml/modeling/cluster/optics.py +106 -135
snowflake/ml/modeling/cluster/spectral_biclustering.py +106 -135
snowflake/ml/modeling/cluster/spectral_clustering.py +106 -135
snowflake/ml/modeling/cluster/spectral_coclustering.py +106 -135
snowflake/ml/modeling/compose/column_transformer.py +106 -135
snowflake/ml/modeling/compose/transformed_target_regressor.py +108 -135
snowflake/ml/modeling/covariance/elliptic_envelope.py +106 -135
snowflake/ml/modeling/covariance/empirical_covariance.py +99 -128
snowflake/ml/modeling/covariance/graphical_lasso.py +106 -135
snowflake/ml/modeling/covariance/graphical_lasso_cv.py +106 -135
snowflake/ml/modeling/covariance/ledoit_wolf.py +104 -133
snowflake/ml/modeling/covariance/min_cov_det.py +106 -135
snowflake/ml/modeling/covariance/oas.py +99 -128
snowflake/ml/modeling/covariance/shrunk_covariance.py +103 -132
snowflake/ml/modeling/decomposition/dictionary_learning.py +106 -135
snowflake/ml/modeling/decomposition/factor_analysis.py +106 -135
snowflake/ml/modeling/decomposition/fast_ica.py +106 -135
snowflake/ml/modeling/decomposition/incremental_pca.py +106 -135
snowflake/ml/modeling/decomposition/kernel_pca.py +106 -135
snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +106 -135
snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +106 -135
snowflake/ml/modeling/decomposition/pca.py +106 -135
snowflake/ml/modeling/decomposition/sparse_pca.py +106 -135
snowflake/ml/modeling/decomposition/truncated_svd.py +106 -135
snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +108 -135
snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +108 -135
snowflake/ml/modeling/ensemble/ada_boost_classifier.py +108 -135
snowflake/ml/modeling/ensemble/ada_boost_regressor.py +108 -135
snowflake/ml/modeling/ensemble/bagging_classifier.py +108 -135
snowflake/ml/modeling/ensemble/bagging_regressor.py +108 -135
snowflake/ml/modeling/ensemble/extra_trees_classifier.py +108 -135
snowflake/ml/modeling/ensemble/extra_trees_regressor.py +108 -135
snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +108 -135
snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +108 -135
snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +108 -135
snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +108 -135
snowflake/ml/modeling/ensemble/isolation_forest.py +106 -135
snowflake/ml/modeling/ensemble/random_forest_classifier.py +108 -135
snowflake/ml/modeling/ensemble/random_forest_regressor.py +108 -135
snowflake/ml/modeling/ensemble/stacking_regressor.py +108 -135
snowflake/ml/modeling/ensemble/voting_classifier.py +108 -135
snowflake/ml/modeling/ensemble/voting_regressor.py +108 -135
snowflake/ml/modeling/feature_selection/generic_univariate_select.py +101 -128
snowflake/ml/modeling/feature_selection/select_fdr.py +99 -126
snowflake/ml/modeling/feature_selection/select_fpr.py +99 -126
snowflake/ml/modeling/feature_selection/select_fwe.py +99 -126
snowflake/ml/modeling/feature_selection/select_k_best.py +100 -127
snowflake/ml/modeling/feature_selection/select_percentile.py +99 -126
snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +106 -135
snowflake/ml/modeling/feature_selection/variance_threshold.py +95 -124
snowflake/ml/modeling/framework/base.py +83 -1
snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +108 -135
snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +108 -135
snowflake/ml/modeling/impute/iterative_imputer.py +106 -135
snowflake/ml/modeling/impute/knn_imputer.py +106 -135
snowflake/ml/modeling/impute/missing_indicator.py +106 -135
snowflake/ml/modeling/impute/simple_imputer.py +9 -1
snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +96 -125
snowflake/ml/modeling/kernel_approximation/nystroem.py +106 -135
snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +106 -135
snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +105 -134
snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +103 -132
snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +108 -135
snowflake/ml/modeling/lightgbm/lgbm_classifier.py +90 -118
snowflake/ml/modeling/lightgbm/lgbm_regressor.py +90 -118
snowflake/ml/modeling/linear_model/ard_regression.py +108 -135
snowflake/ml/modeling/linear_model/bayesian_ridge.py +108 -135
snowflake/ml/modeling/linear_model/elastic_net.py +108 -135
snowflake/ml/modeling/linear_model/elastic_net_cv.py +108 -135
snowflake/ml/modeling/linear_model/gamma_regressor.py +108 -135
snowflake/ml/modeling/linear_model/huber_regressor.py +108 -135
snowflake/ml/modeling/linear_model/lars.py +108 -135
snowflake/ml/modeling/linear_model/lars_cv.py +108 -135
snowflake/ml/modeling/linear_model/lasso.py +108 -135
snowflake/ml/modeling/linear_model/lasso_cv.py +108 -135
snowflake/ml/modeling/linear_model/lasso_lars.py +108 -135
snowflake/ml/modeling/linear_model/lasso_lars_cv.py +108 -135
snowflake/ml/modeling/linear_model/lasso_lars_ic.py +108 -135
snowflake/ml/modeling/linear_model/linear_regression.py +108 -135
snowflake/ml/modeling/linear_model/logistic_regression.py +108 -135
snowflake/ml/modeling/linear_model/logistic_regression_cv.py +108 -135
snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +108 -135
snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +108 -135
snowflake/ml/modeling/linear_model/multi_task_lasso.py +108 -135
snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +108 -135
snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +108 -135
snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +108 -135
snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +107 -135
snowflake/ml/modeling/linear_model/perceptron.py +107 -135
snowflake/ml/modeling/linear_model/poisson_regressor.py +108 -135
snowflake/ml/modeling/linear_model/ransac_regressor.py +108 -135
snowflake/ml/modeling/linear_model/ridge.py +108 -135
snowflake/ml/modeling/linear_model/ridge_classifier.py +108 -135
snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +108 -135
snowflake/ml/modeling/linear_model/ridge_cv.py +108 -135
snowflake/ml/modeling/linear_model/sgd_classifier.py +108 -135
snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +106 -135
snowflake/ml/modeling/linear_model/sgd_regressor.py +108 -135
snowflake/ml/modeling/linear_model/theil_sen_regressor.py +108 -135
snowflake/ml/modeling/linear_model/tweedie_regressor.py +108 -135
snowflake/ml/modeling/manifold/isomap.py +106 -135
snowflake/ml/modeling/manifold/mds.py +106 -135
snowflake/ml/modeling/manifold/spectral_embedding.py +106 -135
snowflake/ml/modeling/manifold/tsne.py +106 -135
snowflake/ml/modeling/metrics/classification.py +196 -55
snowflake/ml/modeling/metrics/correlation.py +4 -2
snowflake/ml/modeling/metrics/covariance.py +7 -4
snowflake/ml/modeling/metrics/ranking.py +32 -16
snowflake/ml/modeling/metrics/regression.py +60 -32
snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +106 -135
snowflake/ml/modeling/mixture/gaussian_mixture.py +106 -135
snowflake/ml/modeling/model_selection/grid_search_cv.py +91 -148
snowflake/ml/modeling/model_selection/randomized_search_cv.py +93 -154
snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +105 -132
snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +108 -135
snowflake/ml/modeling/multiclass/output_code_classifier.py +108 -135
snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +108 -135
snowflake/ml/modeling/naive_bayes/categorical_nb.py +108 -135
snowflake/ml/modeling/naive_bayes/complement_nb.py +108 -135
snowflake/ml/modeling/naive_bayes/gaussian_nb.py +98 -125
snowflake/ml/modeling/naive_bayes/multinomial_nb.py +107 -134
snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +108 -135
snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +108 -135
snowflake/ml/modeling/neighbors/kernel_density.py +106 -135
snowflake/ml/modeling/neighbors/local_outlier_factor.py +106 -135
snowflake/ml/modeling/neighbors/nearest_centroid.py +108 -135
snowflake/ml/modeling/neighbors/nearest_neighbors.py +106 -135
snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +108 -135
snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +108 -135
snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +108 -135
snowflake/ml/modeling/neural_network/bernoulli_rbm.py +106 -135
snowflake/ml/modeling/neural_network/mlp_classifier.py +108 -135
snowflake/ml/modeling/neural_network/mlp_regressor.py +108 -135
snowflake/ml/modeling/parameters/disable_distributed_hpo.py +2 -6
snowflake/ml/modeling/preprocessing/binarizer.py +25 -8
snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +9 -4
snowflake/ml/modeling/preprocessing/label_encoder.py +31 -11
snowflake/ml/modeling/preprocessing/max_abs_scaler.py +27 -9
snowflake/ml/modeling/preprocessing/min_max_scaler.py +42 -14
snowflake/ml/modeling/preprocessing/normalizer.py +9 -4
snowflake/ml/modeling/preprocessing/one_hot_encoder.py +26 -10
snowflake/ml/modeling/preprocessing/ordinal_encoder.py +37 -13
snowflake/ml/modeling/preprocessing/polynomial_features.py +106 -135
snowflake/ml/modeling/preprocessing/robust_scaler.py +39 -13
snowflake/ml/modeling/preprocessing/standard_scaler.py +36 -12
snowflake/ml/modeling/semi_supervised/label_propagation.py +108 -135
snowflake/ml/modeling/semi_supervised/label_spreading.py +108 -135
snowflake/ml/modeling/svm/linear_svc.py +108 -135
snowflake/ml/modeling/svm/linear_svr.py +108 -135
snowflake/ml/modeling/svm/nu_svc.py +108 -135
snowflake/ml/modeling/svm/nu_svr.py +108 -135
snowflake/ml/modeling/svm/svc.py +108 -135
snowflake/ml/modeling/svm/svr.py +108 -135
snowflake/ml/modeling/tree/decision_tree_classifier.py +108 -135
snowflake/ml/modeling/tree/decision_tree_regressor.py +108 -135
snowflake/ml/modeling/tree/extra_tree_classifier.py +108 -135
snowflake/ml/modeling/tree/extra_tree_regressor.py +108 -135
snowflake/ml/modeling/xgboost/xgb_classifier.py +108 -136
snowflake/ml/modeling/xgboost/xgb_regressor.py +108 -136
snowflake/ml/modeling/xgboost/xgbrf_classifier.py +108 -136
snowflake/ml/modeling/xgboost/xgbrf_regressor.py +108 -136
snowflake/ml/registry/model_registry.py +2 -0
snowflake/ml/registry/registry.py +215 -0
snowflake/ml/version.py +1 -1
{snowflake_ml_python-1.1.0.dist-info → snowflake_ml_python-1.1.2.dist-info}/METADATA +34 -1
snowflake_ml_python-1.1.2.dist-info/RECORD +347 -0
snowflake_ml_python-1.1.0.dist-info/RECORD +0 -331
{snowflake_ml_python-1.1.0.dist-info → snowflake_ml_python-1.1.2.dist-info}/WHEEL +0 -0

snowflake/ml/modeling/metrics/classification.py CHANGED Viewed

@@ -5,7 +5,6 @@ import warnings
 from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
 import cloudpickle
-import numpy
 import numpy as np
 import numpy.typing as npt
 from sklearn import exceptions, metrics
@@ -43,12 +42,17 @@ def accuracy_score(
     corresponding set of labels in the y true columns.
     Args:
-        df: Input dataframe.
-        y_true_col_names: Column name(s) representing actual values.
-        y_pred_col_names: Column name(s) representing predicted values.
-        normalize: If ``False``, return the number of correctly classified samples.
+        df: snowpark.DataFrame
+            Input dataframe.
+        y_true_col_names: string or list of strings
+            Column name(s) representing actual values.
+        y_pred_col_names: string or list of strings
+            Column name(s) representing predicted values.
+        normalize: boolean, default=True
+            If ``False``, return the number of correctly classified samples.
             Otherwise, return the fraction of correctly classified samples.
-        sample_weight_col_name: Column name representing sample weights.
+        sample_weight_col_name: string, default=None
+            Column name representing sample weights.
     Returns:
         If ``normalize == True``, return the fraction of correctly
@@ -102,14 +106,19 @@ def confusion_matrix(
     :math:`C_{1,1}` and false positives is :math:`C_{0,1}`.
     Args:
-        df: Input dataframe.
-        y_true_col_name: Column name representing actual values.
-        y_pred_col_name: Column name representing predicted values.
-        labels: List of labels to index the matrix. This may be used to
+        df: snowpark.DataFrame
+            Input dataframe.
+        y_true_col_name: string or list of strings
+            Column name representing actual values.
+        y_pred_col_name: string or list of strings
+            Column name representing predicted values.
+        labels: list of labels, default=None
+            List of labels to index the matrix. This may be used to
             reorder or select a subset of labels.
             If ``None`` is given, those that appear at least once in the
             y true or y pred column are used in sorted order.
-        sample_weight_col_name: Column name representing sample weights.
+        sample_weight_col_name: string, default=None
+            Column name representing sample weights.
         normalize: {'true', 'pred', 'all'}, default=None
             Normalizes confusion matrix over the true (rows), predicted (columns)
             conditions or all the population. If None, confusion matrix will not be
@@ -124,7 +133,9 @@ def confusion_matrix(
     Raises:
         ValueError: The given ``labels`` is empty.
         ValueError: No label specified in the given ``labels`` is in the y true column.
         ValueError: ``normalize`` is not one of {'true', 'pred', 'all', None}.
     """
     assert df._session is not None
@@ -252,7 +263,7 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
             self._batched_rows[self._cur_count, :] = input_row
             self._cur_count += 1
-            # 2. Compute incremental sum and dot_prod for the batch.
+            # 2. Compute incremental confusion matrix for the batch.
             if self._cur_count >= self.BATCH_SIZE:
                 self.update_confusion_matrix()
                 self._cur_count = 0
@@ -265,10 +276,16 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
                 yield cloudpickle.dumps(self._confusion_matrix[i, :]), "row_" + str(i)
         def update_confusion_matrix(self) -> None:
+            # Update the confusion matrix by adding values from the 1st column of the batched rows to specific
+            # locations in the confusion matrix determined by row and column indices from the 2nd and 3rd columns of
+            # the batched rows.
             np.add.at(
                 self._confusion_matrix,
-                (self._batched_rows[:, 1].astype(int), self._batched_rows[:, 2].astype(int)),
-                self._batched_rows[:, 0],
+                (
+                    self._batched_rows[: self._cur_count][:, 1].astype(int),
+                    self._batched_rows[: self._cur_count][:, 2].astype(int),
+                ),
+                self._batched_rows[: self._cur_count][:, 0],
             )
     confusion_matrix_computer = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.TABLE_FUNCTION)
@@ -317,17 +334,22 @@ def f1_score(
     parameter.
     Args:
-        df: Input dataframe.
-        y_true_col_names: Column name(s) representing actual values.
-        y_pred_col_names: Column name(s) representing predicted values.
-        labels: The set of labels to include when ``average != 'binary'``, and
+        df: snowpark.DataFrame
+            Input dataframe.
+        y_true_col_names: string or list of strings
+            Column name(s) representing actual values.
+        y_pred_col_names: string or list of strings
+            Column name(s) representing predicted values.
+        labels: list of labels, default=None
+            The set of labels to include when ``average != 'binary'``, and
             their order if ``average is None``. Labels present in the data can be
             excluded, for example to calculate a multiclass average ignoring a
             majority negative class, while labels not present in the data will
             result in 0 components in a macro average. For multilabel targets,
             labels are column indices. By default, all labels in the y true and
             y pred columns are used in sorted order.
-        pos_label: The class to report if ``average='binary'`` and the data is
+        pos_label:  string or integer, default=1
+            The class to report if ``average='binary'`` and the data is
             binary. If the data are multiclass or multilabel, this will be ignored;
             setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
             scores for that label only.
@@ -353,7 +375,8 @@ def f1_score(
                 Calculate metrics for each instance, and find their average (only
                 meaningful for multilabel classification where this differs from
                 func`accuracy_score`).
-        sample_weight_col_name: Column name representing sample weights.
+        sample_weight_col_name: string, default=None
+            Column name representing sample weights.
         zero_division: "warn", 0 or 1, default="warn"
             Sets the value to return when there is a zero division, i.e. when all
             predictions and labels are negative. If set to "warn", this acts as 0,
@@ -402,18 +425,24 @@ def fbeta_score(
     only recall).
     Args:
-        df: Input dataframe.
-        y_true_col_names: Column name(s) representing actual values.
-        y_pred_col_names: Column name(s) representing predicted values.
-        beta: Determines the weight of recall in the combined score.
-        labels: The set of labels to include when ``average != 'binary'``, and
+        df: snowpark.DataFrame
+            Input dataframe.
+        y_true_col_names: string or list of strings
+            Column name(s) representing actual values.
+        y_pred_col_names: string or list of strings
+            Column name(s) representing predicted values.
+        beta: float
+            Determines the weight of recall in the combined score.
+        labels: list of labels, default=None
+            The set of labels to include when ``average != 'binary'``, and
             their order if ``average is None``. Labels present in the data can be
             excluded, for example to calculate a multiclass average ignoring a
             majority negative class, while labels not present in the data will
             result in 0 components in a macro average. For multilabel targets,
             labels are column indices. By default, all labels in the y true and
             y pred columns are used in sorted order.
-        pos_label: The class to report if ``average='binary'`` and the data is
+        pos_label: string or integer, default=1
+            The class to report if ``average='binary'`` and the data is
             binary. If the data are multiclass or multilabel, this will be ignored;
             setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
             scores for that label only.
@@ -439,7 +468,8 @@ def fbeta_score(
                 Calculate metrics for each instance, and find their average (only
                 meaningful for multilabel classification where this differs from
                 func`accuracy_score`).
-        sample_weight_col_name: Column name representing sample weights.
+        sample_weight_col_name: string, default=None
+            Column name representing sample weights.
         zero_division: "warn", 0 or 1, default="warn"
             Sets the value to return when there is a zero division, i.e. when all
             predictions and labels are negative. If set to "warn", this acts as 0,
@@ -492,9 +522,12 @@ def log_loss(
         L_{\log}(y, p) = -(y \log (p) + (1 - y) \log (1 - p))
     Args:
-        df: Input dataframe.
-        y_true_col_names: Column name(s) representing actual values.
-        y_pred_col_names: Column name(s) representing predicted probabilities,
+        df: snowpark.DataFrame
+            Input dataframe.
+        y_true_col_names: string or list of strings
+            Column name(s) representing actual values.
+        y_pred_col_names: string or list of strings
+            Column name(s) representing predicted probabilities,
             as returned by a classifier's predict_proba method.
             If ``y_pred.shape = (n_samples,)`` the probabilities provided are
             assumed to be that of the positive class. The labels in ``y_pred``
@@ -503,10 +536,13 @@ def log_loss(
             Log loss is undefined for p=0 or p=1, so probabilities are
             clipped to `max(eps, min(1 - eps, p))`. The default will depend on the
             data type of `y_pred` and is set to `np.finfo(y_pred.dtype).eps`.
-        normalize: If true, return the mean loss per sample.
+        normalize: boolean, default=True
+            If true, return the mean loss per sample.
             Otherwise, return the sum of the per-sample losses.
-        sample_weight_col_name: Column name representing sample weights.
-        labels: If not provided, labels will be inferred from y_true. If ``labels``
+        sample_weight_col_name: string, default=None
+            Column name representing sample weights.
+        labels: list of labels, default=None
+            If not provided, labels will be inferred from y_true. If ``labels``
             is ``None`` and ``y_pred`` has shape (n_samples,) the labels are
             assumed to be binary and are inferred from ``y_true``.
@@ -691,18 +727,24 @@ def precision_recall_fscore_support(
     is one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``.
     Args:
-        df: Input dataframe.
-        y_true_col_names: Column name(s) representing actual values.
-        y_pred_col_names: Column name(s) representing predicted values.
-        beta: The strength of recall versus precision in the F-score.
-        labels: The set of labels to include when ``average != 'binary'``, and
+        df: snowpark.DataFrame
+            Input dataframe.
+        y_true_col_names: string or list of strings
+            Column name(s) representing actual values.
+        y_pred_col_names: string or list of strings
+            Column name(s) representing predicted values.
+        beta: float, default=1.0
+            The strength of recall versus precision in the F-score.
+        labels: list of labels, default=None
+            The set of labels to include when ``average != 'binary'``, and
             their order if ``average is None``. Labels present in the data can be
             excluded, for example to calculate a multiclass average ignoring a
             majority negative class, while labels not present in the data will
             result in 0 components in a macro average. For multilabel targets,
             labels are column indices. By default, all labels in the y true and
             y pred columns are used in sorted order.
-        pos_label: The class to report if ``average='binary'`` and the data is
+        pos_label: string or integer, default=1
+            The class to report if ``average='binary'`` and the data is
             binary. If the data are multiclass or multilabel, this will be ignored;
             setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
             scores for that label only.
@@ -727,9 +769,11 @@ def precision_recall_fscore_support(
                 Calculate metrics for each instance, and find their average (only
                 meaningful for multilabel classification where this differs from
                 :func:`accuracy_score`).
-        warn_for: This determines which warnings will be made in the case that this
+        warn_for: tuple or set containing "precision", "recall", or "f-score"
+            This determines which warnings will be made in the case that this
             function is being used to return only one of its metrics.
-        sample_weight_col_name: Column name representing sample weights.
+        sample_weight_col_name: string, default=None
+            Column name representing sample weights.
         zero_division: "warn", 0 or 1, default="warn"
             Sets the value to return when there is a zero division:
                * recall - when there are no positive labels
@@ -974,6 +1018,78 @@ def _register_multilabel_confusion_matrix_computer(
     return multilabel_confusion_matrix_computer
+def _binary_precision_score(
+    *,
+    df: snowpark.DataFrame,
+    y_true_col_names: Union[str, List[str]],
+    y_pred_col_names: Union[str, List[str]],
+    pos_label: Union[str, int] = 1,
+    sample_weight_col_name: Optional[str] = None,
+    zero_division: Union[str, int] = "warn",
+) -> Union[float, npt.NDArray[np.float_]]:
+    statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT)
+    if isinstance(y_true_col_names, str):
+        y_true_col_names = [y_true_col_names]
+    if isinstance(y_pred_col_names, str):
+        y_pred_col_names = [y_pred_col_names]
+    if len(y_pred_col_names) != len(y_true_col_names):
+        raise ValueError(
+            "precision_score: `y_true_col_names` and `y_pred_column_names` must be lists of the same length "
+            "or both strings."
+        )
+    # Confirm that the data is binary.
+    labels_set = set()
+    columns = y_true_col_names + y_pred_col_names
+    column_labels_lists = df.select(*[F.array_unique_agg(col) for col in columns]).collect(
+        statement_params=statement_params
+    )[0]
+    for column_labels_list in column_labels_lists:
+        for column_label in json.loads(column_labels_list):
+            labels_set.add(column_label)
+    labels = sorted(list(labels_set))
+    _ = _check_binary_labels(labels, pos_label=pos_label)
+    sample_weight_column = df[sample_weight_col_name] if sample_weight_col_name else None
+    scores = []
+    for y_true, y_pred in zip(y_true_col_names, y_pred_col_names):
+        tp_col = F.iff((F.col(y_true) == pos_label) & (F.col(y_pred) == pos_label), 1, 0)
+        fp_col = F.iff((F.col(y_true) != pos_label) & (F.col(y_pred) == pos_label), 1, 0)
+        tp = metrics_utils.weighted_sum(
+            df=df,
+            sample_score_column=tp_col,
+            sample_weight_column=sample_weight_column,
+            statement_params=statement_params,
+        )
+        fp = metrics_utils.weighted_sum(
+            df=df,
+            sample_score_column=fp_col,
+            sample_weight_column=sample_weight_column,
+            statement_params=statement_params,
+        )
+        try:
+            score = tp / (tp + fp)
+        except ZeroDivisionError:
+            if zero_division == "warn":
+                msg = "precision_score: division by zero: score value will be 0."
+                warnings.warn(msg, exceptions.UndefinedMetricWarning, stacklevel=2)
+                score = 0.0
+            else:
+                score = float(zero_division)
+        scores.append(score)
+    if len(scores) == 1:
+        return scores[0]
+    return np.array(scores)
 @telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT)
 def precision_score(
     *,
@@ -997,17 +1113,22 @@ def precision_score(
     The best value is 1 and the worst value is 0.
     Args:
-        df: Input dataframe.
-        y_true_col_names: Column name(s) representing actual values.
-        y_pred_col_names: Column name(s) representing predicted values.
-        labels: The set of labels to include when ``average != 'binary'``, and
+        df: snowpark.DataFrame
+            Input dataframe.
+        y_true_col_names: string or list of strings
+            Column name(s) representing actual values.
+        y_pred_col_names: string or list of strings
+            Column name(s) representing predicted values.
+        labels: list of labels, default=None
+            The set of labels to include when ``average != 'binary'``, and
             their order if ``average is None``. Labels present in the data can be
             excluded, for example to calculate a multiclass average ignoring a
             majority negative class, while labels not present in the data will
             result in 0 components in a macro average. For multilabel targets,
             labels are column indices. By default, all labels in the y true and
             y pred columns are used in sorted order.
-        pos_label: The class to report if ``average='binary'`` and the data is
+        pos_label: string or integer, default=1
+            The class to report if ``average='binary'`` and the data is
             binary. If the data are multiclass or multilabel, this will be ignored;
             setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
             scores for that label only.
@@ -1032,7 +1153,8 @@ def precision_score(
                 Calculate metrics for each instance, and find their average (only
                 meaningful for multilabel classification where this differs from
                 func`accuracy_score`).
-        sample_weight_col_name: Column name representing sample weights.
+        sample_weight_col_name: string, default=None
+            Column name representing sample weights.
         zero_division: "warn", 0 or 1, default="warn"
             Sets the value to return when there is a zero division. If set to
             "warn", this acts as 0, but warnings are also raised.
@@ -1042,6 +1164,16 @@ def precision_score(
             Precision of the positive class in binary classification or weighted
             average of the precision of each class for the multiclass task.
     """
+    if average == "binary":
+        return _binary_precision_score(
+            df=df,
+            y_true_col_names=y_true_col_names,
+            y_pred_col_names=y_pred_col_names,
+            pos_label=pos_label,
+            sample_weight_col_name=sample_weight_col_name,
+            zero_division=zero_division,
+        )
     p, _, _, _ = precision_recall_fscore_support(
         df=df,
         y_true_col_names=y_true_col_names,
@@ -1078,17 +1210,22 @@ def recall_score(
     The best value is 1 and the worst value is 0.
     Args:
-        df: Input dataframe.
-        y_true_col_names: Column name(s) representing actual values.
-        y_pred_col_names: Column name(s) representing predicted values.
-        labels: The set of labels to include when ``average != 'binary'``, and
+        df: snowpark.DataFrame
+            Input dataframe.
+        y_true_col_names: string or list of strings
+            Column name(s) representing actual values.
+        y_pred_col_names: string or list of strings
+            Column name(s) representing predicted values.
+        labels: list of labels, default=None
+            The set of labels to include when ``average != 'binary'``, and
             their order if ``average is None``. Labels present in the data can be
             excluded, for example to calculate a multiclass average ignoring a
             majority negative class, while labels not present in the data will
             result in 0 components in a macro average. For multilabel targets,
             labels are column indices. By default, all labels in the y true and
             y pred columns are used in sorted order.
-        pos_label: The class to report if ``average='binary'`` and the data is
+        pos_label: string or integer, default=1
+            The class to report if ``average='binary'`` and the data is
             binary. If the data are multiclass or multilabel, this will be ignored;
             setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
             scores for that label only.
@@ -1115,7 +1252,8 @@ def recall_score(
                 Calculate metrics for each instance, and find their average (only
                 meaningful for multilabel classification where this differs from
                 func`accuracy_score`).
-        sample_weight_col_name: Column name representing sample weights.
+        sample_weight_col_name: string, default=None
+            Column name representing sample weights.
         zero_division: "warn", 0 or 1, default="warn"
             Sets the value to return when there is a zero division. If set to
             "warn", this acts as 0, but warnings are also raised.
@@ -1184,10 +1322,13 @@ def _check_binary_labels(
     """
     if len(labels) <= 2:
         if len(labels) == 2 and pos_label not in labels:
-            raise ValueError(f"pos_label={pos_label} is not a valid label. It should be one of {labels}")
+            raise ValueError(f"pos_label={pos_label} is not a valid label. It must be one of {labels}")
         labels = [pos_label]
     else:
-        raise ValueError("Please choose another average setting.")
+        raise ValueError(
+            "Cannot compute precision score with binary average: there are more than two labels present."
+            "Please choose another average setting."
+        )
     return labels

snowflake/ml/modeling/metrics/correlation.py CHANGED Viewed

@@ -36,8 +36,10 @@ def correlation(*, df: snowpark.DataFrame, columns: Optional[Collection[str]] =
     as a post-processing step.
     Args:
-        df (snowpark.DataFrame): Snowpark Dataframe for which correlation matrix has to be computed.
-        columns (Optional[Collection[str]]): List of column names for which the correlation matrix has to be computed.
+        df: snowpark.DataFrame
+            Snowpark Dataframe for which correlation matrix has to be computed.
+        columns: List of strings
+            List of column names for which the correlation matrix has to be computed.
             If None, correlation matrix is computed for all numeric columns in the snowpark dataframe.
     Returns:

snowflake/ml/modeling/metrics/covariance.py CHANGED Viewed

@@ -36,11 +36,14 @@ def covariance(*, df: DataFrame, columns: Optional[Collection[str]] = None, ddof
     as a post-processing step.
     Args:
-        df (DataFrame): Snowpark Dataframe for which covariance matrix has to be computed.
-        columns (Optional[Collection[str]]): List of column names for which the covariance matrix has to be computed.
+        df: snowpark.DataFrame
+            Snowpark Dataframe for which covariance matrix has to be computed.
+        columns: list of strings, default=None
+            List of column names for which the covariance matrix has to be computed.
             If None, covariance matrix is computed for all numeric columns in the snowpark dataframe.
-        ddof (int): default 1. Delta degrees of freedom.
-            The divisor used in calculations is N - ddof, where N represents the number of rows.
+        ddof: int, default=1
+            Delta degrees of freedom. The divisor used in calculations is N - ddof, where N represents the
+            number of rows.
     Returns:
         Covariance matrix in pandas.DataFrame format.

snowflake/ml/modeling/metrics/ranking.py CHANGED Viewed

@@ -49,18 +49,23 @@ def precision_recall_curve(
     which corresponds to a classifier that always predicts the positive class.
     Args:
-        df: Input dataframe.
-        y_true_col_name: Column name representing true binary labels.
+        df: snowpark.DataFrame
+            Input dataframe.
+        y_true_col_name: string
+            Column name representing true binary labels.
             If labels are not either {-1, 1} or {0, 1}, then pos_label should be
             explicitly given.
-        probas_pred_col_name: Column name representing target scores.
+        probas_pred_col_name: string
+            Column name representing target scores.
             Can either be probability estimates of the positive
             class, or non-thresholded measure of decisions (as returned by
             `decision_function` on some classifiers).
-        pos_label: The label of the positive class.
+        pos_label: string or int, default=None
+            The label of the positive class.
             When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},
             ``pos_label`` is set to 1, otherwise an error will be raised.
-        sample_weight_col_name: Column name representing sample weights.
+        sample_weight_col_name: string, default=None
+            Column name representing sample weights.
     Returns:
         Tuple containing following items
@@ -142,12 +147,15 @@ def roc_auc_score(
     multilabel classification, but some restrictions apply.
     Args:
-        df: Input dataframe.
-        y_true_col_names: Column name(s) representing true labels or binary label indicators.
+        df: snowpark.DataFrame
+            Input dataframe.
+        y_true_col_names: string or list of strings
+            Column name(s) representing true labels or binary label indicators.
             The binary and multiclass cases expect labels with shape (n_samples,)
             while the multilabel case expects binary label indicators with shape
             (n_samples, n_classes).
-        y_score_col_names: Column name(s) representing target scores.
+        y_score_col_names: string or list of strings
+            Column name(s) representing target scores.
             * In the binary case, it corresponds to an array of shape
               `(n_samples,)`. Both probability estimates and non-thresholded
               decision values can be provided. The probability estimates correspond
@@ -186,7 +194,8 @@ def roc_auc_score(
             ``'samples'``
                 Calculate metrics for each instance, and find their average.
             Will be ignored when ``y_true`` is binary.
-        sample_weight_col_name: Column name representing sample weights.
+        sample_weight_col_name: string, default=None
+            Column name representing sample weights.
         max_fpr: float > 0 and <= 1, default=None
             If not ``None``, the standardized partial AUC [2]_ over the range
             [0, max_fpr] is returned. For the multiclass case, ``max_fpr``,
@@ -208,7 +217,8 @@ def roc_auc_score(
                 possible pairwise combinations of classes [5]_.
                 Insensitive to class imbalance when
                 ``average == 'macro'``.
-        labels: Only used for multiclass targets. List of labels that index the
+        labels: list of labels, default=None
+            Only used for multiclass targets. List of labels that index the
             classes in ``y_score``. If ``None``, the numerical or lexicographical
             order of the labels in ``y_true`` is used.
@@ -282,19 +292,25 @@ def roc_curve(
     Note: this implementation is restricted to the binary classification task.
     Args:
-        df: Input dataframe.
-        y_true_col_name: Column name representing true binary labels.
+        df: snowpark.DataFrame
+            Input dataframe.
+        y_true_col_name: string
+            Column name representing true binary labels.
             If labels are not either {-1, 1} or {0, 1}, then pos_label should be
             explicitly given.
-        y_score_col_name: Column name representing target scores, can either
+        y_score_col_name: string
+            Column name representing target scores, can either
             be probability estimates of the positive class, confidence values,
             or non-thresholded measure of decisions (as returned by
             "decision_function" on some classifiers).
-        pos_label: The label of the positive class.
+        pos_label: string, default=None
+            The label of the positive class.
             When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1},
             ``pos_label`` is set to 1, otherwise an error will be raised.
-        sample_weight_col_name: Column name representing sample weights.
-        drop_intermediate: Whether to drop some suboptimal thresholds which would
+        sample_weight_col_name: string, default=None
+            Column name representing sample weights.
+        drop_intermediate: boolean, default=True
+            Whether to drop some suboptimal thresholds which would
             not appear on a plotted ROC curve. This is useful in order to create
             lighter ROC curves.

snowflake-ml-python 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

snowflake-ml-python 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl