snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff compares two publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
- snowflake/ml/_internal/file_utils.py +8 -35
- snowflake/ml/_internal/utils/identifier.py +74 -7
- snowflake/ml/model/_core_requirements.py +1 -1
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +5 -26
- snowflake/ml/model/_deploy_client/warehouse/infer_template.py +2 -2
- snowflake/ml/model/_handlers/_base.py +3 -1
- snowflake/ml/model/_handlers/sklearn.py +1 -0
- snowflake/ml/model/_handlers/xgboost.py +1 -1
- snowflake/ml/model/_model.py +24 -19
- snowflake/ml/model/_model_meta.py +24 -15
- snowflake/ml/model/type_hints.py +5 -11
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +28 -17
- snowflake/ml/modeling/cluster/affinity_propagation.py +28 -17
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +28 -17
- snowflake/ml/modeling/cluster/birch.py +28 -17
- snowflake/ml/modeling/cluster/bisecting_k_means.py +28 -17
- snowflake/ml/modeling/cluster/dbscan.py +28 -17
- snowflake/ml/modeling/cluster/feature_agglomeration.py +28 -17
- snowflake/ml/modeling/cluster/k_means.py +28 -17
- snowflake/ml/modeling/cluster/mean_shift.py +28 -17
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +28 -17
- snowflake/ml/modeling/cluster/optics.py +28 -17
- snowflake/ml/modeling/cluster/spectral_biclustering.py +28 -17
- snowflake/ml/modeling/cluster/spectral_clustering.py +28 -17
- snowflake/ml/modeling/cluster/spectral_coclustering.py +28 -17
- snowflake/ml/modeling/compose/column_transformer.py +28 -17
- snowflake/ml/modeling/compose/transformed_target_regressor.py +28 -17
- snowflake/ml/modeling/covariance/elliptic_envelope.py +28 -17
- snowflake/ml/modeling/covariance/empirical_covariance.py +28 -17
- snowflake/ml/modeling/covariance/graphical_lasso.py +28 -17
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +28 -17
- snowflake/ml/modeling/covariance/ledoit_wolf.py +28 -17
- snowflake/ml/modeling/covariance/min_cov_det.py +28 -17
- snowflake/ml/modeling/covariance/oas.py +28 -17
- snowflake/ml/modeling/covariance/shrunk_covariance.py +28 -17
- snowflake/ml/modeling/decomposition/dictionary_learning.py +28 -17
- snowflake/ml/modeling/decomposition/factor_analysis.py +28 -17
- snowflake/ml/modeling/decomposition/fast_ica.py +28 -17
- snowflake/ml/modeling/decomposition/incremental_pca.py +28 -17
- snowflake/ml/modeling/decomposition/kernel_pca.py +28 -17
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +28 -17
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +28 -17
- snowflake/ml/modeling/decomposition/pca.py +28 -17
- snowflake/ml/modeling/decomposition/sparse_pca.py +28 -17
- snowflake/ml/modeling/decomposition/truncated_svd.py +28 -17
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +28 -17
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +28 -17
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +28 -17
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +28 -17
- snowflake/ml/modeling/ensemble/bagging_classifier.py +28 -17
- snowflake/ml/modeling/ensemble/bagging_regressor.py +28 -17
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +28 -17
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +28 -17
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +28 -17
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +28 -17
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +28 -17
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +28 -17
- snowflake/ml/modeling/ensemble/isolation_forest.py +28 -17
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +28 -17
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +28 -17
- snowflake/ml/modeling/ensemble/stacking_regressor.py +28 -17
- snowflake/ml/modeling/ensemble/voting_classifier.py +28 -17
- snowflake/ml/modeling/ensemble/voting_regressor.py +28 -17
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +28 -17
- snowflake/ml/modeling/feature_selection/select_fdr.py +28 -17
- snowflake/ml/modeling/feature_selection/select_fpr.py +28 -17
- snowflake/ml/modeling/feature_selection/select_fwe.py +28 -17
- snowflake/ml/modeling/feature_selection/select_k_best.py +28 -17
- snowflake/ml/modeling/feature_selection/select_percentile.py +28 -17
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +28 -17
- snowflake/ml/modeling/feature_selection/variance_threshold.py +28 -17
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +28 -17
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +28 -17
- snowflake/ml/modeling/impute/iterative_imputer.py +28 -17
- snowflake/ml/modeling/impute/knn_imputer.py +28 -17
- snowflake/ml/modeling/impute/missing_indicator.py +28 -17
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +28 -17
- snowflake/ml/modeling/kernel_approximation/nystroem.py +28 -17
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +28 -17
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +28 -17
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +28 -17
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +28 -17
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +28 -17
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +28 -17
- snowflake/ml/modeling/linear_model/ard_regression.py +28 -17
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +28 -17
- snowflake/ml/modeling/linear_model/elastic_net.py +28 -17
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +28 -17
- snowflake/ml/modeling/linear_model/gamma_regressor.py +28 -17
- snowflake/ml/modeling/linear_model/huber_regressor.py +28 -17
- snowflake/ml/modeling/linear_model/lars.py +28 -17
- snowflake/ml/modeling/linear_model/lars_cv.py +28 -17
- snowflake/ml/modeling/linear_model/lasso.py +28 -17
- snowflake/ml/modeling/linear_model/lasso_cv.py +28 -17
- snowflake/ml/modeling/linear_model/lasso_lars.py +28 -17
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +28 -17
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +28 -17
- snowflake/ml/modeling/linear_model/linear_regression.py +28 -17
- snowflake/ml/modeling/linear_model/logistic_regression.py +28 -17
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +28 -17
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +28 -17
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +28 -17
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +28 -17
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +28 -17
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +28 -17
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +28 -17
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +28 -17
- snowflake/ml/modeling/linear_model/perceptron.py +28 -17
- snowflake/ml/modeling/linear_model/poisson_regressor.py +28 -17
- snowflake/ml/modeling/linear_model/ransac_regressor.py +28 -17
- snowflake/ml/modeling/linear_model/ridge.py +28 -17
- snowflake/ml/modeling/linear_model/ridge_classifier.py +28 -17
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +28 -17
- snowflake/ml/modeling/linear_model/ridge_cv.py +28 -17
- snowflake/ml/modeling/linear_model/sgd_classifier.py +28 -17
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +28 -17
- snowflake/ml/modeling/linear_model/sgd_regressor.py +28 -17
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +28 -17
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +28 -17
- snowflake/ml/modeling/manifold/isomap.py +28 -17
- snowflake/ml/modeling/manifold/mds.py +28 -17
- snowflake/ml/modeling/manifold/spectral_embedding.py +28 -17
- snowflake/ml/modeling/manifold/tsne.py +28 -17
- snowflake/ml/modeling/metrics/classification.py +6 -1
- snowflake/ml/modeling/metrics/regression.py +517 -9
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +28 -17
- snowflake/ml/modeling/mixture/gaussian_mixture.py +28 -17
- snowflake/ml/modeling/model_selection/grid_search_cv.py +28 -17
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +28 -17
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +28 -17
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +28 -17
- snowflake/ml/modeling/multiclass/output_code_classifier.py +28 -17
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +28 -17
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +28 -17
- snowflake/ml/modeling/naive_bayes/complement_nb.py +28 -17
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +28 -17
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +28 -17
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +28 -17
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +28 -17
- snowflake/ml/modeling/neighbors/kernel_density.py +28 -17
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +28 -17
- snowflake/ml/modeling/neighbors/nearest_centroid.py +28 -17
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +28 -17
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +28 -17
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +28 -17
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +28 -17
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +28 -17
- snowflake/ml/modeling/neural_network/mlp_classifier.py +28 -17
- snowflake/ml/modeling/neural_network/mlp_regressor.py +28 -17
- snowflake/ml/modeling/pipeline/pipeline.py +24 -0
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
- snowflake/ml/modeling/preprocessing/polynomial_features.py +28 -17
- snowflake/ml/modeling/semi_supervised/label_propagation.py +28 -17
- snowflake/ml/modeling/semi_supervised/label_spreading.py +28 -17
- snowflake/ml/modeling/svm/linear_svc.py +28 -17
- snowflake/ml/modeling/svm/linear_svr.py +28 -17
- snowflake/ml/modeling/svm/nu_svc.py +28 -17
- snowflake/ml/modeling/svm/nu_svr.py +28 -17
- snowflake/ml/modeling/svm/svc.py +28 -17
- snowflake/ml/modeling/svm/svr.py +28 -17
- snowflake/ml/modeling/tree/decision_tree_classifier.py +28 -17
- snowflake/ml/modeling/tree/decision_tree_regressor.py +28 -17
- snowflake/ml/modeling/tree/extra_tree_classifier.py +28 -17
- snowflake/ml/modeling/tree/extra_tree_regressor.py +28 -17
- snowflake/ml/modeling/xgboost/xgb_classifier.py +28 -17
- snowflake/ml/modeling/xgboost/xgb_regressor.py +28 -17
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +28 -17
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +28 -17
- snowflake/ml/registry/model_registry.py +49 -65
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.2.dist-info}/METADATA +24 -1
- snowflake_ml_python-1.0.2.dist-info/RECORD +246 -0
- snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
- {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.2.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/manifold/tsne.py

```diff
@@ -745,26 +745,37 @@ class TSNE(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
- [17 lines removed; their content is not shown in the source view]
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
```
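The new block resolves each configured input column against the dataset by trying three spellings of the name: as configured, unquoted, and quoted. This matters because Snowflake folds unquoted identifiers to uppercase while quoted identifiers keep their exact case. Below is a minimal standalone sketch of the selection rule, using simplified stand-ins for `identifier.get_unescaped_names` / `identifier.get_escaped_names` (the real helpers implement full Snowflake identifier semantics; these stand-ins are assumptions for illustration only):

```python
# Simplified stand-ins, NOT the real snowflake.ml identifier helpers.
def unescape(name: str) -> str:
    # '"Feature_B"' -> 'Feature_B'; unquoted names fold to uppercase.
    if name.startswith('"') and name.endswith('"'):
        return name[1:-1]
    return name.upper()

def escape(name: str) -> str:
    # 'Feature_B' -> '"Feature_B"'
    return f'"{name}"'

input_cols = ["feature_a", '"Feature_B"']
unquoted_input_cols = [unescape(c) for c in input_cols]       # ['FEATURE_A', 'Feature_B']
quoted_input_cols = [escape(u) for u in unquoted_input_cols]  # ['"FEATURE_A"', '"Feature_B"']

features_in_dataset = {"FEATURE_A", '"Feature_B"'}
columns_to_select = []
for orig, unq, quo in zip(input_cols, unquoted_input_cols, quoted_input_cols):
    # Same priority order as the diff: original spelling first, then the
    # unquoted form, falling back to the quoted form.
    for candidate in (orig, unq, quo):
        if candidate in features_in_dataset:
            columns_to_select.append(candidate)
            break

print(columns_to_select)  # ['FEATURE_A', '"Feature_B"']
```

The same block raises a `ValueError` when a feature the fitted estimator expects cannot be found under any of the three spellings, surfacing quoting mismatches at inference time instead of passing misaligned columns to scikit-learn.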
snowflake/ml/modeling/metrics/classification.py

```diff
@@ -54,7 +54,12 @@ def accuracy_score(
     metrics_utils.check_label_columns(y_true_col_names, y_pred_col_names)
 
     if isinstance(y_true_col_names, str) or (len(y_true_col_names) == 1):
- [1 line removed; its content is not shown in the source view]
+        y_true, y_pred = (
+            (y_true_col_names, y_pred_col_names)
+            if isinstance(y_true_col_names, str)
+            else (y_true_col_names[0], y_pred_col_names[0])
+        )
+        score_column = F.iff(df[y_true] == df[y_pred], 1, 0)  # type: ignore[arg-type]
     # multilabel
     else:
         expr = " and ".join([f"({y_true_col_names[i]} = {y_pred_col_names[i]})" for i in range(len(y_true_col_names))])
```
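The added conditional expression normalizes the two accepted input shapes, a bare column name or a one-element list, into a single pair of scalar names before the `F.iff` comparison column is built. A self-contained sketch of just that normalization:

```python
from typing import List, Tuple, Union

def normalize(
    y_true_col_names: Union[str, List[str]],
    y_pred_col_names: Union[str, List[str]],
) -> Tuple[str, str]:
    # Mirrors the new lines 57-61 above: a plain string passes through
    # unchanged; a one-element list collapses to its only element.
    return (
        (y_true_col_names, y_pred_col_names)
        if isinstance(y_true_col_names, str)
        else (y_true_col_names[0], y_pred_col_names[0])
    )

assert normalize("Y", "Y_PRED") == ("Y", "Y_PRED")
assert normalize(["Y"], ["Y_PRED"]) == ("Y", "Y_PRED")
```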
snowflake/ml/modeling/metrics/regression.py

```diff
@@ -3,19 +3,527 @@
 #
 
 import inspect
+from typing import List, Optional, Union
 
+import cloudpickle
+import numpy as np
+import numpy.typing as npt
+import sklearn
+from packaging import version
+from sklearn import metrics
+
+from snowflake import snowpark
 from snowflake.ml._internal import telemetry
-from snowflake.
+from snowflake.ml.modeling.metrics import metrics_utils
+from snowflake.snowpark import functions as F
+from snowflake.snowpark._internal import utils as snowpark_utils
 
 _PROJECT = "ModelDevelopment"
 _SUBPROJECT = "Metrics"
 
 
-@telemetry.send_api_usage_telemetry(
- [4 more lines removed; their content is not shown in the source view]
+@telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT)
+def d2_absolute_error_score(
+    *,
+    df: snowpark.DataFrame,
+    y_true_col_names: Union[str, List[str]],
+    y_pred_col_names: Union[str, List[str]],
+    sample_weight_col_name: Optional[str] = None,
+    multioutput: Union[str, npt.ArrayLike] = "uniform_average",
+) -> Union[float, npt.NDArray[np.float_]]:
+    """
+    :math:`D^2` regression score function, \
+    fraction of absolute error explained.
+
+    Best possible score is 1.0 and it can be negative (because the model can be
+    arbitrarily worse). A model that always uses the empirical median of `y_true`
+    as constant prediction, disregarding the input features,
+    gets a :math:`D^2` score of 0.0.
+
+    Args:
+        df: Input dataframe.
+        y_true_col_names: Column name(s) representing actual values.
+        y_pred_col_names: Column name(s) representing predicted values.
+        sample_weight_col_name: Column name representing sample weights.
+        multioutput: {'raw_values', 'uniform_average'} or array-like of shape \
+            (n_outputs,), default='uniform_average'
+            Defines aggregating of multiple output values.
+            Array-like value defines weights used to average errors.
+            'raw_values':
+                Returns a full set of errors in case of multioutput input.
+            'uniform_average':
+                Errors of all outputs are averaged with uniform weight.
+
+    Returns:
+        score: float or ndarray of floats
+            The :math:`D^2` score with an absolute error deviance
+            or ndarray of scores if 'multioutput' is 'raw_values'.
+    """
+    metrics_utils.check_label_columns(y_true_col_names, y_pred_col_names)
+
+    session = df._session
+    assert session is not None
+    sproc_name = f"d2_absolute_error_score_{snowpark_utils.generate_random_alphanumeric()}"
+    sklearn_release = version.parse(sklearn.__version__).release
+    statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT)
+    cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name])
+    query = df[cols].queries["queries"][-1]
+
+    @F.sproc(  # type: ignore[misc]
+        session=session,
+        name=sproc_name,
+        replace=True,
+        packages=[
+            "cloudpickle",
+            f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
+            "snowflake-snowpark-python",
+        ],
+        statement_params=statement_params,
+    )
+    def d2_absolute_error_score_sproc(session: snowpark.Session) -> bytes:
+        df = session.sql(query).to_pandas(statement_params=statement_params)
+        y_true = df[y_true_col_names]
+        y_pred = df[y_pred_col_names]
+        sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None
+
+        score = metrics.d2_absolute_error_score(
+            y_true,
+            y_pred,
+            sample_weight=sample_weight,
+            multioutput=multioutput,
+        )
+
+        return cloudpickle.dumps(score)  # type: ignore[no-any-return]
+
+    score: Union[float, npt.NDArray[np.float_]] = cloudpickle.loads(
+        session.call(sproc_name, statement_params=statement_params)
+    )
+    return score
+
+
+@telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT)
+def d2_pinball_score(
+    *,
+    df: snowpark.DataFrame,
+    y_true_col_names: Union[str, List[str]],
+    y_pred_col_names: Union[str, List[str]],
+    sample_weight_col_name: Optional[str] = None,
+    alpha: float = 0.5,
+    multioutput: Union[str, npt.ArrayLike] = "uniform_average",
+) -> Union[float, npt.NDArray[np.float_]]:
+    """
+    :math:`D^2` regression score function, fraction of pinball loss explained.
+
+    Best possible score is 1.0 and it can be negative (because the model can be
+    arbitrarily worse). A model that always uses the empirical alpha-quantile of
+    `y_true` as constant prediction, disregarding the input features,
+    gets a :math:`D^2` score of 0.0.
+
+    Args:
+        df: Input dataframe.
+        y_true_col_names: Column name(s) representing actual values.
+        y_pred_col_names: Column name(s) representing predicted values.
+        sample_weight_col_name: Column name representing sample weights.
+        alpha: Slope of the pinball deviance. It determines the quantile level
+            alpha for which the pinball deviance and also D2 are optimal.
+            The default `alpha=0.5` is equivalent to `d2_absolute_error_score`.
+        multioutput: {'raw_values', 'uniform_average'} or array-like of shape \
+            (n_outputs,), default='uniform_average'
+            Defines aggregating of multiple output values.
+            Array-like value defines weights used to average errors.
+            'raw_values':
+                Returns a full set of errors in case of multioutput input.
+            'uniform_average':
+                Scores of all outputs are averaged with uniform weight.
+
+    Returns:
+        score: float or ndarray of floats
+            The :math:`D^2` score with a pinball deviance
+            or ndarray of scores if `multioutput='raw_values'`.
+    """
+    metrics_utils.check_label_columns(y_true_col_names, y_pred_col_names)
+
+    session = df._session
+    assert session is not None
+    sproc_name = f"d2_pinball_score_{snowpark_utils.generate_random_alphanumeric()}"
+    sklearn_release = version.parse(sklearn.__version__).release
+    statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT)
+    cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name])
+    query = df[cols].queries["queries"][-1]
+
+    @F.sproc(  # type: ignore[misc]
+        session=session,
+        name=sproc_name,
+        replace=True,
+        packages=[
+            "cloudpickle",
+            f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
+            "snowflake-snowpark-python",
+        ],
+        statement_params=statement_params,
+    )
+    def d2_pinball_score_sproc(session: snowpark.Session) -> bytes:
+        df = session.sql(query).to_pandas(statement_params=statement_params)
+        y_true = df[y_true_col_names]
+        y_pred = df[y_pred_col_names]
+        sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None
+
+        score = metrics.d2_pinball_score(
+            y_true,
+            y_pred,
+            sample_weight=sample_weight,
+            alpha=alpha,
+            multioutput=multioutput,
+        )
+
+        return cloudpickle.dumps(score)  # type: ignore[no-any-return]
+
+    score: Union[float, npt.NDArray[np.float_]] = cloudpickle.loads(
+        session.call(sproc_name, statement_params=statement_params)
+    )
+    return score
+
+
+@telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT)
+def explained_variance_score(
+    *,
+    df: snowpark.DataFrame,
+    y_true_col_names: Union[str, List[str]],
+    y_pred_col_names: Union[str, List[str]],
+    sample_weight_col_name: Optional[str] = None,
+    multioutput: Union[str, npt.ArrayLike] = "uniform_average",
+    force_finite: bool = True,
+) -> Union[float, npt.NDArray[np.float_]]:
+    """
+    Explained variance regression score function.
+
+    Best possible score is 1.0, lower values are worse.
+
+    In the particular case when ``y_true`` is constant, the explained variance
+    score is not finite: it is either ``NaN`` (perfect predictions) or
+    ``-Inf`` (imperfect predictions). To prevent such non-finite numbers to
+    pollute higher-level experiments such as a grid search cross-validation,
+    by default these cases are replaced with 1.0 (perfect predictions) or 0.0
+    (imperfect predictions) respectively. If ``force_finite``
+    is set to ``False``, this score falls back on the original :math:`R^2`
+    definition.
+
+    Note:
+        The Explained Variance score is similar to the
+        :func:`R^2 score <r2_score>`, with the notable difference that it
+        does not account for systematic offsets in the prediction. Most often
+        the :func:`R^2 score <r2_score>` should be preferred.
+
+    Args:
+        df: Input dataframe.
+        y_true_col_names: Column name(s) representing actual values.
+        y_pred_col_names: Column name(s) representing predicted values.
+        sample_weight_col_name: Column name representing sample weights.
+        multioutput: {'raw_values', 'uniform_average', 'variance_weighted'} or \
+            array-like of shape (n_outputs,), default='uniform_average'
+            Defines aggregating of multiple output values.
+            Array-like value defines weights used to average errors.
+            'raw_values':
+                Returns a full set of scores in case of multioutput input.
+            'uniform_average':
+                Scores of all outputs are averaged with uniform weight.
+            'variance_weighted':
+                Scores of all outputs are averaged, weighted by the variances
+                of each individual output.
+        force_finite: Flag indicating if ``NaN`` and ``-Inf`` scores resulting
+            from constant data should be replaced with real numbers (``1.0`` if
+            prediction is perfect, ``0.0`` otherwise). Default is ``True``, a
+            convenient setting for hyperparameters' search procedures (e.g. grid
+            search cross-validation).
+
+    Returns:
+        score: float or ndarray of floats
+            The explained variance or ndarray if 'multioutput' is 'raw_values'.
+    """
+    metrics_utils.check_label_columns(y_true_col_names, y_pred_col_names)
+
+    session = df._session
+    assert session is not None
+    sproc_name = f"explained_variance_score_{snowpark_utils.generate_random_alphanumeric()}"
+    sklearn_release = version.parse(sklearn.__version__).release
+    statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT)
+    cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name])
+    query = df[cols].queries["queries"][-1]
+
+    @F.sproc(  # type: ignore[misc]
+        session=session,
+        name=sproc_name,
+        replace=True,
+        packages=[
+            "cloudpickle",
+            f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
+            "snowflake-snowpark-python",
+        ],
+        statement_params=statement_params,
+    )
+    def explained_variance_score_sproc(session: snowpark.Session) -> bytes:
+        df = session.sql(query).to_pandas(statement_params=statement_params)
+        y_true = df[y_true_col_names]
+        y_pred = df[y_pred_col_names]
+        sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None
+
+        score = metrics.explained_variance_score(
+            y_true,
+            y_pred,
+            sample_weight=sample_weight,
+            multioutput=multioutput,
+            force_finite=force_finite,
+        )
+
+        return cloudpickle.dumps(score)  # type: ignore[no-any-return]
+
+    score: Union[float, npt.NDArray[np.float_]] = cloudpickle.loads(
+        session.call(sproc_name, statement_params=statement_params)
+    )
+    return score
+
+
+@telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT)
+def mean_absolute_error(
+    *,
+    df: snowpark.DataFrame,
+    y_true_col_names: Union[str, List[str]],
+    y_pred_col_names: Union[str, List[str]],
+    sample_weight_col_name: Optional[str] = None,
+    multioutput: Union[str, npt.ArrayLike] = "uniform_average",
+) -> Union[float, npt.NDArray[np.float_]]:
+    """
+    Mean absolute error regression loss.
+
+    Args:
+        df: Input dataframe.
+        y_true_col_names: Column name(s) representing actual values.
+        y_pred_col_names: Column name(s) representing predicted values.
+        sample_weight_col_name: Column name representing sample weights.
+        multioutput: {'raw_values', 'uniform_average'} or array-like of shape \
+            (n_outputs,), default='uniform_average'
+            Defines aggregating of multiple output values.
+            Array-like value defines weights used to average errors.
+            'raw_values':
+                Returns a full set of errors in case of multioutput input.
+            'uniform_average':
+                Errors of all outputs are averaged with uniform weight.
+
+    Returns:
+        loss: float or ndarray of floats
+            If multioutput is 'raw_values', then mean absolute error is returned
+            for each output separately.
+            If multioutput is 'uniform_average' or an ndarray of weights, then the
+            weighted average of all output errors is returned.
+
+            MAE output is non-negative floating point. The best value is 0.0.
+    """
+    metrics_utils.check_label_columns(y_true_col_names, y_pred_col_names)
+
+    session = df._session
+    assert session is not None
+    sproc_name = f"mean_absolute_error_{snowpark_utils.generate_random_alphanumeric()}"
+    sklearn_release = version.parse(sklearn.__version__).release
+    statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT)
+    cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name])
+    query = df[cols].queries["queries"][-1]
+
+    @F.sproc(  # type: ignore[misc]
+        session=session,
+        name=sproc_name,
+        replace=True,
+        packages=[
+            "cloudpickle",
+            f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
+            "snowflake-snowpark-python",
+        ],
+        statement_params=statement_params,
+    )
+    def mean_absolute_error_sproc(session: snowpark.Session) -> bytes:
+        df = session.sql(query).to_pandas(statement_params=statement_params)
+        y_true = df[y_true_col_names]
+        y_pred = df[y_pred_col_names]
+        sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None
+
+        loss = metrics.mean_absolute_error(
+            y_true,
+            y_pred,
+            sample_weight=sample_weight,
+            multioutput=multioutput,
+        )
+
+        return cloudpickle.dumps(loss)  # type: ignore[no-any-return]
+
+    loss: Union[float, npt.NDArray[np.float_]] = cloudpickle.loads(
+        session.call(sproc_name, statement_params=statement_params)
+    )
+    return loss
+
+
+@telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT)
+def mean_absolute_percentage_error(
+    *,
+    df: snowpark.DataFrame,
+    y_true_col_names: Union[str, List[str]],
+    y_pred_col_names: Union[str, List[str]],
+    sample_weight_col_name: Optional[str] = None,
+    multioutput: Union[str, npt.ArrayLike] = "uniform_average",
+) -> Union[float, npt.NDArray[np.float_]]:
+    """
+    Mean absolute percentage error (MAPE) regression loss.
+
+    Note here that the output is not a percentage in the range [0, 100]
+    and a value of 100 does not mean 100% but 1e2. Furthermore, the output
+    can be arbitrarily high when `y_true` is small (which is specific to the
+    metric) or when `abs(y_true - y_pred)` is large (which is common for most
+    regression metrics).
+
+    Args:
+        df: Input dataframe.
+        y_true_col_names: Column name(s) representing actual values.
+        y_pred_col_names: Column name(s) representing predicted values.
+        sample_weight_col_name: Column name representing sample weights.
+        multioutput: {'raw_values', 'uniform_average'} or array-like of shape \
+            (n_outputs,), default='uniform_average'
+            Defines aggregating of multiple output values.
+            Array-like value defines weights used to average errors.
+            'raw_values':
+                Returns a full set of errors in case of multioutput input.
+            'uniform_average':
+                Errors of all outputs are averaged with uniform weight.
+
+    Returns:
+        loss: float or ndarray of floats
+            If multioutput is 'raw_values', then mean absolute percentage error
+            is returned for each output separately.
+            If multioutput is 'uniform_average' or an ndarray of weights, then the
+            weighted average of all output errors is returned.
+
+            MAPE output is non-negative floating point. The best value is 0.0.
+            But note that bad predictions can lead to arbitrarily large
+            MAPE values, especially if some `y_true` values are very close to zero.
+            Note that we return a large value instead of `inf` when `y_true` is zero.
+    """
+    metrics_utils.check_label_columns(y_true_col_names, y_pred_col_names)
+
+    session = df._session
+    assert session is not None
+    sproc_name = f"mean_absolute_percentage_error_{snowpark_utils.generate_random_alphanumeric()}"
+    sklearn_release = version.parse(sklearn.__version__).release
+    statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT)
+    cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name])
+    query = df[cols].queries["queries"][-1]
+
+    @F.sproc(  # type: ignore[misc]
+        session=session,
+        name=sproc_name,
+        replace=True,
+        packages=[
+            "cloudpickle",
+            f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
+            "snowflake-snowpark-python",
+        ],
+        statement_params=statement_params,
+    )
+    def mean_absolute_percentage_error_sproc(session: snowpark.Session) -> bytes:
+        df = session.sql(query).to_pandas(statement_params=statement_params)
+        y_true = df[y_true_col_names]
+        y_pred = df[y_pred_col_names]
+        sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None
+
+        loss = metrics.mean_absolute_percentage_error(
+            y_true,
+            y_pred,
+            sample_weight=sample_weight,
+            multioutput=multioutput,
+        )
+
+        return cloudpickle.dumps(loss)  # type: ignore[no-any-return]
+
+    loss: Union[float, npt.NDArray[np.float_]] = cloudpickle.loads(
+        session.call(sproc_name, statement_params=statement_params)
+    )
+    return loss
+
+
+@telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT)
+def mean_squared_error(
+    *,
+    df: snowpark.DataFrame,
+    y_true_col_names: Union[str, List[str]],
+    y_pred_col_names: Union[str, List[str]],
+    sample_weight_col_name: Optional[str] = None,
+    multioutput: Union[str, npt.ArrayLike] = "uniform_average",
+    squared: bool = True,
+) -> Union[float, npt.NDArray[np.float_]]:
+    """
+    Mean squared error regression loss.
+
+    Args:
+        df: Input dataframe.
+        y_true_col_names: Column name(s) representing actual values.
+        y_pred_col_names: Column name(s) representing predicted values.
+        sample_weight_col_name: Column name representing sample weights.
+        multioutput: {'raw_values', 'uniform_average'} or array-like of shape \
+            (n_outputs,), default='uniform_average'
+            Defines aggregating of multiple output values.
+            Array-like value defines weights used to average errors.
+            'raw_values':
+                Returns a full set of errors in case of multioutput input.
+            'uniform_average':
+                Errors of all outputs are averaged with uniform weight.
+        squared: If True returns MSE value, if False returns RMSE value.
+
+    Returns:
+        loss: float or ndarray of floats
+            A non-negative floating point value (the best value is 0.0), or an
+            array of floating point values, one for each individual target.
+    """
+    metrics_utils.check_label_columns(y_true_col_names, y_pred_col_names)
+
+    session = df._session
+    assert session is not None
+    sproc_name = f"mean_squared_error_{snowpark_utils.generate_random_alphanumeric()}"
+    sklearn_release = version.parse(sklearn.__version__).release
+    statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT)
+    cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name])
+    query = df[cols].queries["queries"][-1]
+
+    @F.sproc(  # type: ignore[misc]
+        session=session,
+        name=sproc_name,
+        replace=True,
+        packages=[
+            "cloudpickle",
+            f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
+            "snowflake-snowpark-python",
+        ],
+        statement_params=statement_params,
+    )
+    def mean_squared_error_sproc(session: snowpark.Session) -> bytes:
+        df = session.sql(query).to_pandas(statement_params=statement_params)
+        y_true = df[y_true_col_names]
+        y_pred = df[y_pred_col_names]
+        sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None
+
+        loss = metrics.mean_squared_error(
+            y_true,
+            y_pred,
+            sample_weight=sample_weight,
+            multioutput=multioutput,
+            squared=squared,
+        )
+
+        return cloudpickle.dumps(loss)  # type: ignore[no-any-return]
+
+    loss: Union[float, npt.NDArray[np.float_]] = cloudpickle.loads(
+        session.call(sproc_name, statement_params=statement_params)
+    )
+    return loss
+
+
+@telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT)
+def r2_score(*, df: snowpark.DataFrame, y_true_col_name: str, y_pred_col_name: str) -> float:
     """:math:`R^2` (coefficient of determination) regression score function.
     Returns R squared metric on 2 columns in the dataframe.
 
```
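Every function added above follows the same pattern: the needed columns are captured as a SQL query, a stored procedure with a randomized name (so concurrent calls do not collide) pulls the data into pandas and computes the score with scikit-learn server-side, and the result travels back as a cloudpickle payload. A hypothetical usage sketch follows; the connection parameters, table name, and column names are placeholders, not taken from the diff:

```python
from snowflake.snowpark import Session

from snowflake.ml.modeling.metrics import regression  # module extended in 1.0.2

# Placeholder credentials; substitute real values.
connection_parameters = {"account": "...", "user": "...", "password": "..."}
session = Session.builder.configs(connection_parameters).create()

# Assumed table holding ground-truth and model-output columns.
df = session.table("PREDICTIONS")

rmse = regression.mean_squared_error(
    df=df,
    y_true_col_names="LABEL",
    y_pred_col_names="PREDICTION",
    squared=False,  # per the docstring, False returns RMSE instead of MSE
)
print(rmse)
```

Note the `sklearn_release` pin in each sproc: the server-side scikit-learn is constrained to the same major.minor release as the client's, so the score computed inside Snowflake matches what the local library would produce.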
snowflake/ml/modeling/metrics/regression.py

```diff
@@ -27,9 +535,9 @@ def r2_score(*, df: DataFrame, y_true_col_name: str, y_pred_col_name: str) -> float:
     TODO(pdorairaj): Implement other params from sklearn - sample_weight, multi_output, force_finite.
 
     Args:
-        df
-        y_true_col_name
-        y_pred_col_name
+        df: Input dataframe.
+        y_true_col_name: Column name representing actual values.
+        y_pred_col_name: Column name representing predicted values.
 
     Returns:
         R squared metric.
```
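Unlike the metrics added above, `r2_score` keeps its original single-column signature (`y_true_col_name`, singular), so this hunk only fleshes out its argument docstring. Continuing the placeholder session and dataframe from the earlier sketch:

```python
# Continues the hypothetical session/df from the previous example.
r2 = regression.r2_score(
    df=df,
    y_true_col_name="LABEL",       # singular: exactly one column
    y_pred_col_name="PREDICTION",
)
print(r2)
```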
snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py

```diff
@@ -742,26 +742,37 @@ class BayesianGaussianMixture(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
- [17 lines removed; their content is not shown in the source view]
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
```

This is the same input-column matching change applied to the TSNE transformer above; the file list shows it was rolled out uniformly (+28/-17) across all of the `snowflake/ml/modeling` estimator wrappers.