snowflake-ml-python 1.6.4__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. snowflake/cortex/__init__.py +4 -0
  2. snowflake/cortex/_complete.py +107 -64
  3. snowflake/cortex/_finetune.py +273 -0
  4. snowflake/cortex/_sse_client.py +91 -28
  5. snowflake/cortex/_util.py +30 -1
  6. snowflake/ml/_internal/telemetry.py +4 -2
  7. snowflake/ml/_internal/type_utils.py +3 -3
  8. snowflake/ml/_internal/utils/import_utils.py +31 -0
  9. snowflake/ml/_internal/utils/snowpark_dataframe_utils.py +13 -0
  10. snowflake/ml/data/__init__.py +5 -0
  11. snowflake/ml/data/_internal/arrow_ingestor.py +8 -0
  12. snowflake/ml/data/data_connector.py +1 -1
  13. snowflake/ml/data/torch_utils.py +33 -14
  14. snowflake/ml/feature_store/examples/airline_features/features/plane_features.py +5 -3
  15. snowflake/ml/feature_store/examples/airline_features/features/weather_features.py +7 -5
  16. snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +4 -2
  17. snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +3 -1
  18. snowflake/ml/feature_store/examples/example_helper.py +6 -3
  19. snowflake/ml/feature_store/examples/new_york_taxi_features/features/location_features.py +4 -2
  20. snowflake/ml/feature_store/examples/new_york_taxi_features/features/trip_features.py +4 -2
  21. snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +3 -1
  22. snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +3 -1
  23. snowflake/ml/feature_store/feature_store.py +1 -2
  24. snowflake/ml/feature_store/feature_view.py +5 -1
  25. snowflake/ml/model/_client/model/model_version_impl.py +145 -11
  26. snowflake/ml/model/_client/ops/model_ops.py +56 -16
  27. snowflake/ml/model/_client/ops/service_ops.py +46 -30
  28. snowflake/ml/model/_client/service/model_deployment_spec.py +19 -8
  29. snowflake/ml/model/_client/service/model_deployment_spec_schema.py +3 -1
  30. snowflake/ml/model/_client/sql/service.py +25 -1
  31. snowflake/ml/model/_model_composer/model_composer.py +2 -0
  32. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +4 -0
  33. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +1 -0
  34. snowflake/ml/model/_model_composer/model_method/infer_function.py_template +2 -1
  35. snowflake/ml/model/_model_composer/model_method/model_method.py +1 -1
  36. snowflake/ml/model/_packager/model_env/model_env.py +12 -0
  37. snowflake/ml/model/_packager/model_handlers/_utils.py +6 -2
  38. snowflake/ml/model/_packager/model_handlers/catboost.py +4 -7
  39. snowflake/ml/model/_packager/model_handlers/custom.py +5 -1
  40. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +10 -1
  41. snowflake/ml/model/_packager/model_handlers/lightgbm.py +5 -7
  42. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +8 -1
  43. snowflake/ml/model/_packager/model_handlers/sklearn.py +51 -7
  44. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +8 -66
  45. snowflake/ml/model/_packager/model_handlers/tensorflow.py +23 -6
  46. snowflake/ml/model/_packager/model_handlers/torchscript.py +14 -14
  47. snowflake/ml/model/_packager/model_handlers/xgboost.py +10 -40
  48. snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +2 -3
  49. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +5 -0
  50. snowflake/ml/model/_packager/model_packager.py +0 -11
  51. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -10
  52. snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -9
  53. snowflake/ml/model/_packager/{model_handlers/model_objective_utils.py → model_task/model_task_utils.py} +14 -26
  54. snowflake/ml/model/_signatures/core.py +63 -16
  55. snowflake/ml/model/_signatures/pandas_handler.py +87 -27
  56. snowflake/ml/model/_signatures/pytorch_handler.py +2 -2
  57. snowflake/ml/model/_signatures/snowpark_handler.py +2 -1
  58. snowflake/ml/model/_signatures/tensorflow_handler.py +2 -2
  59. snowflake/ml/model/_signatures/utils.py +4 -0
  60. snowflake/ml/model/custom_model.py +47 -7
  61. snowflake/ml/model/model_signature.py +40 -9
  62. snowflake/ml/model/type_hints.py +9 -1
  63. snowflake/ml/modeling/_internal/estimator_utils.py +13 -0
  64. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +7 -2
  65. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +16 -5
  66. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +8 -2
  67. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -3
  68. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +1 -8
  69. snowflake/ml/modeling/cluster/agglomerative_clustering.py +17 -19
  70. snowflake/ml/modeling/cluster/dbscan.py +5 -2
  71. snowflake/ml/modeling/cluster/feature_agglomeration.py +7 -19
  72. snowflake/ml/modeling/cluster/k_means.py +14 -19
  73. snowflake/ml/modeling/cluster/mini_batch_k_means.py +3 -3
  74. snowflake/ml/modeling/cluster/optics.py +6 -6
  75. snowflake/ml/modeling/cluster/spectral_clustering.py +4 -3
  76. snowflake/ml/modeling/compose/column_transformer.py +15 -5
  77. snowflake/ml/modeling/compose/transformed_target_regressor.py +7 -6
  78. snowflake/ml/modeling/covariance/elliptic_envelope.py +1 -1
  79. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +1 -1
  80. snowflake/ml/modeling/covariance/min_cov_det.py +2 -2
  81. snowflake/ml/modeling/covariance/oas.py +1 -1
  82. snowflake/ml/modeling/decomposition/kernel_pca.py +2 -2
  83. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +5 -12
  84. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +5 -12
  85. snowflake/ml/modeling/decomposition/pca.py +28 -15
  86. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -0
  87. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +1 -12
  88. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +1 -11
  89. snowflake/ml/modeling/ensemble/bagging_classifier.py +1 -8
  90. snowflake/ml/modeling/ensemble/bagging_regressor.py +1 -8
  91. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +21 -2
  92. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +18 -2
  93. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +2 -0
  94. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +2 -0
  95. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +21 -8
  96. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +21 -11
  97. snowflake/ml/modeling/ensemble/random_forest_classifier.py +21 -2
  98. snowflake/ml/modeling/ensemble/random_forest_regressor.py +18 -2
  99. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +2 -1
  100. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +5 -3
  101. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +2 -2
  102. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +2 -4
  103. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +2 -4
  104. snowflake/ml/modeling/linear_model/ard_regression.py +5 -10
  105. snowflake/ml/modeling/linear_model/bayesian_ridge.py +5 -11
  106. snowflake/ml/modeling/linear_model/elastic_net.py +3 -0
  107. snowflake/ml/modeling/linear_model/elastic_net_cv.py +1 -1
  108. snowflake/ml/modeling/linear_model/lars.py +0 -10
  109. snowflake/ml/modeling/linear_model/lars_cv.py +1 -11
  110. snowflake/ml/modeling/linear_model/lasso_cv.py +1 -1
  111. snowflake/ml/modeling/linear_model/lasso_lars.py +0 -10
  112. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +1 -11
  113. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +0 -10
  114. snowflake/ml/modeling/linear_model/logistic_regression.py +28 -22
  115. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +30 -24
  116. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +1 -1
  117. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +1 -1
  118. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +4 -13
  119. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +4 -4
  120. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +1 -1
  121. snowflake/ml/modeling/linear_model/perceptron.py +3 -3
  122. snowflake/ml/modeling/linear_model/ransac_regressor.py +3 -2
  123. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +14 -6
  124. snowflake/ml/modeling/linear_model/ridge_cv.py +17 -11
  125. snowflake/ml/modeling/linear_model/sgd_classifier.py +2 -2
  126. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +5 -1
  127. snowflake/ml/modeling/linear_model/sgd_regressor.py +12 -3
  128. snowflake/ml/modeling/manifold/isomap.py +1 -1
  129. snowflake/ml/modeling/manifold/mds.py +3 -3
  130. snowflake/ml/modeling/manifold/tsne.py +10 -4
  131. snowflake/ml/modeling/metrics/classification.py +12 -16
  132. snowflake/ml/modeling/metrics/ranking.py +3 -3
  133. snowflake/ml/modeling/metrics/regression.py +3 -3
  134. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +3 -3
  135. snowflake/ml/modeling/naive_bayes/categorical_nb.py +3 -3
  136. snowflake/ml/modeling/naive_bayes/complement_nb.py +3 -3
  137. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +3 -3
  138. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +10 -4
  139. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +5 -2
  140. snowflake/ml/modeling/neighbors/local_outlier_factor.py +2 -2
  141. snowflake/ml/modeling/neighbors/nearest_centroid.py +7 -14
  142. snowflake/ml/modeling/neighbors/nearest_neighbors.py +1 -1
  143. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -1
  144. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +1 -1
  145. snowflake/ml/modeling/neural_network/mlp_classifier.py +7 -1
  146. snowflake/ml/modeling/neural_network/mlp_regressor.py +3 -0
  147. snowflake/ml/modeling/pipeline/pipeline.py +16 -14
  148. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +8 -4
  149. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +9 -7
  150. snowflake/ml/modeling/svm/linear_svc.py +25 -16
  151. snowflake/ml/modeling/svm/linear_svr.py +23 -17
  152. snowflake/ml/modeling/svm/nu_svc.py +5 -3
  153. snowflake/ml/modeling/svm/nu_svr.py +3 -1
  154. snowflake/ml/modeling/svm/svc.py +9 -5
  155. snowflake/ml/modeling/svm/svr.py +3 -1
  156. snowflake/ml/modeling/tree/decision_tree_classifier.py +21 -2
  157. snowflake/ml/modeling/tree/decision_tree_regressor.py +18 -2
  158. snowflake/ml/modeling/tree/extra_tree_classifier.py +28 -9
  159. snowflake/ml/modeling/tree/extra_tree_regressor.py +18 -2
  160. snowflake/ml/monitoring/_client/model_monitor_sql_client.py +448 -0
  161. snowflake/ml/monitoring/_manager/model_monitor_manager.py +238 -0
  162. snowflake/ml/monitoring/entities/model_monitor_config.py +10 -10
  163. snowflake/ml/monitoring/model_monitor.py +37 -0
  164. snowflake/ml/registry/_manager/model_manager.py +15 -1
  165. snowflake/ml/registry/registry.py +32 -37
  166. snowflake/ml/version.py +1 -1
  167. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/METADATA +104 -12
  168. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/RECORD +172 -171
  169. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/WHEEL +1 -1
  170. snowflake/ml/monitoring/_client/model_monitor.py +0 -126
  171. snowflake/ml/monitoring/_client/model_monitor_manager.py +0 -361
  172. snowflake/ml/monitoring/_client/monitor_sql_client.py +0 -1335
  173. snowflake/ml/monitoring/entities/model_monitor_interval.py +0 -46
  174. /snowflake/ml/monitoring/{_client/model_monitor_version.py → model_monitor_version.py} +0 -0
  175. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/LICENSE.txt +0 -0
  176. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/preprocessing/ordinal_encoder.py

@@ -5,16 +5,20 @@ from typing import Any, Dict, Iterable, List, Optional, Union
 
 import numpy as np
 import pandas as pd
-from sklearn import preprocessing, utils as sklearn_utils
+from sklearn import preprocessing
 
 from snowflake import snowpark
 from snowflake.ml._internal import telemetry, type_utils
 from snowflake.ml._internal.exceptions import error_codes, exceptions
-from snowflake.ml._internal.utils import identifier
+from snowflake.ml._internal.utils import identifier, import_utils
 from snowflake.ml.modeling.framework import _utils, base
 from snowflake.snowpark import functions as F, types as T
 from snowflake.snowpark._internal import utils as snowpark_utils
 
+is_scalar_nan = import_utils.import_with_fallbacks(
+    "sklearn.utils.is_scalar_nan", "sklearn.utils._missing.is_scalar_nan"
+)
+
 _COLUMN_NAME = "_COLUMN_NAME"
 _CATEGORY = "_CATEGORY"
 _INDEX = "_INDEX"
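Note: the `import_with_fallbacks` helper lives in snowflake/ml/_internal/utils/import_utils.py (file 8 above), whose body is not shown in this diff. A minimal sketch of such a helper, assuming it takes dotted attribute paths and returns the first one that resolves, could look like:

    import importlib
    from typing import Any

    def import_with_fallbacks(*targets: str) -> Any:
        """Return the first attribute that can be imported among dotted paths."""
        for target in targets:
            module_path, _, attr_name = target.rpartition(".")
            try:
                module = importlib.import_module(module_path)
                return getattr(module, attr_name)
            except (ImportError, AttributeError):
                continue  # try the next candidate path
        raise ImportError(f"none of the import targets could be resolved: {targets}")

This insulates OrdinalEncoder from scikit-learn relocating `is_scalar_nan` out of the public `sklearn.utils` namespace into `sklearn.utils._missing`.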
@@ -440,7 +444,7 @@ class OrdinalEncoder(base.BaseTransformer):
         used to encode a known category.
         """
         if self._missing_indices:
-            if not sklearn_utils.is_scalar_nan(self.encoded_missing_value):
+            if not is_scalar_nan(self.encoded_missing_value):
                 # Features are invalid when they contain a missing category
                 # and encoded_missing_value was already used to encode a
                 # known category
@@ -624,9 +628,7 @@ class OrdinalEncoder(base.BaseTransformer):
            )
 
        if self.handle_unknown == "use_encoded_value":
-           if not (
-               sklearn_utils.is_scalar_nan(self.unknown_value) or isinstance(self.unknown_value, numbers.Integral)
-           ):
+           if not (is_scalar_nan(self.unknown_value) or isinstance(self.unknown_value, numbers.Integral)):
                raise exceptions.SnowflakeMLException(
                    error_code=error_codes.INVALID_ATTRIBUTE,
                    original_exception=TypeError(
@@ -663,7 +665,7 @@ class OrdinalEncoder(base.BaseTransformer):
 
        if self.handle_unknown == "use_encoded_value":
            # left outer join has already filled unknown values with null
-           if not (self.unknown_value is None or sklearn_utils.is_scalar_nan(self.unknown_value)):
+           if not (self.unknown_value is None or is_scalar_nan(self.unknown_value)):
                transformed_dataset = transformed_dataset.na.fill(self.unknown_value, self.output_cols)
 
        return transformed_dataset
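The replacement matters because `is_scalar_nan` accepts arbitrary scalars without raising, unlike `math.isnan`. Behavior per scikit-learn's documented semantics (the import resolves through one of the two candidate paths above, depending on the installed version):

    import numpy as np
    from sklearn.utils import is_scalar_nan  # or sklearn.utils._missing on newer releases

    assert is_scalar_nan(float("nan"))
    assert is_scalar_nan(np.nan)
    assert not is_scalar_nan(None)   # None is missing, but not NaN
    assert not is_scalar_nan("nan")  # strings never count
    # math.isnan(None) or math.isnan("nan") would raise TypeError instead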

snowflake/ml/modeling/svm/linear_svc.py

@@ -120,12 +120,12 @@ class LinearSVC(BaseTransformer):
         square of the hinge loss. The combination of ``penalty='l1'``
         and ``loss='hinge'`` is not supported.
 
-    dual: "auto" or bool, default=True
+    dual: "auto" or bool, default="auto"
         Select the algorithm to either solve the dual or primal
         optimization problem. Prefer dual=False when n_samples > n_features.
         `dual="auto"` will choose the value of the parameter automatically,
         based on the values of `n_samples`, `n_features`, `loss`, `multi_class`
-        and `penalty`. If `n_samples` < `n_features` and optmizer supports
+        and `penalty`. If `n_samples` < `n_features` and optimizer supports
         chosen `loss`, `multi_class` and `penalty`, then dual will be set to True,
         otherwise it will be set to False.
 
@@ -135,6 +135,9 @@ class LinearSVC(BaseTransformer):
     C: float, default=1.0
         Regularization parameter. The strength of the regularization is
         inversely proportional to C. Must be strictly positive.
+        For an intuitive visualization of the effects of scaling
+        the regularization parameter C, see
+        :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.
 
     multi_class: {'ovr', 'crammer_singer'}, default='ovr'
         Determines the multi-class strategy if `y` contains more than
@@ -148,20 +151,26 @@ class LinearSVC(BaseTransformer):
         will be ignored.
 
     fit_intercept: bool, default=True
-        Whether to calculate the intercept for this model. If set
-        to false, no intercept will be used in calculations
-        (i.e. data is expected to be already centered).
+        Whether or not to fit an intercept. If set to True, the feature vector
+        is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where
+        1 corresponds to the intercept. If set to False, no intercept will be
+        used in calculations (i.e. data is expected to be already centered).
 
     intercept_scaling: float, default=1.0
-        When self.fit_intercept is True, instance vector x becomes
-        ``[x, self.intercept_scaling]``,
-        i.e. a "synthetic" feature with constant value equals to
-        intercept_scaling is appended to the instance vector.
-        The intercept becomes intercept_scaling * synthetic feature weight
-        Note! the synthetic feature weight is subject to l1/l2 regularization
-        as all other features.
-        To lessen the effect of regularization on synthetic feature weight
-        (and therefore on the intercept) intercept_scaling has to be increased.
+        When `fit_intercept` is True, the instance vector x becomes ``[x_1,
+        ..., x_n, intercept_scaling]``, i.e. a "synthetic" feature with a
+        constant value equal to `intercept_scaling` is appended to the instance
+        vector. The intercept becomes intercept_scaling * synthetic feature
+        weight. Note that liblinear internally penalizes the intercept,
+        treating it like any other term in the feature vector. To reduce the
+        impact of the regularization on the intercept, the `intercept_scaling`
+        parameter can be set to a value greater than 1; the higher the value of
+        `intercept_scaling`, the lower the impact of regularization on it.
+        Then, the weights become `[w_x_1, ..., w_x_n,
+        w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent
+        the feature weights and the intercept weight is scaled by
+        `intercept_scaling`. This scaling allows the intercept term to have a
+        different regularization behavior compared to the other features.
 
     class_weight: dict or 'balanced', default=None
         Set the parameter C of class i to ``class_weight[i]*C`` for
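A small numeric illustration of the intercept_scaling mechanics described above (all values invented): liblinear learns a weight for the appended synthetic feature, and the reported intercept is that weight times `intercept_scaling`.

    import numpy as np

    intercept_scaling = 10.0
    x = np.array([0.5, -1.2])                  # original instance [x_1, x_2]
    x_aug = np.append(x, intercept_scaling)    # [x_1, x_2, intercept_scaling]
    w_aug = np.array([0.8, 0.3, 0.05])         # learned weights, incl. synthetic feature
    intercept = w_aug[-1] * intercept_scaling  # 0.05 * 10.0 = 0.5
    decision = x_aug @ w_aug                   # same as x @ w_aug[:2] + intercept
    assert np.isclose(decision, x @ w_aug[:2] + intercept)

Only the small synthetic weight (0.05), not the effective intercept (0.5), is regularized, which is why raising `intercept_scaling` lessens the penalty's pull on the intercept.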
@@ -193,7 +202,7 @@ class LinearSVC(BaseTransformer):
         *,
         penalty="l2",
         loss="squared_hinge",
-        dual="warn",
+        dual="auto",
         tol=0.0001,
         C=1.0,
         multi_class="ovr",
@@ -226,7 +235,7 @@ class LinearSVC(BaseTransformer):
 
         init_args = {'penalty':(penalty, "l2", False),
                      'loss':(loss, "squared_hinge", False),
-                     'dual':(dual, "warn", False),
+                     'dual':(dual, "auto", False),
                      'tol':(tol, 0.0001, False),
                      'C':(C, 1.0, False),
                      'multi_class':(multi_class, "ovr", False),
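The `(value, default, ...)` tuples feed `validate_sklearn_args`, whose implementation is not part of this diff. A plausible sketch under stated assumptions (that non-default arguments are passed through, that arguments unknown to the installed scikit-learn are rejected, and that the third tuple element marks deprecation) might be:

    import inspect
    from typing import Any, Dict, Tuple

    def validate_sklearn_args(args: Dict[str, Tuple[Any, Any, bool]], klass: type) -> Dict[str, Any]:
        """Hypothetical: keep non-default args accepted by the installed sklearn class."""
        accepted = set(inspect.signature(klass.__init__).parameters)
        cleaned = {}
        for name, (value, default, _is_deprecated) in args.items():
            if value == default:
                continue  # omit defaults so sklearn's own defaults apply
            if name not in accepted:
                raise RuntimeError(
                    f"{klass.__name__} does not accept {name!r}; "
                    "check the installed scikit-learn version"
                )
            cleaned[name] = value
        return cleaned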

snowflake/ml/modeling/svm/linear_svr.py

@@ -127,27 +127,33 @@ class LinearSVR(BaseTransformer):
         loss ('squared_epsilon_insensitive') is the L2 loss.
 
     fit_intercept: bool, default=True
-        Whether to calculate the intercept for this model. If set
-        to false, no intercept will be used in calculations
-        (i.e. data is expected to be already centered).
+        Whether or not to fit an intercept. If set to True, the feature vector
+        is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where
+        1 corresponds to the intercept. If set to False, no intercept will be
+        used in calculations (i.e. data is expected to be already centered).
 
     intercept_scaling: float, default=1.0
-        When self.fit_intercept is True, instance vector x becomes
-        [x, self.intercept_scaling],
-        i.e. a "synthetic" feature with constant value equals to
-        intercept_scaling is appended to the instance vector.
-        The intercept becomes intercept_scaling * synthetic feature weight
-        Note! the synthetic feature weight is subject to l1/l2 regularization
-        as all other features.
-        To lessen the effect of regularization on synthetic feature weight
-        (and therefore on the intercept) intercept_scaling has to be increased.
-
-    dual: "auto" or bool, default=True
+        When `fit_intercept` is True, the instance vector x becomes `[x_1, ...,
+        x_n, intercept_scaling]`, i.e. a "synthetic" feature with a constant
+        value equal to `intercept_scaling` is appended to the instance vector.
+        The intercept becomes intercept_scaling * synthetic feature weight.
+        Note that liblinear internally penalizes the intercept, treating it
+        like any other term in the feature vector. To reduce the impact of the
+        regularization on the intercept, the `intercept_scaling` parameter can
+        be set to a value greater than 1; the higher the value of
+        `intercept_scaling`, the lower the impact of regularization on it.
+        Then, the weights become `[w_x_1, ..., w_x_n,
+        w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent
+        the feature weights and the intercept weight is scaled by
+        `intercept_scaling`. This scaling allows the intercept term to have a
+        different regularization behavior compared to the other features.
+
+    dual: "auto" or bool, default="auto"
         Select the algorithm to either solve the dual or primal
         optimization problem. Prefer dual=False when n_samples > n_features.
         `dual="auto"` will choose the value of the parameter automatically,
         based on the values of `n_samples`, `n_features` and `loss`. If
-        `n_samples` < `n_features` and optmizer supports chosen `loss`,
+        `n_samples` < `n_features` and optimizer supports chosen `loss`,
         then dual will be set to True, otherwise it will be set to False.
 
     verbose: int, default=0
@@ -173,7 +179,7 @@ class LinearSVR(BaseTransformer):
         loss="epsilon_insensitive",
         fit_intercept=True,
         intercept_scaling=1.0,
-        dual="warn",
+        dual="auto",
         verbose=0,
         random_state=None,
         max_iter=1000,
@@ -204,7 +210,7 @@ class LinearSVR(BaseTransformer):
                      'loss':(loss, "epsilon_insensitive", False),
                      'fit_intercept':(fit_intercept, True, False),
                      'intercept_scaling':(intercept_scaling, 1.0, False),
-                     'dual':(dual, "warn", False),
+                     'dual':(dual, "auto", False),
                      'verbose':(verbose, 0, False),
                      'random_state':(random_state, None, False),
                      'max_iter':(max_iter, 1000, False),}

snowflake/ml/modeling/svm/nu_svc.py

@@ -115,9 +115,11 @@ class NuSVC(BaseTransformer):
         Should be in the interval (0, 1].
 
     kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'
-        Specifies the kernel type to be used in the algorithm.
-        If none is given, 'rbf' will be used. If a callable is given it is
-        used to precompute the kernel matrix.
+        Specifies the kernel type to be used in the algorithm.
+        If none is given, 'rbf' will be used. If a callable is given it is
+        used to precompute the kernel matrix. For an intuitive
+        visualization of different kernel types see
+        :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.
 
     degree: int, default=3
         Degree of the polynomial kernel function ('poly').

snowflake/ml/modeling/svm/nu_svr.py

@@ -115,7 +115,9 @@ class NuSVR(BaseTransformer):
         default 0.5 will be taken.
 
     C: float, default=1.0
-        Penalty parameter C of the error term.
+        Penalty parameter C of the error term. For an intuitive visualization
+        of the effects of scaling the regularization parameter C, see
+        :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.
 
     kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'
         Specifies the kernel type to be used in the algorithm.

snowflake/ml/modeling/svm/svc.py

@@ -112,13 +112,17 @@ class SVC(BaseTransformer):
     C: float, default=1.0
         Regularization parameter. The strength of the regularization is
         inversely proportional to C. Must be strictly positive. The penalty
-        is a squared l2 penalty.
+        is a squared l2 penalty. For an intuitive visualization of the effects
+        of scaling the regularization parameter C, see
+        :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.
 
     kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'
-        Specifies the kernel type to be used in the algorithm.
-        If none is given, 'rbf' will be used. If a callable is given it is
-        used to pre-compute the kernel matrix from data matrices; that matrix
-        should be an array of shape ``(n_samples, n_samples)``.
+        Specifies the kernel type to be used in the algorithm. If
+        none is given, 'rbf' will be used. If a callable is given it is used to
+        pre-compute the kernel matrix from data matrices; that matrix should be
+        an array of shape ``(n_samples, n_samples)``. For an intuitive
+        visualization of different kernel types see
+        :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.
 
     degree: int, default=3
         Degree of the polynomial kernel function ('poly').
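As the kernel description notes, a callable must return the kernel matrix for two data matrices. A minimal sketch using the underlying scikit-learn estimator (callable kernels behave the same way there):

    import numpy as np
    from sklearn.svm import SVC

    def linear_kernel(X, Y):
        # must return an array of shape (n_samples_X, n_samples_Y)
        return X @ Y.T

    X = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
    y = np.array([0, 1, 1, 0])
    clf = SVC(kernel=linear_kernel).fit(X, y)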

snowflake/ml/modeling/svm/svr.py

@@ -136,7 +136,9 @@ class SVR(BaseTransformer):
     C: float, default=1.0
         Regularization parameter. The strength of the regularization is
         inversely proportional to C. Must be strictly positive.
-        The penalty is a squared l2 penalty.
+        The penalty is a squared l2. For an intuitive visualization of the
+        effects of scaling the regularization parameter C, see
+        :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.
 
     epsilon: float, default=0.1
         Epsilon in the epsilon-SVR model. It specifies the epsilon-tube

snowflake/ml/modeling/tree/decision_tree_classifier.py

@@ -149,7 +149,7 @@ class DecisionTreeClassifier(BaseTransformer):
         the input samples) required to be at a leaf node. Samples have
         equal weight when sample_weight is not provided.
 
-    max_features: int, float or {"auto", "sqrt", "log2"}, default=None
+    max_features: int, float or {"sqrt", "log2"}, default=None
         The number of features to consider when looking for the best split:
 
         - If int, then consider `max_features` features at each split.
@@ -223,6 +223,23 @@ class DecisionTreeClassifier(BaseTransformer):
         subtree with the largest cost complexity that is smaller than
         ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
         :ref:`minimal_cost_complexity_pruning` for details.
+
+    monotonic_cst: array-like of int of shape (n_features), default=None
+        Indicates the monotonicity constraint to enforce on each feature.
+          - 1: monotonic increase
+          - 0: no constraint
+          - -1: monotonic decrease
+
+        If monotonic_cst is None, no constraints are applied.
+
+        Monotonicity constraints are not supported for:
+          - multiclass classifications (i.e. when `n_classes > 2`),
+          - multioutput classifications (i.e. when `n_outputs_ > 1`),
+          - classifications trained on data with missing values.
+
+        The constraints hold over the probability of the positive class.
+
+        Read more in the :ref:`User Guide <monotonic_cst_gbdt>`.
     """
 
    def __init__(  # type: ignore[no-untyped-def]
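A hedged usage sketch of the new parameter (column names are invented; the wrapper forwards `monotonic_cst` to `sklearn.tree.DecisionTreeClassifier`, so a scikit-learn version that supports it, 1.4+, is assumed):

    from snowflake.ml.modeling.tree import DecisionTreeClassifier

    clf = DecisionTreeClassifier(
        input_cols=["CREDIT_SCORE", "REGION_CODE"],
        label_cols=["DEFAULTED"],
        output_cols=["PREDICTED_DEFAULT"],
        # -1: P(positive class) may only fall as CREDIT_SCORE rises; 0: unconstrained
        monotonic_cst=[-1, 0],
    )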
@@ -240,6 +257,7 @@ class DecisionTreeClassifier(BaseTransformer):
         min_impurity_decrease=0.0,
         class_weight=None,
         ccp_alpha=0.0,
+        monotonic_cst=None,
         input_cols: Optional[Union[str, Iterable[str]]] = None,
         output_cols: Optional[Union[str, Iterable[str]]] = None,
         label_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -272,7 +290,8 @@ class DecisionTreeClassifier(BaseTransformer):
                      'max_leaf_nodes':(max_leaf_nodes, None, False),
                      'min_impurity_decrease':(min_impurity_decrease, 0.0, False),
                      'class_weight':(class_weight, None, False),
-                     'ccp_alpha':(ccp_alpha, 0.0, False),}
+                     'ccp_alpha':(ccp_alpha, 0.0, False),
+                     'monotonic_cst':(monotonic_cst, None, False),}
         cleaned_up_init_args = validate_sklearn_args(
             args=init_args,
             klass=sklearn.tree.DecisionTreeClassifier

snowflake/ml/modeling/tree/decision_tree_regressor.py

@@ -154,7 +154,7 @@ class DecisionTreeRegressor(BaseTransformer):
         the input samples) required to be at a leaf node. Samples have
         equal weight when sample_weight is not provided.
 
-    max_features: int, float or {"auto", "sqrt", "log2"}, default=None
+    max_features: int, float or {"sqrt", "log2"}, default=None
         The number of features to consider when looking for the best split:
 
         - If int, then consider `max_features` features at each split.
@@ -207,6 +207,20 @@ class DecisionTreeRegressor(BaseTransformer):
         subtree with the largest cost complexity that is smaller than
         ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
         :ref:`minimal_cost_complexity_pruning` for details.
+
+    monotonic_cst: array-like of int of shape (n_features), default=None
+        Indicates the monotonicity constraint to enforce on each feature.
+          - 1: monotonic increase
+          - 0: no constraint
+          - -1: monotonic decrease
+
+        If monotonic_cst is None, no constraints are applied.
+
+        Monotonicity constraints are not supported for:
+          - multioutput regressions (i.e. when `n_outputs_ > 1`),
+          - regressions trained on data with missing values.
+
+        Read more in the :ref:`User Guide <monotonic_cst_gbdt>`.
     """
 
    def __init__(  # type: ignore[no-untyped-def]
@@ -223,6 +237,7 @@ class DecisionTreeRegressor(BaseTransformer):
         max_leaf_nodes=None,
         min_impurity_decrease=0.0,
         ccp_alpha=0.0,
+        monotonic_cst=None,
         input_cols: Optional[Union[str, Iterable[str]]] = None,
         output_cols: Optional[Union[str, Iterable[str]]] = None,
         label_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -254,7 +269,8 @@ class DecisionTreeRegressor(BaseTransformer):
                      'random_state':(random_state, None, False),
                      'max_leaf_nodes':(max_leaf_nodes, None, False),
                      'min_impurity_decrease':(min_impurity_decrease, 0.0, False),
-                     'ccp_alpha':(ccp_alpha, 0.0, False),}
+                     'ccp_alpha':(ccp_alpha, 0.0, False),
+                     'monotonic_cst':(monotonic_cst, None, False),}
         cleaned_up_init_args = validate_sklearn_args(
             args=init_args,
             klass=sklearn.tree.DecisionTreeRegressor

snowflake/ml/modeling/tree/extra_tree_classifier.py

@@ -149,16 +149,16 @@ class ExtraTreeClassifier(BaseTransformer):
         the input samples) required to be at a leaf node. Samples have
         equal weight when sample_weight is not provided.
 
-    max_features: int, float, {"auto", "sqrt", "log2"} or None, default="sqrt"
+    max_features: int, float, {"sqrt", "log2"} or None, default="sqrt"
         The number of features to consider when looking for the best split:
 
-        - If int, then consider `max_features` features at each split.
-        - If float, then `max_features` is a fraction and
-          `max(1, int(max_features * n_features_in_))` features are considered at
-          each split.
-        - If "sqrt", then `max_features=sqrt(n_features)`.
-        - If "log2", then `max_features=log2(n_features)`.
-        - If None, then `max_features=n_features`.
+            - If int, then consider `max_features` features at each split.
+            - If float, then `max_features` is a fraction and
+              `max(1, int(max_features * n_features_in_))` features are considered at
+              each split.
+            - If "sqrt", then `max_features=sqrt(n_features)`.
+            - If "log2", then `max_features=log2(n_features)`.
+            - If None, then `max_features=n_features`.
 
         Note: the search for a split does not stop until at least one
         valid partition of the node samples is found, even if it requires to
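The float case above is easy to sanity-check by hand, e.g. with ten input features:

    n_features_in_ = 10  # hypothetical feature count
    assert max(1, int(0.25 * n_features_in_)) == 2  # int(2.5) truncates to 2
    assert max(1, int(0.01 * n_features_in_)) == 1  # never fewer than one feature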
@@ -215,6 +215,23 @@ class ExtraTreeClassifier(BaseTransformer):
         subtree with the largest cost complexity that is smaller than
         ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
         :ref:`minimal_cost_complexity_pruning` for details.
+
+    monotonic_cst: array-like of int of shape (n_features), default=None
+        Indicates the monotonicity constraint to enforce on each feature.
+          - 1: monotonic increase
+          - 0: no constraint
+          - -1: monotonic decrease
+
+        If monotonic_cst is None, no constraints are applied.
+
+        Monotonicity constraints are not supported for:
+          - multiclass classifications (i.e. when `n_classes > 2`),
+          - multioutput classifications (i.e. when `n_outputs_ > 1`),
+          - classifications trained on data with missing values.
+
+        The constraints hold over the probability of the positive class.
+
+        Read more in the :ref:`User Guide <monotonic_cst_gbdt>`.
     """
 
    def __init__(  # type: ignore[no-untyped-def]
220
237
  def __init__( # type: ignore[no-untyped-def]
@@ -232,6 +249,7 @@ class ExtraTreeClassifier(BaseTransformer):
232
249
  min_impurity_decrease=0.0,
233
250
  class_weight=None,
234
251
  ccp_alpha=0.0,
252
+ monotonic_cst=None,
235
253
  input_cols: Optional[Union[str, Iterable[str]]] = None,
236
254
  output_cols: Optional[Union[str, Iterable[str]]] = None,
237
255
  label_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -264,7 +282,8 @@ class ExtraTreeClassifier(BaseTransformer):
                      'max_leaf_nodes':(max_leaf_nodes, None, False),
                      'min_impurity_decrease':(min_impurity_decrease, 0.0, False),
                      'class_weight':(class_weight, None, False),
-                     'ccp_alpha':(ccp_alpha, 0.0, False),}
+                     'ccp_alpha':(ccp_alpha, 0.0, False),
+                     'monotonic_cst':(monotonic_cst, None, False),}
         cleaned_up_init_args = validate_sklearn_args(
             args=init_args,
             klass=sklearn.tree.ExtraTreeClassifier

snowflake/ml/modeling/tree/extra_tree_regressor.py

@@ -154,7 +154,7 @@ class ExtraTreeRegressor(BaseTransformer):
         the input samples) required to be at a leaf node. Samples have
         equal weight when sample_weight is not provided.
 
-    max_features: int, float, {"auto", "sqrt", "log2"} or None, default=1.0
+    max_features: int, float, {"sqrt", "log2"} or None, default=1.0
         The number of features to consider when looking for the best split:
 
         - If int, then consider `max_features` features at each split.
@@ -199,6 +199,20 @@ class ExtraTreeRegressor(BaseTransformer):
         subtree with the largest cost complexity that is smaller than
         ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
         :ref:`minimal_cost_complexity_pruning` for details.
+
+    monotonic_cst: array-like of int of shape (n_features), default=None
+        Indicates the monotonicity constraint to enforce on each feature.
+          - 1: monotonic increase
+          - 0: no constraint
+          - -1: monotonic decrease
+
+        If monotonic_cst is None, no constraints are applied.
+
+        Monotonicity constraints are not supported for:
+          - multioutput regressions (i.e. when `n_outputs_ > 1`),
+          - regressions trained on data with missing values.
+
+        Read more in the :ref:`User Guide <monotonic_cst_gbdt>`.
     """
 
    def __init__(  # type: ignore[no-untyped-def]
204
218
  def __init__( # type: ignore[no-untyped-def]
@@ -215,6 +229,7 @@ class ExtraTreeRegressor(BaseTransformer):
215
229
  min_impurity_decrease=0.0,
216
230
  max_leaf_nodes=None,
217
231
  ccp_alpha=0.0,
232
+ monotonic_cst=None,
218
233
  input_cols: Optional[Union[str, Iterable[str]]] = None,
219
234
  output_cols: Optional[Union[str, Iterable[str]]] = None,
220
235
  label_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -246,7 +261,8 @@ class ExtraTreeRegressor(BaseTransformer):
                      'random_state':(random_state, None, False),
                      'min_impurity_decrease':(min_impurity_decrease, 0.0, False),
                      'max_leaf_nodes':(max_leaf_nodes, None, False),
-                     'ccp_alpha':(ccp_alpha, 0.0, False),}
+                     'ccp_alpha':(ccp_alpha, 0.0, False),
+                     'monotonic_cst':(monotonic_cst, None, False),}
         cleaned_up_init_args = validate_sklearn_args(
             args=init_args,
             klass=sklearn.tree.ExtraTreeRegressor