snowflake-ml-python 1.6.3__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. snowflake/ml/_internal/telemetry.py +4 -2
  2. snowflake/ml/_internal/utils/import_utils.py +31 -0
  3. snowflake/ml/_internal/utils/snowpark_dataframe_utils.py +13 -0
  4. snowflake/ml/data/_internal/arrow_ingestor.py +8 -0
  5. snowflake/ml/data/data_connector.py +1 -1
  6. snowflake/ml/data/torch_utils.py +33 -14
  7. snowflake/ml/feature_store/examples/airline_features/features/plane_features.py +5 -3
  8. snowflake/ml/feature_store/examples/airline_features/features/weather_features.py +7 -5
  9. snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +4 -2
  10. snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +3 -1
  11. snowflake/ml/feature_store/examples/example_helper.py +6 -3
  12. snowflake/ml/feature_store/examples/new_york_taxi_features/features/location_features.py +4 -2
  13. snowflake/ml/feature_store/examples/new_york_taxi_features/features/trip_features.py +4 -2
  14. snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +3 -1
  15. snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +3 -1
  16. snowflake/ml/feature_store/feature_store.py +1 -2
  17. snowflake/ml/feature_store/feature_view.py +5 -1
  18. snowflake/ml/model/_client/model/model_version_impl.py +144 -10
  19. snowflake/ml/model/_client/ops/model_ops.py +25 -6
  20. snowflake/ml/model/_client/ops/service_ops.py +33 -28
  21. snowflake/ml/model/_client/service/model_deployment_spec.py +19 -8
  22. snowflake/ml/model/_client/service/model_deployment_spec_schema.py +3 -1
  23. snowflake/ml/model/_client/sql/model.py +14 -0
  24. snowflake/ml/model/_client/sql/service.py +6 -18
  25. snowflake/ml/model/_model_composer/model_composer.py +2 -0
  26. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +4 -0
  27. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +1 -0
  28. snowflake/ml/model/_model_composer/model_method/model_method.py +1 -1
  29. snowflake/ml/model/_packager/model_handlers/_utils.py +5 -1
  30. snowflake/ml/model/_packager/model_handlers/catboost.py +3 -6
  31. snowflake/ml/model/_packager/model_handlers/custom.py +2 -0
  32. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +10 -1
  33. snowflake/ml/model/_packager/model_handlers/lightgbm.py +3 -6
  34. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +8 -1
  35. snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -6
  36. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +7 -65
  37. snowflake/ml/model/_packager/model_handlers/xgboost.py +10 -40
  38. snowflake/ml/model/_packager/model_packager.py +0 -11
  39. snowflake/ml/model/_packager/{model_handlers/model_objective_utils.py → model_task/model_task_utils.py} +13 -25
  40. snowflake/ml/model/_signatures/pandas_handler.py +16 -0
  41. snowflake/ml/model/custom_model.py +47 -7
  42. snowflake/ml/model/model_signature.py +2 -0
  43. snowflake/ml/model/type_hints.py +8 -0
  44. snowflake/ml/modeling/_internal/estimator_utils.py +13 -0
  45. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +7 -2
  46. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +16 -5
  47. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +8 -2
  48. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -3
  49. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +1 -8
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +17 -19
  51. snowflake/ml/modeling/cluster/dbscan.py +5 -2
  52. snowflake/ml/modeling/cluster/feature_agglomeration.py +7 -19
  53. snowflake/ml/modeling/cluster/k_means.py +14 -19
  54. snowflake/ml/modeling/cluster/mini_batch_k_means.py +3 -3
  55. snowflake/ml/modeling/cluster/optics.py +6 -6
  56. snowflake/ml/modeling/cluster/spectral_clustering.py +4 -3
  57. snowflake/ml/modeling/compose/column_transformer.py +15 -5
  58. snowflake/ml/modeling/compose/transformed_target_regressor.py +7 -6
  59. snowflake/ml/modeling/covariance/elliptic_envelope.py +1 -1
  60. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +1 -1
  61. snowflake/ml/modeling/covariance/min_cov_det.py +2 -2
  62. snowflake/ml/modeling/covariance/oas.py +1 -1
  63. snowflake/ml/modeling/decomposition/kernel_pca.py +2 -2
  64. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +5 -12
  65. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +5 -12
  66. snowflake/ml/modeling/decomposition/pca.py +28 -15
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -0
  68. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +1 -12
  69. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +1 -11
  70. snowflake/ml/modeling/ensemble/bagging_classifier.py +1 -8
  71. snowflake/ml/modeling/ensemble/bagging_regressor.py +1 -8
  72. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +21 -2
  73. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +18 -2
  74. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +2 -0
  75. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +2 -0
  76. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +21 -8
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +21 -11
  78. snowflake/ml/modeling/ensemble/random_forest_classifier.py +21 -2
  79. snowflake/ml/modeling/ensemble/random_forest_regressor.py +18 -2
  80. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +2 -1
  81. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +5 -3
  82. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +2 -2
  83. snowflake/ml/modeling/linear_model/ard_regression.py +5 -10
  84. snowflake/ml/modeling/linear_model/bayesian_ridge.py +5 -11
  85. snowflake/ml/modeling/linear_model/elastic_net.py +3 -0
  86. snowflake/ml/modeling/linear_model/elastic_net_cv.py +1 -1
  87. snowflake/ml/modeling/linear_model/lars.py +0 -10
  88. snowflake/ml/modeling/linear_model/lars_cv.py +1 -11
  89. snowflake/ml/modeling/linear_model/lasso_cv.py +1 -1
  90. snowflake/ml/modeling/linear_model/lasso_lars.py +0 -10
  91. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +1 -11
  92. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +0 -10
  93. snowflake/ml/modeling/linear_model/logistic_regression.py +28 -22
  94. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +30 -24
  95. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +1 -1
  96. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +1 -1
  97. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +4 -13
  98. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +4 -4
  99. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +1 -1
  100. snowflake/ml/modeling/linear_model/perceptron.py +3 -3
  101. snowflake/ml/modeling/linear_model/ransac_regressor.py +3 -2
  102. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +14 -6
  103. snowflake/ml/modeling/linear_model/ridge_cv.py +17 -11
  104. snowflake/ml/modeling/linear_model/sgd_classifier.py +2 -2
  105. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +5 -1
  106. snowflake/ml/modeling/linear_model/sgd_regressor.py +12 -3
  107. snowflake/ml/modeling/manifold/isomap.py +1 -1
  108. snowflake/ml/modeling/manifold/mds.py +3 -3
  109. snowflake/ml/modeling/manifold/tsne.py +10 -4
  110. snowflake/ml/modeling/metrics/classification.py +12 -16
  111. snowflake/ml/modeling/metrics/ranking.py +3 -3
  112. snowflake/ml/modeling/metrics/regression.py +3 -3
  113. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +3 -3
  114. snowflake/ml/modeling/naive_bayes/categorical_nb.py +3 -3
  115. snowflake/ml/modeling/naive_bayes/complement_nb.py +3 -3
  116. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +3 -3
  117. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +10 -4
  118. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +5 -2
  119. snowflake/ml/modeling/neighbors/local_outlier_factor.py +2 -2
  120. snowflake/ml/modeling/neighbors/nearest_centroid.py +7 -14
  121. snowflake/ml/modeling/neighbors/nearest_neighbors.py +1 -1
  122. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -1
  123. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +1 -1
  124. snowflake/ml/modeling/neural_network/mlp_classifier.py +7 -1
  125. snowflake/ml/modeling/neural_network/mlp_regressor.py +3 -0
  126. snowflake/ml/modeling/pipeline/pipeline.py +16 -14
  127. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +8 -4
  128. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +9 -7
  129. snowflake/ml/modeling/svm/linear_svc.py +25 -16
  130. snowflake/ml/modeling/svm/linear_svr.py +23 -17
  131. snowflake/ml/modeling/svm/nu_svc.py +5 -3
  132. snowflake/ml/modeling/svm/nu_svr.py +3 -1
  133. snowflake/ml/modeling/svm/svc.py +9 -5
  134. snowflake/ml/modeling/svm/svr.py +3 -1
  135. snowflake/ml/modeling/tree/decision_tree_classifier.py +21 -2
  136. snowflake/ml/modeling/tree/decision_tree_regressor.py +18 -2
  137. snowflake/ml/modeling/tree/extra_tree_classifier.py +28 -9
  138. snowflake/ml/modeling/tree/extra_tree_regressor.py +18 -2
  139. snowflake/ml/monitoring/_client/{monitor_sql_client.py → model_monitor_sql_client.py} +1 -1
  140. snowflake/ml/monitoring/{_client → _manager}/model_monitor_manager.py +9 -8
  141. snowflake/ml/monitoring/{_client/model_monitor.py → model_monitor.py} +3 -3
  142. snowflake/ml/registry/_manager/model_manager.py +15 -1
  143. snowflake/ml/registry/registry.py +15 -8
  144. snowflake/ml/version.py +1 -1
  145. {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/METADATA +81 -9
  146. {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/RECORD +150 -150
  147. {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/WHEEL +1 -1
  148. /snowflake/ml/monitoring/{_client/model_monitor_version.py → model_monitor_version.py} +0 -0
  149. {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/LICENSE.txt +0 -0
  150. {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/manifold/isomap.py
@@ -165,7 +165,7 @@ class Isomap(BaseTransformer):
   If metric is "precomputed", X is assumed to be a distance matrix and
   must be square. X may be a :term:`Glossary <sparse graph>`.

-  p: int, default=2
+  p: float, default=2
   Parameter for the Minkowski metric from
   sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
   equivalent to using manhattan_distance (l1), and euclidean_distance

snowflake/ml/modeling/manifold/mds.py
@@ -157,7 +157,7 @@ class MDS(BaseTransformer):
   Pre-computed dissimilarities are passed directly to ``fit`` and
   ``fit_transform``.

-  normalized_stress: bool or "auto" default=False
+  normalized_stress: bool or "auto" default="auto"
   Whether use and return normed stress value (Stress-1) instead of raw
   stress calculated by default. Only supported in non-metric MDS.
   """
@@ -174,7 +174,7 @@ class MDS(BaseTransformer):
   n_jobs=None,
   random_state=None,
   dissimilarity="euclidean",
-  normalized_stress="warn",
+  normalized_stress="auto",
   input_cols: Optional[Union[str, Iterable[str]]] = None,
   output_cols: Optional[Union[str, Iterable[str]]] = None,
   label_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -205,7 +205,7 @@ class MDS(BaseTransformer):
   'n_jobs':(n_jobs, None, False),
   'random_state':(random_state, None, False),
   'dissimilarity':(dissimilarity, "euclidean", False),
-  'normalized_stress':(normalized_stress, "warn", False),}
+  'normalized_stress':(normalized_stress, "auto", False),}
   cleaned_up_init_args = validate_sklearn_args(
       args=init_args,
       klass=sklearn.manifold.MDS
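
A hedged migration sketch for the MDS hunks above: normalized_stress now defaults to "auto" (the scikit-learn 1.4 behavior) instead of the old "warn" sentinel. The column names below are assumptions for illustration, not part of the diff.

from snowflake.ml.modeling.manifold import MDS

# "auto" means Stress-1 for non-metric MDS and raw stress for metric MDS;
# pass False explicitly to pin the previous raw-stress behavior.
mds = MDS(
    n_components=2,
    normalized_stress=False,   # explicit opt-out of the new default
    input_cols=["X1", "X2"],   # assumed column names
    output_cols=["C1", "C2"],
)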

snowflake/ml/modeling/manifold/tsne.py
@@ -143,7 +143,7 @@ class TSNE(BaseTransformer):
   to `max(N / early_exaggeration / 4, 50)` where N is the sample size,
   following [4] and [5].

-  n_iter: int, default=1000
+  max_iter: int, default=1000
   Maximum number of iterations for the optimization. Should be at
   least 250.

@@ -211,6 +211,10 @@ class TSNE(BaseTransformer):
   ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
   ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
   for more details.
+
+  n_iter: int
+  Maximum number of iterations for the optimization. Should be at
+  least 250.
   """

   def __init__( # type: ignore[no-untyped-def]
@@ -220,7 +224,7 @@ class TSNE(BaseTransformer):
   perplexity=30.0,
   early_exaggeration=12.0,
   learning_rate="auto",
-  n_iter=1000,
+  max_iter=None,
   n_iter_without_progress=300,
   min_grad_norm=1e-07,
   metric="euclidean",
@@ -231,6 +235,7 @@ class TSNE(BaseTransformer):
   method="barnes_hut",
   angle=0.5,
   n_jobs=None,
+  n_iter="deprecated",
   input_cols: Optional[Union[str, Iterable[str]]] = None,
   output_cols: Optional[Union[str, Iterable[str]]] = None,
   label_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -256,7 +261,7 @@ class TSNE(BaseTransformer):
   'perplexity':(perplexity, 30.0, False),
   'early_exaggeration':(early_exaggeration, 12.0, False),
   'learning_rate':(learning_rate, "auto", False),
-  'n_iter':(n_iter, 1000, False),
+  'max_iter':(max_iter, None, False),
   'n_iter_without_progress':(n_iter_without_progress, 300, False),
   'min_grad_norm':(min_grad_norm, 1e-07, False),
   'metric':(metric, "euclidean", False),
@@ -266,7 +271,8 @@ class TSNE(BaseTransformer):
   'random_state':(random_state, None, False),
   'method':(method, "barnes_hut", False),
   'angle':(angle, 0.5, False),
-  'n_jobs':(n_jobs, None, False),}
+  'n_jobs':(n_jobs, None, False),
+  'n_iter':(n_iter, "deprecated", False),}
   cleaned_up_init_args = validate_sklearn_args(
       args=init_args,
       klass=sklearn.manifold.TSNE
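
A hedged sketch of the TSNE rename above: max_iter replaces n_iter (matching scikit-learn 1.5 naming), and n_iter survives only as a "deprecated" sentinel. Column names are assumptions for illustration.

from snowflake.ml.modeling.manifold import TSNE

# 1.6.3 code: TSNE(n_iter=1000, ...) — in 1.7.0 that hits the deprecated alias.
tsne = TSNE(
    n_components=2,
    max_iter=1000,                # renamed from n_iter
    input_cols=["X1", "X2"],      # assumed column names
    output_cols=["EMB1", "EMB2"],
)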

snowflake/ml/modeling/metrics/classification.py
@@ -300,7 +300,7 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
   ]
   ),
   input_types=[T.ArrayType(), T.IntegerType()],
-  packages=["numpy", "cloudpickle"],
+  packages=[f"numpy=={np.__version__}", f"cloudpickle=={cloudpickle.__version__}"],
   name=confusion_matrix_computer,
   is_permanent=False,
   replace=True,
@@ -535,9 +535,8 @@ def log_loss(
   assumed to be that of the positive class. The labels in ``y_pred``
   are assumed to be ordered alphabetically, as done by `LabelBinarizer`.
   eps: float or "auto", default="auto"
-      Log loss is undefined for p=0 or p=1, so probabilities are
-      clipped to `max(eps, min(1 - eps, p))`. The default will depend on the
-      data type of `y_pred` and is set to `np.finfo(y_pred.dtype).eps`.
+      Deprecated: if specified, it will be ignored and a warning emitted. Retained
+      for backward compatibility.
   normalize: boolean, default=True
       If true, return the mean loss per sample.
       Otherwise, return the sum of the per-sample losses.
@@ -557,8 +556,11 @@ def log_loss(
   y_true = y_true_col_names if isinstance(y_true_col_names, list) else [y_true_col_names]
   y_pred = y_pred_col_names if isinstance(y_pred_col_names, list) else [y_pred_col_names]

+  if eps != "auto":
+      warnings.warn("log_loss eps argument is deprecated and will be ignored.", DeprecationWarning, stacklevel=2)
+
   # If it is binary classification, use SQL because it is faster.
-  if len(y_pred) == 1 and eps == "auto":
+  if len(y_pred) == 1:
       metrics_utils.check_label_columns(y_true_col_names, y_pred_col_names)
       eps = float(np.finfo(float).eps)
       y_true_col = y_true[0]
@@ -592,7 +594,6 @@ def log_loss(
   log_loss_computer = _register_log_loss_computer(
       session=session,
       statement_params=statement_params,
-      eps=eps,
       labels=labels,
   )
   log_loss_computer_udtf = F.table_function(log_loss_computer)
@@ -625,7 +626,6 @@ def _register_log_loss_computer(
   *,
   session: snowpark.Session,
   statement_params: Dict[str, Any],
-  eps: Union[float, str] = "auto",
   labels: Optional[npt.ArrayLike] = None,
   ) -> str:
   """Registers log loss computation UDTF in Snowflake and returns the name of the UDTF.
@@ -633,10 +633,6 @@ def _register_log_loss_computer(
   Args:
       session: Snowpark session.
       statement_params: Dictionary used for tagging queries for tracking purposes.
-      eps: float or "auto", default="auto"
-          Log loss is undefined for p=0 or p=1, so probabilities are
-          clipped to `max(eps, min(1 - eps, p))`. The default will depend on the
-          data type of `y_pred` and is set to `np.finfo(y_pred.dtype).eps`.
      labels: If not provided, labels will be inferred from y_true. If ``labels``
          is ``None`` and ``y_pred`` has shape (n_samples,) the labels are
          assumed to be binary and are inferred from ``y_true``.
@@ -647,7 +643,6 @@ def _register_log_loss_computer(

   class LogLossComputer:
       def __init__(self) -> None:
-          self._eps = eps
           self._labels = labels
           self._y_true: List[List[int]] = []
           self._y_pred: List[List[float]] = []
@@ -662,7 +657,6 @@ def _register_log_loss_computer(
   res = metrics.log_loss(
       self._y_true,
       self._y_pred,
-      eps=self._eps,
       normalize=False,
       sample_weight=self._sample_weight,
       labels=self._labels,
@@ -670,6 +664,7 @@ def _register_log_loss_computer(
   yield (float(res),)

   log_loss_computer = random_name_for_temp_object(TempObjectType.TABLE_FUNCTION)
+  sklearn_release = version.parse(sklearn.__version__).release
   session.udtf.register(
       LogLossComputer,
       output_schema=T.StructType(
@@ -677,7 +672,7 @@ def _register_log_loss_computer(
   T.StructField("log_loss", T.FloatType()),
   ]
   ),
-  packages=["scikit-learn<1.4"],
+  packages=[f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*"],
   name=log_loss_computer,
   is_permanent=False,
   replace=True,
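
A hedged usage sketch for the log_loss hunks above: eps is no longer forwarded to scikit-learn; any value other than "auto" now only emits a DeprecationWarning, and binary inputs take the SQL fast path unconditionally. The DataFrame and column names are assumptions for illustration.

from snowflake.ml.modeling.metrics import log_loss

# predictions_df: an assumed Snowpark DataFrame holding labels and
# predicted probabilities.
loss = log_loss(
    df=predictions_df,
    y_true_col_names="LABEL",
    y_pred_col_names="PREDICTED_PROBA",
    # eps=1e-15,  # deprecated in 1.7.0: ignored with a warning
)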

snowflake/ml/modeling/metrics/classification.py
@@ -814,7 +809,7 @@ def precision_recall_fscore_support(
   name=sproc_name,
   replace=True,
   packages=[
-      "cloudpickle",
+      f"cloudpickle=={cloudpickle.__version__}",
       f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
       "snowflake-snowpark-python",
   ],
@@ -1071,6 +1066,7 @@ def _register_multilabel_confusion_matrix_computer(
   yield (tp_sum, pred_sum, true_sum)

   multilabel_confusion_matrix_computer = random_name_for_temp_object(TempObjectType.TABLE_FUNCTION)
+  sklearn_release = version.parse(sklearn.__version__).release
   session.udtf.register(
       MultilabelConfusionMatrixComputer,
       output_schema=T.StructType(
@@ -1080,7 +1076,7 @@ def _register_multilabel_confusion_matrix_computer(
   T.StructField("TRUE_SUM", T.ArrayType()),
   ]
   ),
-  packages=["numpy", "scikit-learn<1.4"],
+  packages=[f"numpy=={np.__version__}", f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*"],
   name=multilabel_confusion_matrix_computer,
   is_permanent=False,
   replace=True,

snowflake/ml/modeling/metrics/ranking.py
@@ -96,7 +96,7 @@ def precision_recall_curve(
   name=sproc_name,
   replace=True,
   packages=[
-      "cloudpickle",
+      f"cloudpickle=={cloudpickle.__version__}",
       f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
       "snowflake-snowpark-python",
   ],
@@ -243,7 +243,7 @@ def roc_auc_score(
   name=sproc_name,
   replace=True,
   packages=[
-      "cloudpickle",
+      f"cloudpickle=={cloudpickle.__version__}",
       f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
       "snowflake-snowpark-python",
   ],
@@ -346,7 +346,7 @@ def roc_curve(
   name=sproc_name,
   replace=True,
   packages=[
-      "cloudpickle",
+      f"cloudpickle=={cloudpickle.__version__}",
       f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
       "snowflake-snowpark-python",
   ],

snowflake/ml/modeling/metrics/regression.py
@@ -81,7 +81,7 @@ def d2_absolute_error_score(
   name=sproc_name,
   replace=True,
   packages=[
-      "cloudpickle",
+      f"cloudpickle=={cloudpickle.__version__}",
       f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
       "snowflake-snowpark-python",
   ],
@@ -178,7 +178,7 @@ def d2_pinball_score(
   name=sproc_name,
   replace=True,
   packages=[
-      "cloudpickle",
+      f"cloudpickle=={cloudpickle.__version__}",
       f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
       "snowflake-snowpark-python",
   ],
@@ -293,7 +293,7 @@ def explained_variance_score(
   name=sproc_name,
   replace=True,
   packages=[
-      "cloudpickle",
+      f"cloudpickle=={cloudpickle.__version__}",
       f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
       "snowflake-snowpark-python",
   ],
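
The cloudpickle/numpy/scikit-learn pins added across the metrics hunks above all follow one pattern, sketched below: register server-side UDTFs and sprocs against the exact versions installed on the client, instead of floating specs like "cloudpickle" or "scikit-learn<1.4", so pickled objects are deserialized by the same versions that produced them. Only the packages construction is shown; the register call itself is elided.

import cloudpickle
import numpy as np
import sklearn
from packaging import version

# Pin scikit-learn to the client's major.minor release, and numpy and
# cloudpickle to the exact installed versions.
sklearn_release = version.parse(sklearn.__version__).release
packages = [
    f"numpy=={np.__version__}",
    f"cloudpickle=={cloudpickle.__version__}",
    f"scikit-learn=={sklearn_release[0]}.{sklearn_release[1]}.*",
    "snowflake-snowpark-python",
]
# session.udtf.register(..., packages=packages, ...) as in the hunks above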

snowflake/ml/modeling/naive_bayes/bernoulli_nb.py
@@ -113,7 +113,7 @@ class BernoulliNB(BaseTransformer):
   Additive (Laplace/Lidstone) smoothing parameter
   (set alpha=0 and force_alpha=True, for no smoothing).

-  force_alpha: bool, default=False
+  force_alpha: bool, default=True
   If False and alpha is less than 1e-10, it will set alpha to
   1e-10. If True, alpha will remain unchanged. This may cause
   numerical errors if alpha is too close to 0.
@@ -135,7 +135,7 @@ class BernoulliNB(BaseTransformer):
   self,
   *,
   alpha=1.0,
-  force_alpha="warn",
+  force_alpha=True,
   binarize=0.0,
   fit_prior=True,
   class_prior=None,
@@ -161,7 +161,7 @@ class BernoulliNB(BaseTransformer):
   self._deps = list(deps)

   init_args = {'alpha':(alpha, 1.0, False),
-      'force_alpha':(force_alpha, "warn", False),
+      'force_alpha':(force_alpha, True, False),
       'binarize':(binarize, 0.0, False),
       'fit_prior':(fit_prior, True, False),
       'class_prior':(class_prior, None, False),}

snowflake/ml/modeling/naive_bayes/categorical_nb.py
@@ -113,7 +113,7 @@ class CategoricalNB(BaseTransformer):
   Additive (Laplace/Lidstone) smoothing parameter
   (set alpha=0 and force_alpha=True, for no smoothing).

-  force_alpha: bool, default=False
+  force_alpha: bool, default=True
   If False and alpha is less than 1e-10, it will set alpha to
   1e-10. If True, alpha will remain unchanged. This may cause
   numerical errors if alpha is too close to 0.
@@ -141,7 +141,7 @@ class CategoricalNB(BaseTransformer):
   self,
   *,
   alpha=1.0,
-  force_alpha="warn",
+  force_alpha=True,
   fit_prior=True,
   class_prior=None,
   min_categories=None,
@@ -167,7 +167,7 @@ class CategoricalNB(BaseTransformer):
   self._deps = list(deps)

   init_args = {'alpha':(alpha, 1.0, False),
-      'force_alpha':(force_alpha, "warn", False),
+      'force_alpha':(force_alpha, True, False),
       'fit_prior':(fit_prior, True, False),
       'class_prior':(class_prior, None, False),
       'min_categories':(min_categories, None, False),}

snowflake/ml/modeling/naive_bayes/complement_nb.py
@@ -113,7 +113,7 @@ class ComplementNB(BaseTransformer):
   Additive (Laplace/Lidstone) smoothing parameter
   (set alpha=0 and force_alpha=True, for no smoothing).

-  force_alpha: bool, default=False
+  force_alpha: bool, default=True
   If False and alpha is less than 1e-10, it will set alpha to
   1e-10. If True, alpha will remain unchanged. This may cause
   numerical errors if alpha is too close to 0.
@@ -135,7 +135,7 @@ class ComplementNB(BaseTransformer):
   self,
   *,
   alpha=1.0,
-  force_alpha="warn",
+  force_alpha=True,
   fit_prior=True,
   class_prior=None,
   norm=False,
@@ -161,7 +161,7 @@ class ComplementNB(BaseTransformer):
   self._deps = list(deps)

   init_args = {'alpha':(alpha, 1.0, False),
-      'force_alpha':(force_alpha, "warn", False),
+      'force_alpha':(force_alpha, True, False),
       'fit_prior':(fit_prior, True, False),
       'class_prior':(class_prior, None, False),
       'norm':(norm, False, False),}

snowflake/ml/modeling/naive_bayes/multinomial_nb.py
@@ -113,7 +113,7 @@ class MultinomialNB(BaseTransformer):
   Additive (Laplace/Lidstone) smoothing parameter
   (set alpha=0 and force_alpha=True, for no smoothing).

-  force_alpha: bool, default=False
+  force_alpha: bool, default=True
   If False and alpha is less than 1e-10, it will set alpha to
   1e-10. If True, alpha will remain unchanged. This may cause
   numerical errors if alpha is too close to 0.
@@ -131,7 +131,7 @@ class MultinomialNB(BaseTransformer):
   self,
   *,
   alpha=1.0,
-  force_alpha="warn",
+  force_alpha=True,
   fit_prior=True,
   class_prior=None,
   input_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -156,7 +156,7 @@ class MultinomialNB(BaseTransformer):
   self._deps = list(deps)

   init_args = {'alpha':(alpha, 1.0, False),
-      'force_alpha':(force_alpha, "warn", False),
+      'force_alpha':(force_alpha, True, False),
       'fit_prior':(fit_prior, True, False),
       'class_prior':(class_prior, None, False),}
   cleaned_up_init_args = validate_sklearn_args(
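
A hedged migration sketch for the four naive Bayes hunks above: force_alpha now defaults to True (the scikit-learn 1.4 behavior) rather than the "warn" sentinel, so tiny alpha values are no longer clipped to 1e-10 unless you opt out. Column names are assumptions for illustration.

from snowflake.ml.modeling.naive_bayes import MultinomialNB

nb = MultinomialNB(
    alpha=1e-12,
    force_alpha=False,          # opt back into clipping alpha up to 1e-10
    input_cols=["F1", "F2"],    # assumed column names
    label_cols=["LABEL"],
    output_cols=["PREDICTION"],
)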

snowflake/ml/modeling/neighbors/k_neighbors_classifier.py
@@ -124,6 +124,11 @@ class KNeighborsClassifier(BaseTransformer):
   array of distances, and returns an array of the same shape
   containing the weights.

+  Refer to the example entitled
+  :ref:`sphx_glr_auto_examples_neighbors_plot_classification.py`
+  showing the impact of the `weights` parameter on the decision
+  boundary.
+
   algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
   Algorithm used to compute the nearest neighbors:

@@ -142,10 +147,11 @@ class KNeighborsClassifier(BaseTransformer):
   required to store the tree. The optimal value depends on the
   nature of the problem.

-  p: int, default=2
-  Power parameter for the Minkowski metric. When p = 1, this is
-  equivalent to using manhattan_distance (l1), and euclidean_distance
-  (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+  p: float, default=2
+  Power parameter for the Minkowski metric. When p = 1, this is equivalent
+  to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2.
+  For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected
+  to be positive.

   metric: str or callable, default='minkowski'
   Metric to use for distance computation. Default is "minkowski", which

snowflake/ml/modeling/neighbors/k_neighbors_regressor.py
@@ -144,12 +144,12 @@ class KNeighborsRegressor(BaseTransformer):
   required to store the tree. The optimal value depends on the
   nature of the problem.

-  p: int, default=2
+  p: float, default=2
   Power parameter for the Minkowski metric. When p = 1, this is
   equivalent to using manhattan_distance (l1), and euclidean_distance
   (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

-  metric: str or callable, default='minkowski'
+  metric: str, DistanceMetric object or callable, default='minkowski'
   Metric to use for distance computation. Default is "minkowski", which
   results in the standard Euclidean distance when p = 2. See the
   documentation of `scipy.spatial.distance
@@ -167,6 +167,9 @@ class KNeighborsRegressor(BaseTransformer):
   between those vectors. This works for Scipy's metrics, but is less
   efficient than passing the metric name as a string.

+  If metric is a DistanceMetric object, it will be passed directly to
+  the underlying computation routines.
+
   metric_params: dict, default=None
   Additional keyword arguments for the metric function.

snowflake/ml/modeling/neighbors/local_outlier_factor.py
@@ -150,9 +150,9 @@ class LocalOutlierFactor(BaseTransformer):
   between those vectors. This works for Scipy's metrics, but is less
   efficient than passing the metric name as a string.

-  p: int, default=2
+  p: float, default=2
   Parameter for the Minkowski metric from
-  :func:`sklearn.metrics.pairwise.pairwise_distances`. When p = 1, this
+  :func:`sklearn.metrics.pairwise_distances`. When p = 1, this
   is equivalent to using manhattan_distance (l1), and euclidean_distance
   (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

snowflake/ml/modeling/neighbors/nearest_centroid.py
@@ -109,20 +109,13 @@ class NearestCentroid(BaseTransformer):
   drop_input_cols: Optional[bool], default=False
   If set, the response of predict(), transform() methods will not contain input columns.

-  metric: str or callable, default="euclidean"
-  Metric to use for distance computation. See the documentation of
-  `scipy.spatial.distance
-  <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
-  the metrics listed in
-  :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
-  values. Note that "wminkowski", "seuclidean" and "mahalanobis" are not
-  supported.
-
-  The centroids for the samples corresponding to each class is
-  the point from which the sum of the distances (according to the metric)
-  of all samples that belong to that particular class are minimized.
-  If the `"manhattan"` metric is provided, this centroid is the median
-  and for all other metrics, the centroid is now set to be the mean.
+  metric: {"euclidean", "manhattan"}, default="euclidean"
+  Metric to use for distance computation.
+
+  If `metric="euclidean"`, the centroid for the samples corresponding to each
+  class is the arithmetic mean, which minimizes the sum of squared L1 distances.
+  If `metric="manhattan"`, the centroid is the feature-wise median, which
+  minimizes the sum of L1 distances.

   shrink_threshold: float, default=None
   Threshold for shrinking centroids to remove features.

snowflake/ml/modeling/neighbors/nearest_neighbors.py
@@ -152,7 +152,7 @@ class NearestNeighbors(BaseTransformer):
   between those vectors. This works for Scipy's metrics, but is less
   efficient than passing the metric name as a string.

-  p: float, default=2
+  p: float (positive), default=2
   Parameter for the Minkowski metric from
   sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
   equivalent to using manhattan_distance (l1), and euclidean_distance

snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py
@@ -145,10 +145,11 @@ class RadiusNeighborsClassifier(BaseTransformer):
   required to store the tree. The optimal value depends on the
   nature of the problem.

-  p: int, default=2
+  p: float, default=2
   Power parameter for the Minkowski metric. When p = 1, this is
   equivalent to using manhattan_distance (l1), and euclidean_distance
   (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+  This parameter is expected to be positive.

   metric: str or callable, default='minkowski'
   Metric to use for distance computation. Default is "minkowski", which
@@ -176,6 +177,10 @@ class RadiusNeighborsClassifier(BaseTransformer):
   - 'most_frequent': assign the most frequent label of y to outliers.
   - None: when any outlier is detected, ValueError will be raised.

+  The outlier label should be selected from among the unique 'Y' labels.
+  If it is specified with a different value a warning will be raised and
+  all class probabilities of outliers will be assigned to be 0.
+
   metric_params: dict, default=None
   Additional keyword arguments for the metric function.

snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py
@@ -145,7 +145,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
   required to store the tree. The optimal value depends on the
   nature of the problem.

-  p: int, default=2
+  p: float, default=2
   Power parameter for the Minkowski metric. When p = 1, this is
   equivalent to using manhattan_distance (l1), and euclidean_distance
   (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

snowflake/ml/modeling/neural_network/mlp_classifier.py
@@ -138,6 +138,9 @@ class MLPClassifier(BaseTransformer):
   - 'adam' refers to a stochastic gradient-based optimizer proposed
   by Kingma, Diederik, and Jimmy Ba

+  For a comparison between Adam optimizer and SGD, see
+  :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py`.
+
   Note: The default solver 'adam' works pretty well on relatively
   large datasets (with thousands of training samples or more) in terms of
   both training time and validation score.
@@ -148,6 +151,9 @@ class MLPClassifier(BaseTransformer):
   Strength of the L2 regularization term. The L2 regularization term
   is divided by the sample size when added to the loss.

+  For an example usage and visualization of varying regularization, see
+  :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_alpha.py`.
+
   batch_size: int, default='auto'
   Size of minibatches for stochastic optimizers.
   If the solver is 'lbfgs', the classifier will not use minibatch.
@@ -224,7 +230,7 @@ class MLPClassifier(BaseTransformer):
   Whether to use early stopping to terminate training when validation
   score is not improving. If set to true, it will automatically set
   aside 10% of training data as validation and terminate training when
-  validation score is not improving by at least tol for
+  validation score is not improving by at least ``tol`` for
   ``n_iter_no_change`` consecutive epochs. The split is stratified,
   except in a multilabel setting.
   If early stopping is False, then the training stops when the training

snowflake/ml/modeling/neural_network/mlp_regressor.py
@@ -138,6 +138,9 @@ class MLPRegressor(BaseTransformer):
   - 'adam' refers to a stochastic gradient-based optimizer proposed by
   Kingma, Diederik, and Jimmy Ba

+  For a comparison between Adam optimizer and SGD, see
+  :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py`.
+
   Note: The default solver 'adam' works pretty well on relatively
   large datasets (with thousands of training samples or more) in terms of
   both training time and validation score.

snowflake/ml/modeling/pipeline/pipeline.py
@@ -863,21 +863,23 @@ class Pipeline(base.BaseTransformer):
   ct.sparse_output_ = False

   # ColumnTransformer internally replaces the "passthrough" string in the "remainder" step with a
-  # fitted FunctionTransformer, saved in the _name_to_fitted_passthrough dict, during the transform()
-  # call. So we need to populate _name_to_fitted_passthrough dict with fitted FunctionTransformer so
-  # that the replacements works correctly during the transform() call.
-  ft = FunctionTransformer(
-      accept_sparse=True,
-      check_inverse=False,
-      feature_names_out="one-to-one",
-  )
+  # fitted FunctionTransformer during the fit() call. So we need to manually replace the "passthrough"
+  # string with a fitted FunctionTransformer
+  for i, (step, transform, indices) in enumerate(ct.transformers_):
+      if transform == "passthrough":
+          ft = FunctionTransformer(
+              accept_sparse=True,
+              check_inverse=False,
+              feature_names_out="one-to-one",
+          )
+          if step == "remainder":
+              ft.feature_names_in_ = remaining
+              ft.n_features_in_ = len(remaining)
+          else:
+              ft.feature_names_in_ = self._feature_names_in[step_index_in_pipeline]
+              ft.n_features_in_ = self._n_features_in[step_index_in_pipeline]
+          ct.transformers_[i] = (step, ft, indices)

-  if remainder_action == "passthrough":
-      ft.n_features_in_ = len(remaining)
-      ct._name_to_fitted_passthrough = {"remainder": ft}
-  elif step_transformer_obj == "passthrough":
-      ft.n_features_in_ = self._n_features_in[step_index_in_pipeline]
-      ct._name_to_fitted_passthrough = {step_name_in_ct: ft}
   return ct

   def _fit_ml_runtime(self, dataset: snowpark.DataFrame) -> None:
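
A standalone sketch of the scikit-learn behavior the rewritten Pipeline block above works around (toy data assumed; this is plain scikit-learn, not snowflake-ml code): per the new comment in the hunk, fit() materializes the "passthrough" remainder inside ColumnTransformer.transformers_, so a ColumnTransformer assembled from pre-fitted snowflake-ml steps must have the string replaced with a fitted FunctionTransformer by hand.

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"A": [1.0, 2.0], "B": [3.0, 4.0]})
ct = ColumnTransformer([("scale", StandardScaler(), ["A"])], remainder="passthrough")
ct.fit(df)
# After fit(), transformers_ lists every step in fitted form, including the
# remainder; the loop in the hunk above reproduces this state manually for
# steps that scikit-learn itself never fitted.
print(ct.transformers_)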

snowflake/ml/modeling/preprocessing/one_hot_encoder.py
@@ -9,12 +9,12 @@ import pandas as pd
   import sklearn
   from packaging import version
   from scipy import sparse
-  from sklearn import preprocessing, utils as sklearn_utils
+  from sklearn import preprocessing

   from snowflake import snowpark
   from snowflake.ml._internal import telemetry, type_utils
   from snowflake.ml._internal.exceptions import error_codes, exceptions
-  from snowflake.ml._internal.utils import identifier
+  from snowflake.ml._internal.utils import identifier, import_utils
   from snowflake.ml.model import model_signature
   from snowflake.ml.modeling.framework import _utils, base
   from snowflake.snowpark import functions as F, types as T
@@ -24,6 +24,10 @@ from snowflake.snowpark._internal.utils import (
   random_name_for_temp_object,
   )

+  is_scalar_nan = import_utils.import_with_fallbacks(
+      "sklearn.utils.is_scalar_nan", "sklearn.utils._missing.is_scalar_nan"
+  )
+
   _INFREQUENT_CATEGORY = "_INFREQUENT"
   _COLUMN_NAME = "_COLUMN_NAME"
   _CATEGORY = "_CATEGORY"
@@ -1293,7 +1297,7 @@ class OneHotEncoder(base.BaseTransformer):
   missing_drops = []
   drop_indices = []
   for feature_idx, (drop_val, cat_list) in enumerate(zip(drop_array, self._categories_list)):
-      if not sklearn_utils.is_scalar_nan(drop_val):
+      if not is_scalar_nan(drop_val):
           drop_idx = np.where(cat_list == drop_val)[0]
           if drop_idx.size: # found drop idx
               drop_indices.append(self._map_drop_idx_to_infrequent(feature_idx, drop_idx[0]))
@@ -1303,7 +1307,7 @@ class OneHotEncoder(base.BaseTransformer):

   # drop_val is nan, find nan in categories manually
   for cat_idx, cat in enumerate(cat_list):
-      if sklearn_utils.is_scalar_nan(cat):
+      if is_scalar_nan(cat):
           drop_indices.append(self._map_drop_idx_to_infrequent(feature_idx, cat_idx))
           break
   else: # loop did not break thus drop is missing
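
For context on import_with_fallbacks (added in file #2 of the list and used in the hunks above): a plausible reading is that it tries each dotted path in order and returns the first attribute that resolves, which is how is_scalar_nan keeps working across scikit-learn versions that moved it. A hedged sketch of such a helper, not the actual implementation:

import importlib

def import_with_fallbacks(*paths):
    """Return the first attribute that resolves among the dotted paths.

    Hypothetical stand-in for snowflake.ml._internal.utils.import_utils;
    the real helper's signature and behavior may differ.
    """
    for path in paths:
        module_name, _, attr_name = path.rpartition(".")
        try:
            return getattr(importlib.import_module(module_name), attr_name)
        except (ImportError, AttributeError):
            continue
    raise ImportError(f"Could not import any of: {paths}")

# Mirrors the one_hot_encoder.py usage above:
is_scalar_nan = import_with_fallbacks(
    "sklearn.utils.is_scalar_nan",           # public path in older sklearn
    "sklearn.utils._missing.is_scalar_nan",  # private home in newer sklearn
)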