PyPI - snowflake-ml-python - Versions diffs - 1.7.2__py3-none-any.whl → 1.7.4__py3-none-any.whl - Mend

snowflake-ml-python 1.7.2__py3-none-any.whl → 1.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (237) hide show

snowflake/ml/modeling/xgboost/xgbrf_classifier.py CHANGED Viewed

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "xgboost".replace("sklearn.", "")
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
+INFER_SIGNATURE_MAX_ROWS = 100
 class XGBRFClassifier(BaseTransformer):
     r"""scikit-learn API for XGBoost random forest classification
     For more details on this class, see [xgboost.XGBRFClassifier]
@@ -125,111 +128,171 @@ class XGBRFClassifier(BaseTransformer):
         can seriously hurt performance in gradient boosting. Set the batch_size as large as possible
         based on the available memory.
-    n_estimators: int
+    n_estimators: Optional[int]
             Number of trees in random forest to fit.
-        max_depth:  Optional[int]
+        max_depth:  typing.Optional[int]
             Maximum tree depth for base learners.
-        max_leaves :
+        max_leaves: typing.Optional[int]
             Maximum number of leaves; 0 indicates no limit.
-        max_bin :
+        max_bin: typing.Optional[int]
             If using histogram-based algorithm, maximum number of bins per feature
-        grow_policy :
-            Tree growing policy. 0: favor splitting at nodes closest to the node, i.e. grow
-            depth-wise. 1: favor splitting at nodes with highest loss change.
-        learning_rate: Optional[float]
+        grow_policy: typing.Optional[str]
+            Tree growing policy.
+            - depthwise: Favors splitting at nodes closest to the node,
+            - lossguide: Favors splitting at nodes with highest loss change.
+        learning_rate: typing.Optional[float]
             Boosting learning rate (xgb's "eta")
-        verbosity: Optional[int]
+        verbosity: typing.Optional[int]
             The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
-        objective: typing.Union[str, typing.Callable[[numpy.ndarray, numpy.ndarray], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType]
-            Specify the learning task and the corresponding learning objective or
-            a custom objective function to be used (see note below).
-        booster: Optional[str]
-            Specify which booster to use: gbtree, gblinear or dart.
-        tree_method: Optional[str]
+        objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType]
+            Specify the learning task and the corresponding learning objective or a custom
+            objective function to be used.
+            For custom objective, see :doc:`/tutorials/custom_metric_obj` and
+            :ref:`custom-obj-metric` for more information, along with the end note for
+            function signatures.
+        booster: typing.Optional[str]
+            Specify which booster to use: ``gbtree``, ``gblinear`` or ``dart``.
+        tree_method: typing.Optional[str]
             Specify which tree method to use.  Default to auto.  If this parameter is set to
             default, XGBoost will choose the most conservative option available.  It's
             recommended to study this option from the parameters document :doc:`tree method
             </treemethod>`
-        n_jobs: Optional[int]
+        n_jobs: typing.Optional[int]
             Number of parallel threads used to run xgboost.  When used with other
             Scikit-Learn algorithms like grid search, you may choose which algorithm to
             parallelize and balance the threads.  Creating thread contention will
             significantly slow down both algorithms.
-        gamma: Optional[float]
-            (min_split_loss) Minimum loss reduction required to make a further partition on a
-            leaf node of the tree.
-        min_child_weight: Optional[float]
+        gamma: typing.Optional[float]
+            (min_split_loss) Minimum loss reduction required to make a further partition on
+            a leaf node of the tree.
+        min_child_weight: typing.Optional[float]
             Minimum sum of instance weight(hessian) needed in a child.
-        max_delta_step: Optional[float]
+        max_delta_step: typing.Optional[float]
             Maximum delta step we allow each tree's weight estimation to be.
-        subsample: Optional[float]
+        subsample: typing.Optional[float]
             Subsample ratio of the training instance.
-        sampling_method :
-            Sampling method. Used only by `gpu_hist` tree method.
-              - `uniform`: select random training instances uniformly.
-              - `gradient_based` select random training instances with higher probability when
-                the gradient and hessian are larger. (cf. CatBoost)
-        colsample_bytree: Optional[float]
+        sampling_method: typing.Optional[str]
+            Sampling method. Used only by the GPU version of ``hist`` tree method.
+            - ``uniform``: Select random training instances uniformly.
+            - ``gradient_based``: Select random training instances with higher probability
+                when the gradient and hessian are larger. (cf. CatBoost)
+        colsample_bytree: typing.Optional[float]
             Subsample ratio of columns when constructing each tree.
-        colsample_bylevel: Optional[float]
+        colsample_bylevel: typing.Optional[float]
             Subsample ratio of columns for each level.
-        colsample_bynode: Optional[float]
+        colsample_bynode: typing.Optional[float]
             Subsample ratio of columns for each split.
-        reg_alpha: Optional[float]
+        reg_alpha: typing.Optional[float]
             L1 regularization term on weights (xgb's alpha).
-        reg_lambda: Optional[float]
+        reg_lambda: typing.Optional[float]
             L2 regularization term on weights (xgb's lambda).
-        scale_pos_weight: Optional[float]
+        scale_pos_weight: typing.Optional[float]
             Balancing of positive and negative weights.
-        base_score: Optional[float]
+        base_score: typing.Optional[float]
             The initial prediction score of all instances, global bias.
-        random_state: Optional[Union[numpy.random.RandomState, int]]
+        random_state: typing.Union[numpy.random.mtrand.RandomState, numpy.random._generator.Generator, int, NoneType]
             Random number seed.
                Using gblinear booster with shotgun updater is nondeterministic as
                it uses Hogwild algorithm.
-        missing: float, default np.nan
-            Value in the data which needs to be present as a missing value.
-        num_parallel_tree: Optional[int]
+        missing: float
+            Value in the data which needs to be present as a missing value. Default to
+            :py:data:`numpy.nan`.
+        num_parallel_tree: typing.Optional[int]
             Used for boosting random forest.
-        monotone_constraints: Optional[Union[Dict[str, int], str]]
+        monotone_constraints: typing.Union[typing.Dict[str, int], str, NoneType]
             Constraint of variable monotonicity.  See :doc:`tutorial </tutorials/monotonic>`
             for more information.
-        interaction_constraints: Optional[Union[str, List[Tuple[str]]]]
+        interaction_constraints: typing.Union[str, typing.List[typing.Tuple[str]], NoneType]
             Constraints for interaction representing permitted interactions.  The
             constraints must be specified in the form of a nested list, e.g. ``[[0, 1], [2,
             3, 4]]``, where each inner list is a group of indices of features that are
             allowed to interact with each other.  See :doc:`tutorial
             </tutorials/feature_interaction_constraint>` for more information
-        importance_type: Optional[str]
+        importance_type: typing.Optional[str]
             The feature importance type for the feature_importances\_ property:
             * For tree model, it's either "gain", "weight", "cover", "total_gain" or
               "total_cover".
-            * For linear model, only "weight" is defined and it's the normalized coefficients
-              without bias.
+            * For linear model, only "weight" is defined and it's the normalized
+              coefficients without bias.
+        device: typing.Optional[str]
+            Device ordinal, available options are `cpu`, `cuda`, and `gpu`.
+        validate_parameters: typing.Optional[bool]
-        gpu_id: Optional[int]
-            Device ordinal.
-        validate_parameters: Optional[bool]
             Give warnings for unknown parameter.
-        predictor: Optional[str]
-            Force XGBoost to use specific predictor, available choices are [cpu_predictor,
-            gpu_predictor].
         enable_categorical: bool
-            Experimental support for categorical data.  When enabled, cudf/pandas.DataFrame
-            should be used to specify categorical data type.  Also, JSON/UBJSON
-            serialization format is required.
+            See the same parameter of :py:class:`DMatrix` for details.
-        feature_types: FeatureTypes
+        feature_types: typing.Optional[typing.Sequence[str]]
             Used for specifying feature types without constructing a dataframe. See
             :py:class:`DMatrix` for details.
-        max_cat_to_onehot: Optional[int]
+        max_cat_to_onehot: typing.Optional[int]
             A threshold for deciding whether XGBoost should use one-hot encoding based split
             for categorical data.  When number of categories is lesser than the threshold
@@ -238,36 +301,41 @@ class XGBRFClassifier(BaseTransformer):
             categorical feature support. See :doc:`Categorical Data
             </tutorials/categorical>` and :ref:`cat-param` for details.
-        max_cat_threshold: Optional[int]
+        max_cat_threshold: typing.Optional[int]
             Maximum number of categories considered for each split. Used only by
             partition-based splits for preventing over-fitting. Also, `enable_categorical`
             needs to be set to have categorical feature support. See :doc:`Categorical Data
             </tutorials/categorical>` and :ref:`cat-param` for details.
-        eval_metric: Optional[Union[str, List[str], Callable]]
+        multi_strategy: typing.Optional[str]
+            The strategy used for training multi-target models, including multi-target
+            regression and multi-class classification. See :doc:`/tutorials/multioutput` for
+            more information.
+            - ``one_output_per_tree``: One model for each target.
+            - ``multi_output_tree``:  Use multi-target trees.
+        eval_metric: typing.Union[str, typing.List[str], typing.Callable, NoneType]
             Metric used for monitoring the training result and early stopping.  It can be a
             string or list of strings as names of predefined metric in XGBoost (See
-            doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any other
-            user defined metric that looks like `sklearn.metrics`.
+            doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any
+            other user defined metric that looks like `sklearn.metrics`.
             If custom objective is also provided, then custom metric should implement the
             corresponding reverse link function.
             Unlike the `scoring` parameter commonly used in scikit-learn, when a callable
-            object is provided, it's assumed to be a cost function and by default XGBoost will
-            minimize the result during early stopping.
-            For advanced usage on Early stopping like directly choosing to maximize instead of
-            minimize, see :py:obj:`xgboost.callback.EarlyStopping`.
+            object is provided, it's assumed to be a cost function and by default XGBoost
+            will minimize the result during early stopping.
-            See :doc:`Custom Objective and Evaluation Metric </tutorials/custom_metric_obj>`
-            for more.
+            For advanced usage on Early stopping like directly choosing to maximize instead
+            of minimize, see :py:obj:`xgboost.callback.EarlyStopping`.
-                 This parameter replaces `eval_metric` in :py:meth:`fit` method.  The old one
-                 receives un-transformed prediction regardless of whether custom objective is
-                 being used.
+            See :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more
+            information.
                 from sklearn.datasets import load_diabetes
                 from sklearn.metrics import mean_absolute_error
@@ -278,24 +346,29 @@ class XGBRFClassifier(BaseTransformer):
                 )
                 reg.fit(X, y, eval_set=[(X, y)])
-        early_stopping_rounds: Optional[int]
+        early_stopping_rounds: typing.Optional[int]
-            Activates early stopping. Validation metric needs to improve at least once in
-            every **early_stopping_rounds** round(s) to continue training.  Requires at least
-            one item in **eval_set** in :py:meth:`fit`.
+            - Activates early stopping. Validation metric needs to improve at least once in
+              every **early_stopping_rounds** round(s) to continue training.  Requires at
+              least one item in **eval_set** in :py:meth:`fit`.
-            The method returns the model from the last iteration (not the best one).  If
-            there's more than one item in **eval_set**, the last entry will be used for early
-            stopping.  If there's more than one metric in **eval_metric**, the last metric
-            will be used for early stopping.
+            - If early stopping occurs, the model will have two additional attributes:
+              :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the
+              :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal
+              number of trees during inference. If users want to access the full model
+              (including trees built after early stopping), they can specify the
+              `iteration_range` in these inference methods. In addition, other utilities
+              like model plotting can also use the entire model.
-            If early stopping occurs, the model will have three additional fields:
-            :py:attr:`best_score`, :py:attr:`best_iteration` and
-            :py:attr:`best_ntree_limit`.
+            - If you prefer to discard the trees after `best_iteration`, consider using the
+              callback function :py:class:`xgboost.callback.EarlyStopping`.
-                This parameter replaces `early_stopping_rounds` in :py:meth:`fit` method.
+            - If there's more than one item in **eval_set**, the last entry will be used for
+              early stopping.  If there's more than one metric in **eval_metric**, the last
+              metric will be used for early stopping.
+        callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]]
-        callbacks: Optional[List[TrainingCallback]]
             List of callback functions that are applied at end of each iteration.
             It is possible to use predefined callbacks by using
             :ref:`Callback API <callback_api>`.
@@ -307,9 +380,11 @@ class XGBRFClassifier(BaseTransformer):
                 for params in parameters_grid:
                     # be sure to (re)initialize the callbacks before each run
                     callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]
-                    xgboost.train(params, Xy, callbacks=callbacks)
+                    reg = xgboost.XGBRegressor(**params, callbacks=callbacks)
+                    reg.fit(X, y)
+        kwargs: typing.Optional[typing.Any]
-        kwargs: dict, optional
             Keyword arguments for XGBoost Booster object.  Full documentation of parameters
             can be found :doc:`here </parameter>`.
             Attempting to set a parameter via the constructor args and \*\*kwargs
@@ -320,13 +395,16 @@ class XGBRFClassifier(BaseTransformer):
                 with scikit-learn.
                 A custom objective function can be provided for the ``objective``
-                parameter. In this case, it should have the signature
-                ``objective(y_true, y_pred) -> grad, hess``:
+                parameter. In this case, it should have the signature ``objective(y_true,
+                y_pred) -> [grad, hess]`` or ``objective(y_true, y_pred, *, sample_weight)
+                -> [grad, hess]``:
                 y_true: array_like of shape [n_samples]
                     The target values
                 y_pred: array_like of shape [n_samples]
                     The predicted values
+                sample_weight :
+                    Optional sample weights.
                 grad: array_like of shape [n_samples]
                     The value of the gradient for each sample point.
@@ -632,7 +710,7 @@ class XGBRFClassifier(BaseTransformer):
                 elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
                     expected_dtype = "array"
                 else:
-                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
                     # We can only infer the output types from the input types if the following two statements are true:
                     # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
                     # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1293,7 +1371,7 @@ class XGBRFClassifier(BaseTransformer):
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1301,7 +1379,7 @@ class XGBRFClassifier(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(

snowflake-ml-python 1.7.2__py3-none-any.whl → 1.7.4__py3-none-any.whl

snowflake-ml-python 1.7.2__py3-none-any.whl → 1.7.4__py3-none-any.whl