bartz 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bartz/BART.py CHANGED
@@ -22,25 +22,73 @@
22
22
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
23
  # SOFTWARE.
24
24
 
25
- """Implement a user interface that mimics the R BART package."""
25
+ """Implement a class `gbart` that mimics the R BART package."""
26
26
 
27
- import functools
28
27
  import math
29
- from typing import Any, Literal
28
+ from collections.abc import Sequence
29
+ from functools import cached_property
30
+ from typing import Any, Literal, Protocol
30
31
 
31
32
  import jax
32
33
  import jax.numpy as jnp
33
- from jax.scipy.special import ndtri
34
- from jaxtyping import Array, Bool, Float, Float32
35
-
36
- from . import grove, jaxext, mcmcloop, mcmcstep, prepcovars
34
+ from equinox import Module, field
35
+ from jax.scipy.special import ndtr
36
+ from jaxtyping import (
37
+ Array,
38
+ Bool,
39
+ Float,
40
+ Float32,
41
+ Int32,
42
+ Integer,
43
+ Key,
44
+ Real,
45
+ Shaped,
46
+ UInt,
47
+ )
48
+ from numpy import ndarray
49
+
50
+ from bartz import mcmcloop, mcmcstep, prepcovars
51
+ from bartz.jaxext.scipy.special import ndtri
52
+ from bartz.jaxext.scipy.stats import invgamma
37
53
 
38
54
  FloatLike = float | Float[Any, '']
39
55
 
40
56
 
41
- class gbart:
57
+ class DataFrame(Protocol):
58
+ """DataFrame duck-type for `gbart`.
59
+
60
+ Attributes
61
+ ----------
62
+ columns : Sequence[str]
63
+ The names of the columns.
42
64
  """
43
- Nonparametric regression with Bayesian Additive Regression Trees (BART).
65
+
66
+ columns: Sequence[str]
67
+
68
+ def to_numpy(self) -> ndarray:
69
+ """Convert the dataframe to a 2d numpy array with columns on the second axis."""
70
+ ...
71
+
72
+
73
+ class Series(Protocol):
74
+ """Series duck-type for `gbart`.
75
+
76
+ Attributes
77
+ ----------
78
+ name : str | None
79
+ The name of the series.
80
+ """
81
+
82
+ name: str | None
83
+
84
+ def to_numpy(self) -> ndarray:
85
+ """Convert the series to a 1d numpy array."""
86
+ ...
87
+
88
+
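For illustration, the two Protocol classes above are purely structural: any object exposing the listed attributes and a `to_numpy` method is accepted, with no inheritance required. A minimal stand-in (hypothetical, not part of the package; pandas objects satisfy the same protocol):

    import numpy as np

    class TinyFrame:
        """Toy object satisfying the DataFrame protocol above."""

        def __init__(self, data: dict[str, list[float]]):
            self.columns = list(data)  # Sequence[str], as the protocol requires
            self._data = data

        def to_numpy(self) -> np.ndarray:
            # 2d array with columns on the second axis
            return np.column_stack([self._data[c] for c in self.columns])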
89
+ class gbart(Module):
90
+ R"""
91
+ Nonparametric regression with Bayesian Additive Regression Trees (BART) [2]_.
44
92
 
45
93
  Regress `y_train` on `x_train` with a latent mean function represented as
46
94
  a sum of decision trees. The inference is carried out by sampling the
@@ -48,36 +96,79 @@ class gbart:
48
96
 
49
97
  Parameters
50
98
  ----------
51
- x_train : array (p, n) or DataFrame
99
+ x_train
52
100
  The training predictors.
53
- y_train : array (n,) or Series
101
+ y_train
54
102
  The training responses.
55
- x_test : array (p, m) or DataFrame, optional
103
+ x_test
56
104
  The test predictors.
57
105
  type
58
106
  The type of regression. 'wbart' for continuous regression, 'pbart' for
59
107
  binary regression with probit link.
60
- usequants : bool, default False
108
+ sparse
109
+ Whether to activate variable selection on the predictors as done in
110
+ [1]_.
111
+ theta
112
+ a
113
+ b
114
+ rho
115
+ Hyperparameters of the sparsity prior used for variable selection.
116
+
117
+ The prior distribution on the choice of predictor for each decision rule
118
+ is
119
+
120
+ .. math::
121
+ (s_1, \ldots, s_p) \sim
122
+ \operatorname{Dirichlet}(\mathtt{theta}/p, \ldots, \mathtt{theta}/p).
123
+
124
+ If `theta` is not specified, it's a priori distributed according to
125
+
126
+ .. math::
127
+ \frac{\mathtt{theta}}{\mathtt{theta} + \mathtt{rho}} \sim
128
+ \operatorname{Beta}(\mathtt{a}, \mathtt{b}).
129
+
130
+ If not specified, `rho` is set to the number of predictors p. To tune
131
+ the prior, consider setting a lower `rho` to prefer more sparsity.
132
+ If setting `theta` directly, it should be in the ballpark of p or lower
133
+ as well.
134
+ xinfo
135
+ A matrix with the cutpoints to use to bin each predictor. If not
136
+ specified, it is generated automatically according to `usequants` and
137
+ `numcut`.
138
+
139
+ Each row shall contain a sorted list of cutpoints for a predictor. If
140
+ there are fewer cutpoints than the number of columns in the matrix,
141
+ fill the remaining cells with NaN.
142
+
143
+ `xinfo` shall be a matrix even if `x_train` is a dataframe.
144
+ usequants
61
145
  Whether to use predictors quantiles instead of a uniform grid to bin
62
- predictors.
63
- sigest : float, optional
146
+ predictors. Ignored if `xinfo` is specified.
147
+ rm_const
148
+ How to treat predictors with no associated decision rules (i.e., there
149
+ are no available cutpoints for that predictor). If `True` (default),
150
+ they are ignored. If `False`, an error is raised if there are any. If
151
+ `None`, no check is performed, and the output of the MCMC may not make
152
+ sense if there are predictors without cutpoints. The option `None` is
153
+ provided only to allow jax tracing.
154
+ sigest
64
155
  An estimate of the residual standard deviation on `y_train`, used to set
65
156
  `lamda`. If not specified, it is estimated by linear regression (with
66
157
  intercept, and without taking into account `w`). If `y_train` has less
67
158
  than two elements, it is set to 1. If n <= p, it is set to the standard
68
159
  deviation of `y_train`. Ignored if `lamda` is specified.
69
- sigdf : int, default 3
160
+ sigdf
70
161
  The degrees of freedom of the scaled inverse-chisquared prior on the
71
162
  noise variance.
72
- sigquant : float, default 0.9
163
+ sigquant
73
164
  The quantile of the prior on the noise variance that shall match
74
165
  `sigest` to set the scale of the prior. Ignored if `lamda` is specified.
75
- k : float, default 2
166
+ k
76
167
  The inverse scale of the prior standard deviation on the latent mean
77
168
  function, relative to half the observed range of `y_train`. If `y_train`
78
169
  has less than two elements, `k` is ignored and the scale is set to 1.
79
- power : float, default 2
80
- base : float, default 0.95
170
+ power
171
+ base
81
172
  Parameters of the prior on tree node generation. The probability that a
82
173
  node at depth `d` (0-based) is non-terminal is ``base / (1 + d) **
83
174
  power``.
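As an illustration of the `sparse` and `xinfo` options documented above, a sketch of assembling a NaN-padded cutpoint matrix and enabling variable selection; the cutpoint values are invented, only the keyword names come from the parameter list:

    import numpy as np
    from bartz.BART import gbart

    # hypothetical cutpoints: 3 for predictor 0, 1 for predictor 1
    cuts = [[0.25, 0.5, 0.75], [0.0]]
    width = max(len(c) for c in cuts)
    xinfo = np.full((len(cuts), width), np.nan)
    for i, c in enumerate(cuts):
        xinfo[i, : len(c)] = sorted(c)

    # usequants and numcut are ignored when xinfo is given;
    # sparse=True activates the Dirichlet prior on predictor choice
    # fit = gbart(x_train, y_train, xinfo=xinfo, sparse=True)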
@@ -94,16 +185,19 @@ class gbart:
94
185
  The prior mean of the latent mean function. If not specified, it is set
95
186
  to the mean of `y_train` for continuous regression, and to
96
187
  ``Phi^-1(mean(y_train))`` for binary regression. If `y_train` is empty,
97
- `offset` is set to 0.
98
- w : array (n,), optional
188
+ `offset` is set to 0. With binary regression, if `y_train` is all
189
+ `False` or `True`, it is set to ``Phi^-1(1/(n+1))`` or
190
+ ``Phi^-1(n/(n+1))``, respectively.
191
+ w
99
192
  Coefficients that rescale the error standard deviation on each
100
193
  datapoint. Not specifying `w` is equivalent to setting it to 1 for all
101
194
  datapoints. Note: `w` is ignored in the automatic determination of
102
195
  `sigest`, so either the weights should be O(1), or `sigest` should be
103
196
  specified by the user.
104
- ntree : int, default 200
105
- The number of trees used to represent the latent mean function.
106
- numcut : int, default 255
197
+ ntree
198
+ The number of trees used to represent the latent mean function. By
199
+ default 200 for continuous regression and 50 for binary regression.
200
+ numcut
107
201
  If `usequants` is `False`: the exact number of cutpoints used to bin the
108
202
  predictors, ranging between the minimum and maximum observed values
109
203
  (excluded).
@@ -116,14 +210,17 @@ class gbart:
116
210
 
117
211
  Before running the algorithm, the predictors are compressed to the
118
212
  smallest integer type that fits the bin indices, so `numcut` is best set
119
- to the maximum value of an unsigned integer type.
120
- ndpost : int, default 1000
213
+ to the maximum value of an unsigned integer type, like 255.
214
+
215
+ Ignored if `xinfo` is specified.
216
+ ndpost
121
217
  The number of MCMC samples to save, after burn-in.
122
- nskip : int, default 100
218
+ nskip
123
219
  The number of initial MCMC samples to discard as burn-in.
124
- keepevery : int, default 1
125
- The thinning factor for the MCMC samples, after burn-in.
126
- printevery : int or None, default 100
220
+ keepevery
221
+ The thinning factor for the MCMC samples, after burn-in. By default, 1
222
+ for continuous regression and 10 for binary regression.
223
+ printevery
127
224
  The number of iterations (including thinned-away ones) between each log
128
225
  line. Set to `None` to disable logging.
129
226
 
@@ -132,34 +229,24 @@ class gbart:
132
229
  iterations is a multiple of `printevery`, so if ``nskip + keepevery *
133
230
  ndpost`` is not a multiple of `printevery`, some of the last iterations
134
231
  will not be saved.
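A worked example of this bookkeeping with the defaults documented above (numbers purely illustrative):

    nskip, keepevery, ndpost, printevery = 100, 1, 1000, 100
    total = nskip + keepevery * ndpost         # 1100 iterations requested
    kept = (total // printevery) * printevery  # 1100, already a multiple of 100
    # with ndpost=1005 the request would be 1105 iterations, and the
    # last 1105 - 1100 = 5 would not be saved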
135
- seed : int or jax random key, default 0
232
+ seed
136
233
  The seed for the random number generator.
137
- maxdepth : int, default 6
234
+ maxdepth
138
235
  The maximum depth of the trees. This is 1-based, so with the default
139
236
  ``maxdepth=6``, the depths of the levels range from 0 to 5.
140
- init_kw : dict
141
- Additional arguments passed to `mcmcstep.init`.
142
- run_mcmc_kw : dict
143
- Additional arguments passed to `mcmcloop.run_mcmc`.
237
+ init_kw
238
+ Additional arguments passed to `bartz.mcmcstep.init`.
239
+ run_mcmc_kw
240
+ Additional arguments passed to `bartz.mcmcloop.run_mcmc`.
144
241
 
145
242
  Attributes
146
243
  ----------
147
- yhat_train : array (ndpost, n)
148
- The conditional posterior mean at `x_train` for each MCMC iteration.
149
- yhat_train_mean : array (n,)
150
- The marginal posterior mean at `x_train`.
151
- yhat_test : array (ndpost, m)
152
- The conditional posterior mean at `x_test` for each MCMC iteration.
153
- yhat_test_mean : array (m,)
154
- The marginal posterior mean at `x_test`.
155
- sigma : array (ndpost,)
156
- The standard deviation of the error.
157
- first_sigma : array (nskip,)
158
- The standard deviation of the error in the burn-in phase.
159
- offset : float
244
+ offset : Float32[Array, '']
160
245
  The prior mean of the latent mean function.
161
- sigest : float or None
246
+ sigest : Float32[Array, ''] | None
162
247
  The estimated standard deviation of the error used to set `lamda`.
248
+ yhat_test : Float32[Array, 'ndpost m'] | None
249
+ The conditional posterior mean at `x_test` for each MCMC iteration.
163
250
 
164
251
  Notes
165
252
  -----
@@ -168,68 +255,111 @@ class gbart:
168
255
 
169
256
  - If `x_train` and `x_test` are matrices, they have one predictor per row
170
257
  instead of per column.
171
- - If `type` is not specified, it is determined solely based on the data type
172
- of `y_train`, and not on whether it contains only two unique values.
173
258
  - If ``usequants=False``, R BART switches to quantiles anyway if there are
174
259
  less predictor values than the required number of bins, while bartz
175
260
  always follows the specification.
261
+ - Some functionality of the R interface is not implemented.
176
262
  - The error variance parameter is called `lamda` instead of `lambda`.
177
- - `rm_const` is always `False`.
178
- - The default `numcut` is 255 instead of 100.
179
- - A lot of functionality is missing (e.g., variable selection).
180
263
  - There are some additional attributes, and some missing.
181
264
  - The trees have a maximum depth.
265
+ - `rm_const` refers to predictors without decision rules instead of
266
+ predictors that are constant in `x_train`.
267
+ - If `rm_const=True` and some variables are dropped, the predictors
268
+ matrix/dataframe passed to `predict` should still include them.
182
269
 
270
+ References
271
+ ----------
272
+ .. [1] Linero, Antonio R. (2018). “Bayesian Regression Trees for
273
+ High-Dimensional Prediction and Variable Selection”. In: Journal of the
274
+ American Statistical Association 113.522, pp. 626-636.
275
+ .. [2] Chipman, Hugh A., Edward I. George and Robert E. McCulloch (2010).
276
+ “BART: Bayesian additive regression trees”. In: The Annals of Applied
277
+ Statistics 4.1, pp. 266-298.
183
278
  """
184
279
 
280
+ _main_trace: mcmcloop.MainTrace
281
+ _burnin_trace: mcmcloop.BurninTrace
282
+ _mcmc_state: mcmcstep.State
283
+ _splits: Real[Array, 'p max_num_splits']
284
+ _x_train_fmt: Any = field(static=True)
285
+
286
+ ndpost: int = field(static=True)
287
+ offset: Float32[Array, '']
288
+ sigest: Float32[Array, ''] | None = None
289
+ yhat_test: Float32[Array, 'ndpost m'] | None = None
290
+
185
291
  def __init__(
186
292
  self,
187
- x_train,
188
- y_train,
293
+ x_train: Real[Array, 'p n'] | DataFrame,
294
+ y_train: Bool[Array, ' n'] | Float32[Array, ' n'] | Series,
189
295
  *,
190
- x_test=None,
191
- type: Literal['wbart', 'pbart'] = 'wbart',
192
- usequants=False,
193
- sigest=None,
194
- sigdf=3,
195
- sigquant=0.9,
196
- k=2,
197
- power=2,
198
- base=0.95,
296
+ x_test: Real[Array, 'p m'] | DataFrame | None = None,
297
+ type: Literal['wbart', 'pbart'] = 'wbart', # noqa: A002
298
+ sparse: bool = False,
299
+ theta: FloatLike | None = None,
300
+ a: FloatLike = 0.5,
301
+ b: FloatLike = 1.0,
302
+ rho: FloatLike | None = None,
303
+ xinfo: Float[Array, 'p n'] | None = None,
304
+ usequants: bool = False,
305
+ rm_const: bool | None = True,
306
+ sigest: FloatLike | None = None,
307
+ sigdf: FloatLike = 3.0,
308
+ sigquant: FloatLike = 0.9,
309
+ k: FloatLike = 2.0,
310
+ power: FloatLike = 2.0,
311
+ base: FloatLike = 0.95,
199
312
  lamda: FloatLike | None = None,
200
313
  tau_num: FloatLike | None = None,
201
314
  offset: FloatLike | None = None,
202
- w=None,
203
- ntree=200,
204
- numcut=255,
205
- ndpost=1000,
206
- nskip=100,
207
- keepevery=1,
208
- printevery=100,
209
- seed=0,
210
- maxdepth=6,
211
- init_kw=None,
212
- run_mcmc_kw=None,
315
+ w: Float[Array, ' n'] | None = None,
316
+ ntree: int | None = None,
317
+ numcut: int = 100,
318
+ ndpost: int = 1000,
319
+ nskip: int = 100,
320
+ keepevery: int | None = None,
321
+ printevery: int | None = 100,
322
+ seed: int | Key[Array, ''] = 0,
323
+ maxdepth: int = 6,
324
+ init_kw: dict | None = None,
325
+ run_mcmc_kw: dict | None = None,
213
326
  ):
327
+ # check data and put it in the right format
214
328
  x_train, x_train_fmt = self._process_predictor_input(x_train)
215
- y_train, _ = self._process_response_input(y_train)
329
+ y_train = self._process_response_input(y_train)
216
330
  self._check_same_length(x_train, y_train)
217
331
  if w is not None:
218
- w, _ = self._process_response_input(w)
332
+ w = self._process_response_input(w)
219
333
  self._check_same_length(x_train, w)
220
334
 
221
- y_train = self._process_type_settings(y_train, type, w)
335
+ # check data types are correct for continuous/binary regression
336
+ self._check_type_settings(y_train, type, w)
222
337
  # from here onwards, the type is determined by y_train.dtype == bool
338
+
339
+ # set defaults that depend on type of regression
340
+ if ntree is None:
341
+ ntree = 50 if y_train.dtype == bool else 200
342
+ if keepevery is None:
343
+ keepevery = 10 if y_train.dtype == bool else 1
344
+
345
+ # process sparsity settings
346
+ theta, a, b, rho = self._process_sparsity_settings(
347
+ x_train, sparse, theta, a, b, rho
348
+ )
349
+
350
+ # process "standardization" settings
223
351
  offset = self._process_offset_settings(y_train, offset)
224
352
  sigma_mu = self._process_leaf_sdev_settings(y_train, k, ntree, tau_num)
225
353
  lamda, sigest = self._process_error_variance_settings(
226
354
  x_train, y_train, sigest, sigdf, sigquant, lamda
227
355
  )
228
356
 
229
- splits, max_split = self._determine_splits(x_train, usequants, numcut)
357
+ # determine splits
358
+ splits, max_split = self._determine_splits(x_train, usequants, numcut, xinfo)
230
359
  x_train = self._bin_predictors(x_train, splits)
231
360
 
232
- mcmc_state = self._setup_mcmc(
361
+ # setup and run mcmc
362
+ initial_state = self._setup_mcmc(
233
363
  x_train,
234
364
  y_train,
235
365
  offset,
@@ -243,51 +373,163 @@ class gbart:
243
373
  maxdepth,
244
374
  ntree,
245
375
  init_kw,
376
+ rm_const,
377
+ theta,
378
+ a,
379
+ b,
380
+ rho,
246
381
  )
247
382
  final_state, burnin_trace, main_trace = self._run_mcmc(
248
- mcmc_state, ndpost, nskip, keepevery, printevery, seed, run_mcmc_kw
383
+ initial_state,
384
+ ndpost,
385
+ nskip,
386
+ keepevery,
387
+ printevery,
388
+ seed,
389
+ run_mcmc_kw,
390
+ sparse,
249
391
  )
250
392
 
251
- sigma = self._extract_sigma(main_trace)
252
- first_sigma = self._extract_sigma(burnin_trace)
253
-
393
+ # set public attributes
254
394
  self.offset = final_state.offset # from the state because of buffer donation
395
+ self.ndpost = ndpost
255
396
  self.sigest = sigest
256
- self.sigma = sigma
257
- self.first_sigma = first_sigma
258
397
 
259
- self._x_train_fmt = x_train_fmt
260
- self._splits = splits
398
+ # set private attributes
261
399
  self._main_trace = main_trace
400
+ self._burnin_trace = burnin_trace
262
401
  self._mcmc_state = final_state
402
+ self._splits = splits
403
+ self._x_train_fmt = x_train_fmt
263
404
 
405
+ # predict at test points
264
406
  if x_test is not None:
265
- yhat_test = self.predict(x_test)
266
- self.yhat_test = yhat_test
267
- self.yhat_test_mean = yhat_test.mean(axis=0)
407
+ self.yhat_test = self.predict(x_test)
408
+
409
+ @cached_property
410
+ def prob_test(self) -> Float32[Array, 'ndpost m'] | None:
411
+ """The posterior probability of y being True at `x_test` for each MCMC iteration."""
412
+ if self.yhat_test is None or self._mcmc_state.y.dtype != bool:
413
+ return None
414
+ else:
415
+ return ndtr(self.yhat_test)
416
+
417
+ @cached_property
418
+ def prob_test_mean(self) -> Float32[Array, ' m'] | None:
419
+ """The marginal posterior probability of y being True at `x_test`."""
420
+ if self.prob_test is None:
421
+ return None
422
+ else:
423
+ return self.prob_test.mean(axis=0)
424
+
425
+ @cached_property
426
+ def prob_train(self) -> Float32[Array, 'ndpost n'] | None:
427
+ """The posterior probability of y being True at `x_train` for each MCMC iteration."""
428
+ if self._mcmc_state.y.dtype == bool:
429
+ return ndtr(self.yhat_train)
430
+ else:
431
+ return None
432
+
433
+ @cached_property
434
+ def prob_train_mean(self) -> Float32[Array, ' n'] | None:
435
+ """The marginal posterior probability of y being True at `x_train`."""
436
+ if self.prob_train is None:
437
+ return None
438
+ else:
439
+ return self.prob_train.mean(axis=0)
440
+
441
+ @cached_property
442
+ def sigma(self) -> Float32[Array, ' nskip+ndpost'] | None:
443
+ """The standard deviation of the error, including burn-in samples."""
444
+ if self._burnin_trace.sigma2 is None:
445
+ return None
446
+ else:
447
+ assert self._main_trace.sigma2 is not None
448
+ return jnp.sqrt(
449
+ jnp.concatenate([self._burnin_trace.sigma2, self._main_trace.sigma2])
450
+ )
451
+
452
+ @cached_property
453
+ def sigma_mean(self) -> Float32[Array, ''] | None:
454
+ """The mean of `sigma`, only over the post-burnin samples."""
455
+ if self.sigma is None:
456
+ return None
457
+ else:
458
+ return self.sigma[len(self.sigma) - self.ndpost :].mean(axis=0)
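Because `sigma` now includes the burn-in draws, the post-burn-in part is simply the trailing `ndpost` entries, which is what `sigma_mean` averages. A small helper sketch (not part of the class):

    def post_burnin_sigma(fit):
        """Error sd draws excluding burn-in; None for binary regression."""
        if fit.sigma is None:
            return None
        return fit.sigma[-fit.ndpost:]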
268
459
 
269
- @functools.cached_property
270
- def yhat_train(self):
460
+ @cached_property
461
+ def varcount(self) -> Int32[Array, 'ndpost p']:
462
+ """Histogram of predictor usage for decision rules in the trees."""
463
+ return mcmcloop.compute_varcount(
464
+ self._mcmc_state.forest.max_split.size, self._main_trace
465
+ )
466
+
467
+ @cached_property
468
+ def varcount_mean(self) -> Float32[Array, ' p']:
469
+ """Average of `varcount` across MCMC iterations."""
470
+ return self.varcount.mean(axis=0)
471
+
472
+ @cached_property
473
+ def varprob(self) -> Float32[Array, 'ndpost p']:
474
+ """Posterior samples of the probability of choosing each predictor for a decision rule."""
475
+ varprob = self._main_trace.varprob
476
+ if varprob is None:
477
+ max_split = self._mcmc_state.forest.max_split
478
+ p = max_split.size
479
+ peff = jnp.count_nonzero(max_split)
480
+ varprob = jnp.where(max_split, 1 / peff, 0)
481
+ varprob = jnp.broadcast_to(varprob, (self.ndpost, p))
482
+ return varprob
483
+
484
+ @cached_property
485
+ def varprob_mean(self) -> Float32[Array, ' p']:
486
+ """The marginal posterior probability of each predictor being chosen for a decision rule."""
487
+ return self.varprob.mean(axis=0)
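The `varcount_mean` and `varprob_mean` summaries above lend themselves to a quick ranking of predictors; a sketch with a hypothetical helper:

    import numpy as np

    def rank_predictors(fit, names=None):
        """Order predictors by posterior inclusion weight, most used first."""
        weights = np.asarray(fit.varprob_mean)
        if names is None:
            names = [f'x{i}' for i in range(weights.size)]
        order = np.argsort(weights)[::-1]
        return [(names[i], float(weights[i])) for i in order]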
488
+
489
+ @cached_property
490
+ def yhat_test_mean(self) -> Float32[Array, ' m'] | None:
491
+ """The marginal posterior mean at `x_test`.
492
+
493
+ Not defined for binary regression because it is error-prone; typically
494
+ the right quantity to consider is `prob_test_mean`.
495
+ """
496
+ if self.yhat_test is None or self._mcmc_state.y.dtype == bool:
497
+ return None
498
+ else:
499
+ return self.yhat_test.mean(axis=0)
500
+
501
+ @cached_property
502
+ def yhat_train(self) -> Float32[Array, 'ndpost n']:
503
+ """The conditional posterior mean at `x_train` for each MCMC iteration."""
271
504
  x_train = self._mcmc_state.X
272
- return self._predict(self._main_trace, x_train)
505
+ return self._predict(x_train)
273
506
 
274
- @functools.cached_property
275
- def yhat_train_mean(self):
276
- return self.yhat_train.mean(axis=0)
507
+ @cached_property
508
+ def yhat_train_mean(self) -> Float32[Array, ' n'] | None:
509
+ """The marginal posterior mean at `x_train`.
277
510
 
278
- def predict(self, x_test):
511
+ Not defined for binary regression because it is error-prone; typically
512
+ the right quantity to consider is `prob_train_mean`.
513
+ """
514
+ if self._mcmc_state.y.dtype == bool:
515
+ return None
516
+ else:
517
+ return self.yhat_train.mean(axis=0)
518
+
519
+ def predict(
520
+ self, x_test: Real[Array, 'p m'] | DataFrame
521
+ ) -> Float32[Array, 'ndpost m']:
279
522
  """
280
523
  Compute the posterior mean at `x_test` for each MCMC iteration.
281
524
 
282
525
  Parameters
283
526
  ----------
284
- x_test : array (p, m) or DataFrame
527
+ x_test
285
528
  The test predictors.
286
529
 
287
530
  Returns
288
531
  -------
289
- yhat_test : array (ndpost, m)
290
- The conditional posterior mean at `x_test` for each MCMC iteration.
532
+ The conditional posterior mean at `x_test` for each MCMC iteration.
291
533
 
292
534
  Raises
293
535
  ------
@@ -296,14 +538,13 @@ class gbart:
296
538
  """
297
539
  x_test, x_test_fmt = self._process_predictor_input(x_test)
298
540
  if x_test_fmt != self._x_train_fmt:
299
- raise ValueError(
300
- f'Input format mismatch: {x_test_fmt=} != x_train_fmt={self._x_train_fmt!r}'
301
- )
541
+ msg = f'Input format mismatch: {x_test_fmt=} != x_train_fmt={self._x_train_fmt!r}'
542
+ raise ValueError(msg)
302
543
  x_test = self._bin_predictors(x_test, self._splits)
303
- return self._predict(self._main_trace, x_test)
544
+ return self._predict(x_test)
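A short sketch of using `predict` on held-out data; the test predictors must be in the same format as the training ones (matrix or dataframe with the same columns), otherwise the ValueError above is raised. The helper below is illustrative only:

    from jax.scipy.special import ndtr

    def posterior_point_predictions(fit, x_test, binary=False):
        """Average the (ndpost, m) draws; return probabilities if binary."""
        yhat = fit.predict(x_test)
        return ndtr(yhat).mean(axis=0) if binary else yhat.mean(axis=0)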
304
545
 
305
546
  @staticmethod
306
- def _process_predictor_input(x):
547
+ def _process_predictor_input(x) -> tuple[Shaped[Array, 'p n'], Any]:
307
548
  if hasattr(x, 'columns'):
308
549
  fmt = dict(kind='dataframe', columns=x.columns)
309
550
  x = x.to_numpy().T
@@ -314,15 +555,12 @@ class gbart:
314
555
  return x, fmt
315
556
 
316
557
  @staticmethod
317
- def _process_response_input(y):
558
+ def _process_response_input(y) -> Shaped[Array, ' n']:
318
559
  if hasattr(y, 'to_numpy'):
319
- fmt = dict(kind='series', name=y.name)
320
560
  y = y.to_numpy()
321
- else:
322
- fmt = dict(kind='array')
323
561
  y = jnp.asarray(y)
324
562
  assert y.ndim == 1
325
- return y, fmt
563
+ return y
326
564
 
327
565
  @staticmethod
328
566
  def _check_same_length(x1, x2):
@@ -335,13 +573,16 @@ class gbart:
335
573
  ) -> tuple[Float32[Array, ''] | None, ...]:
336
574
  if y_train.dtype == bool:
337
575
  if sigest is not None:
338
- raise ValueError('Let `sigest=None` for binary regression')
576
+ msg = 'Let `sigest=None` for binary regression'
577
+ raise ValueError(msg)
339
578
  if lamda is not None:
340
- raise ValueError('Let `lamda=None` for binary regression')
579
+ msg = 'Let `lamda=None` for binary regression'
580
+ raise ValueError(msg)
341
581
  return None, None
342
582
  elif lamda is not None:
343
583
  if sigest is not None:
344
- raise ValueError('Let `sigest=None` if `lamda` is specified')
584
+ msg = 'Let `sigest=None` if `lamda` is specified'
585
+ raise ValueError(msg)
345
586
  return lamda, None
346
587
  else:
347
588
  if sigest is not None:
@@ -359,37 +600,60 @@ class gbart:
359
600
  dof = len(y_train) - rank
360
601
  sigest2 = chisq / dof
361
602
  alpha = sigdf / 2
362
- invchi2 = jaxext.scipy.stats.invgamma.ppf(sigquant, alpha) / 2
603
+ invchi2 = invgamma.ppf(sigquant, alpha) / 2
363
604
  invchi2rid = invchi2 * sigdf
364
605
  return sigest2 / invchi2rid, jnp.sqrt(sigest2)
365
606
 
366
607
  @staticmethod
367
- def _process_type_settings(y_train, type, w):
608
+ def _check_type_settings(y_train, type, w): # noqa: A002
368
609
  match type:
369
610
  case 'wbart':
370
611
  if y_train.dtype != jnp.float32:
371
- raise TypeError(
612
+ msg = (
372
613
  'Continuous regression requires y_train.dtype=float32,'
373
614
  f' got {y_train.dtype=} instead.'
374
615
  )
616
+ raise TypeError(msg)
375
617
  case 'pbart':
376
618
  if w is not None:
377
- raise ValueError(
378
- 'Binary regression does not support weights, set `w=None`'
379
- )
619
+ msg = 'Binary regression does not support weights, set `w=None`'
620
+ raise ValueError(msg)
380
621
  if y_train.dtype != bool:
381
- raise TypeError(
622
+ msg = (
382
623
  'Binary regression requires y_train.dtype=bool,'
383
624
  f' got {y_train.dtype=} instead.'
384
625
  )
626
+ raise TypeError(msg)
385
627
  case _:
386
- raise ValueError(f'Invalid {type=}')
628
+ msg = f'Invalid {type=}'
629
+ raise ValueError(msg)
387
630
 
388
- return y_train
631
+ @staticmethod
632
+ def _process_sparsity_settings(
633
+ x_train: Real[Array, 'p n'],
634
+ sparse: bool,
635
+ theta: FloatLike | None,
636
+ a: FloatLike,
637
+ b: FloatLike,
638
+ rho: FloatLike | None,
639
+ ) -> (
640
+ tuple[None, None, None, None]
641
+ | tuple[FloatLike, None, None, None]
642
+ | tuple[None, FloatLike, FloatLike, FloatLike]
643
+ ):
644
+ if not sparse:
645
+ return None, None, None, None
646
+ elif theta is not None:
647
+ return theta, None, None, None
648
+ else:
649
+ if rho is None:
650
+ p, _ = x_train.shape
651
+ rho = float(p)
652
+ return None, a, b, rho
389
653
 
390
654
  @staticmethod
391
655
  def _process_offset_settings(
392
- y_train: Float32[Array, 'n'] | Bool[Array, 'n'],
656
+ y_train: Float32[Array, ' n'] | Bool[Array, ' n'],
393
657
  offset: float | Float32[Any, ''] | None,
394
658
  ) -> Float32[Array, '']:
395
659
  if offset is not None:
@@ -400,13 +664,15 @@ class gbart:
400
664
  mean = y_train.mean()
401
665
 
402
666
  if y_train.dtype == bool:
667
+ bound = 1 / (1 + y_train.size)
668
+ mean = jnp.clip(mean, bound, 1 - bound)
403
669
  return ndtri(mean)
404
670
  else:
405
671
  return mean
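A worked example of the clipping above: with n = 10 observations that are all True, the mean is clipped to 1 - 1/11 = 10/11 before the probit inverse, matching the ``Phi^-1(n/(n+1))`` rule in the docstring (numbers illustrative; scipy used here in place of the jax ndtri):

    from scipy.special import ndtri

    n = 10
    bound = 1 / (1 + n)                      # 1/11
    mean = min(max(1.0, bound), 1 - bound)   # an all-True y_train clips to 10/11
    offset = ndtri(mean)                     # about 1.335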
406
672
 
407
673
  @staticmethod
408
674
  def _process_leaf_sdev_settings(
409
- y_train: Float32[Array, 'n'] | Bool[Array, 'n'],
675
+ y_train: Float32[Array, ' n'] | Bool[Array, ' n'],
410
676
  k: float,
411
677
  ntree: int,
412
678
  tau_num: FloatLike | None,
@@ -422,31 +688,46 @@ class gbart:
422
688
  return tau_num / (k * math.sqrt(ntree))
423
689
 
424
690
  @staticmethod
425
- def _determine_splits(x_train, usequants, numcut):
426
- if usequants:
691
+ def _determine_splits(
692
+ x_train: Real[Array, 'p n'],
693
+ usequants: bool,
694
+ numcut: int,
695
+ xinfo: Float[Array, 'p n'] | None,
696
+ ) -> tuple[Real[Array, 'p m'], UInt[Array, ' p']]:
697
+ if xinfo is not None:
698
+ if xinfo.ndim != 2 or xinfo.shape[0] != x_train.shape[0]:
699
+ msg = f'{xinfo.shape=} different from expected ({x_train.shape[0]}, *)'
700
+ raise ValueError(msg)
701
+ return prepcovars.parse_xinfo(xinfo)
702
+ elif usequants:
427
703
  return prepcovars.quantilized_splits_from_matrix(x_train, numcut + 1)
428
704
  else:
429
705
  return prepcovars.uniform_splits_from_matrix(x_train, numcut + 1)
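To illustrate the two automatic grids (the exact outputs are internal to `prepcovars`, so this only mimics the documented behaviour of `numcut`): with values spanning [0, 1] and numcut=3, a uniform grid places cutpoints strictly between the minimum and maximum, while usequants=True uses empirical quantiles instead.

    import numpy as np

    values = np.linspace(0.0, 1.0, 21)
    numcut = 3
    uniform_cuts = np.linspace(values.min(), values.max(), numcut + 2)[1:-1]
    # array([0.25, 0.5, 0.75]); the min and max themselves are excluded
    quantile_cuts = np.quantile(values, np.linspace(0.0, 1.0, numcut + 2)[1:-1])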
430
706
 
431
707
  @staticmethod
432
- def _bin_predictors(x, splits):
708
+ def _bin_predictors(x, splits) -> UInt[Array, 'p n']:
433
709
  return prepcovars.bin_predictors(x, splits)
434
710
 
435
711
  @staticmethod
436
712
  def _setup_mcmc(
437
- x_train,
438
- y_train,
439
- offset,
440
- w,
441
- max_split,
442
- lamda,
443
- sigma_mu,
444
- sigdf,
445
- power,
446
- base,
447
- maxdepth,
448
- ntree,
449
- init_kw,
713
+ x_train: Real[Array, 'p n'],
714
+ y_train: Float32[Array, ' n'] | Bool[Array, ' n'],
715
+ offset: Float32[Array, ''],
716
+ w: Float[Array, ' n'] | None,
717
+ max_split: UInt[Array, ' p'],
718
+ lamda: Float32[Array, ''] | None,
719
+ sigma_mu: FloatLike,
720
+ sigdf: FloatLike,
721
+ power: FloatLike,
722
+ base: FloatLike,
723
+ maxdepth: int,
724
+ ntree: int,
725
+ init_kw: dict[str, Any] | None,
726
+ rm_const: bool | None,
727
+ theta: FloatLike | None,
728
+ a: FloatLike | None,
729
+ b: FloatLike | None,
730
+ rho: FloatLike | None,
450
731
  ):
451
732
  depth = jnp.arange(maxdepth - 1)
452
733
  p_nonterminal = base / (1 + depth).astype(float) ** power
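With the defaults base=0.95 and power=2, the line above yields non-terminal probabilities 0.95, 0.2375, 0.106, ... for depths 0, 1, 2, ...; a quick check in plain Python:

    base, power = 0.95, 2.0
    p_nonterminal = [base / (1 + d) ** power for d in range(5)]
    # [0.95, 0.2375, 0.1055..., 0.059375, 0.038]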
@@ -470,14 +751,42 @@ class gbart:
470
751
  sigma_mu2=jnp.square(sigma_mu),
471
752
  sigma2_alpha=sigma2_alpha,
472
753
  sigma2_beta=sigma2_beta,
754
+ min_points_per_decision_node=10,
473
755
  min_points_per_leaf=5,
756
+ theta=theta,
757
+ a=a,
758
+ b=b,
759
+ rho=rho,
474
760
  )
761
+
762
+ if rm_const is None:
763
+ kw.update(filter_splitless_vars=False)
764
+ elif rm_const:
765
+ kw.update(filter_splitless_vars=True)
766
+ else:
767
+ n_empty = jnp.count_nonzero(max_split == 0)
768
+ if n_empty:
769
+ msg = f'There are {n_empty}/{max_split.size} predictors without decision rules'
770
+ raise ValueError(msg)
771
+ kw.update(filter_splitless_vars=False)
772
+
475
773
  if init_kw is not None:
476
774
  kw.update(init_kw)
775
+
477
776
  return mcmcstep.init(**kw)
478
777
 
479
778
  @staticmethod
480
- def _run_mcmc(mcmc_state, ndpost, nskip, keepevery, printevery, seed, run_mcmc_kw):
779
+ def _run_mcmc(
780
+ mcmc_state: mcmcstep.State,
781
+ ndpost: int,
782
+ nskip: int,
783
+ keepevery: int,
784
+ printevery: int | None,
785
+ seed: int | Integer[Array, ''] | Key[Array, ''],
786
+ run_mcmc_kw: dict | None,
787
+ sparse: bool,
788
+ ):
789
+ # prepare random generator seed
481
790
  if isinstance(seed, jax.Array) and jnp.issubdtype(
482
791
  seed.dtype, jax.dtypes.prng_key
483
792
  ):
@@ -486,118 +795,19 @@ class gbart:
486
795
  else:
487
796
  key = jax.random.key(seed)
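Either form of `seed` documented above works: a plain integer, or an existing jax random key when the caller manages key splitting itself (a sketch; `gbart` and the data are assumed to be in scope):

    import jax

    key = jax.random.key(42)
    key, subkey = jax.random.split(key)
    # fit = gbart(x_train, y_train, seed=subkey)   # equivalent to an int seed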
488
797
 
489
- kw = dict(
490
- n_burn=nskip,
491
- n_skip=keepevery,
492
- inner_loop_length=printevery,
493
- allow_overflow=True,
798
+ # prepare arguments
799
+ kw = dict(n_burn=nskip, n_skip=keepevery, inner_loop_length=printevery)
800
+ kw.update(
801
+ mcmcloop.make_default_callback(
802
+ dot_every=None if printevery is None or printevery == 1 else 1,
803
+ report_every=printevery,
804
+ sparse_on_at=nskip // 2 if sparse else None,
805
+ )
494
806
  )
495
- if printevery is not None:
496
- kw.update(mcmcloop.make_print_callbacks())
497
807
  if run_mcmc_kw is not None:
498
808
  kw.update(run_mcmc_kw)
499
809
 
500
810
  return mcmcloop.run_mcmc(key, mcmc_state, ndpost, **kw)
501
811
 
502
- @staticmethod
503
- def _extract_sigma(trace) -> Float32[Array, 'trace_length'] | None:
504
- if trace['sigma2'] is None:
505
- return None
506
- else:
507
- return jnp.sqrt(trace['sigma2'])
508
-
509
- @staticmethod
510
- def _predict(trace, x):
511
- return mcmcloop.evaluate_trace(trace, x)
512
-
513
- def _show_tree(self, i_sample, i_tree, print_all=False):
514
- from . import debug
515
-
516
- trace = self._main_trace
517
- leaf_tree = trace['leaf_trees'][i_sample, i_tree]
518
- var_tree = trace['var_trees'][i_sample, i_tree]
519
- split_tree = trace['split_trees'][i_sample, i_tree]
520
- debug.print_tree(leaf_tree, var_tree, split_tree, print_all)
521
-
522
- def _sigma_harmonic_mean(self, prior=False):
523
- bart = self._mcmc_state
524
- if prior:
525
- alpha = bart['sigma2_alpha']
526
- beta = bart['sigma2_beta']
527
- else:
528
- resid = bart['resid']
529
- alpha = bart['sigma2_alpha'] + resid.size / 2
530
- norm2 = jnp.dot(
531
- resid, resid, preferred_element_type=bart['sigma2_beta'].dtype
532
- )
533
- beta = bart['sigma2_beta'] + norm2 / 2
534
- sigma2 = beta / alpha
535
- return jnp.sqrt(sigma2)
536
-
537
- def _compare_resid(self):
538
- bart = self._mcmc_state
539
- resid1 = bart.resid
540
-
541
- trees = grove.evaluate_forest(
542
- bart.X,
543
- bart.forest.leaf_trees,
544
- bart.forest.var_trees,
545
- bart.forest.split_trees,
546
- jnp.float32, # TODO remove these configurable dtypes around
547
- )
548
-
549
- if bart.z is not None:
550
- ref = bart.z
551
- else:
552
- ref = bart.y
553
- resid2 = ref - (trees + bart.offset)
554
-
555
- return resid1, resid2
556
-
557
- def _avg_acc(self):
558
- trace = self._main_trace
559
-
560
- def acc(prefix):
561
- acc = trace[f'{prefix}_acc_count']
562
- prop = trace[f'{prefix}_prop_count']
563
- return acc.sum() / prop.sum()
564
-
565
- return acc('grow'), acc('prune')
566
-
567
- def _avg_prop(self):
568
- trace = self._main_trace
569
-
570
- def prop(prefix):
571
- return trace[f'{prefix}_prop_count'].sum()
572
-
573
- pgrow = prop('grow')
574
- pprune = prop('prune')
575
- total = pgrow + pprune
576
- return pgrow / total, pprune / total
577
-
578
- def _avg_move(self):
579
- agrow, aprune = self._avg_acc()
580
- pgrow, pprune = self._avg_prop()
581
- return agrow * pgrow, aprune * pprune
582
-
583
- def _depth_distr(self):
584
- from . import debug
585
-
586
- trace = self._main_trace
587
- split_trees = trace['split_trees']
588
- return debug.trace_depth_distr(split_trees)
589
-
590
- def _points_per_leaf_distr(self):
591
- from . import debug
592
-
593
- return debug.trace_points_per_leaf_distr(self._main_trace, self._mcmc_state.X)
594
-
595
- def _check_trees(self):
596
- from . import debug
597
-
598
- return debug.check_trace(self._main_trace, self._mcmc_state)
599
-
600
- def _tree_goes_bad(self):
601
- bad = self._check_trees().astype(bool)
602
- bad_before = jnp.pad(bad[:-1], [(1, 0), (0, 0)])
603
- return bad & ~bad_before
812
+ def _predict(self, x):
813
+ return mcmcloop.evaluate_trace(self._main_trace, x)