bartz 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

bartz/BART.py CHANGED
@@ -22,17 +22,73 @@
22
22
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
23
  # SOFTWARE.
24
24
 
25
- import functools
25
+ """Implement a class `gbart` that mimics the R BART package."""
26
+
27
+ import math
28
+ from collections.abc import Sequence
29
+ from functools import cached_property
30
+ from typing import Any, Literal, Protocol
26
31
 
27
32
  import jax
28
33
  import jax.numpy as jnp
34
+ from equinox import Module, field
35
+ from jax.scipy.special import ndtr
36
+ from jaxtyping import (
37
+ Array,
38
+ Bool,
39
+ Float,
40
+ Float32,
41
+ Int32,
42
+ Integer,
43
+ Key,
44
+ Real,
45
+ Shaped,
46
+ UInt,
47
+ )
48
+ from numpy import ndarray
49
+
50
+ from bartz import mcmcloop, mcmcstep, prepcovars
51
+ from bartz.jaxext.scipy.special import ndtri
52
+ from bartz.jaxext.scipy.stats import invgamma
53
+
54
+ FloatLike = float | Float[Any, '']
55
+
56
+
57
+ class DataFrame(Protocol):
58
+ """DataFrame duck-type for `gbart`.
59
+
60
+ Attributes
61
+ ----------
62
+ columns : Sequence[str]
63
+ The names of the columns.
64
+ """
65
+
66
+ columns: Sequence[str]
67
+
68
+ def to_numpy(self) -> ndarray:
69
+ """Convert the dataframe to a 2d numpy array with columns on the second axis."""
70
+ ...
29
71
 
30
- from . import grove, jaxext, mcmcloop, mcmcstep, prepcovars
31
72
 
73
+ class Series(Protocol):
74
+ """Series duck-type for `gbart`.
32
75
 
33
- class gbart:
76
+ Attributes
77
+ ----------
78
+ name : str | None
79
+ The name of the series.
34
80
  """
35
- Nonparametric regression with Bayesian Additive Regression Trees (BART).
81
+
82
+ name: str | None
83
+
84
+ def to_numpy(self) -> ndarray:
85
+ """Convert the series to a 1d numpy array."""
86
+ ...
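These Protocol classes are structural ("duck") types: any object exposing the listed attributes works, so pandas objects qualify without this module importing pandas. A minimal sketch of that compatibility (pandas is an assumption here, not a dependency declared in this diff):

    import pandas as pd

    df = pd.DataFrame({'x1': [0.1, 0.2, 0.3], 'x2': [1.0, 2.0, 3.0]})
    assert list(df.columns) == ['x1', 'x2']          # `columns`, as in the DataFrame protocol
    assert df.to_numpy().shape == (3, 2)             # 2d array, columns on the second axis

    s = pd.Series([0.5, 1.5, 2.5], name='y')
    assert s.name == 'y' and s.to_numpy().ndim == 1  # `name` and 1d `to_numpy()`, as in Series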
87
+
88
+
89
+ class gbart(Module):
90
+ R"""
91
+ Nonparametric regression with Bayesian Additive Regression Trees (BART) [2]_.
36
92
 
37
93
  Regress `y_train` on `x_train` with a latent mean function represented as
38
94
  a sum of decision trees. The inference is carried out by sampling the
@@ -40,55 +96,108 @@ class gbart:
40
96
 
41
97
  Parameters
42
98
  ----------
43
- x_train : array (p, n) or DataFrame
99
+ x_train
44
100
  The training predictors.
45
- y_train : array (n,) or Series
101
+ y_train
46
102
  The training responses.
47
- x_test : array (p, m) or DataFrame, optional
103
+ x_test
48
104
  The test predictors.
49
- usequants : bool, default False
105
+ type
106
+ The type of regression. 'wbart' for continuous regression, 'pbart' for
107
+ binary regression with probit link.
108
+ sparse
109
+ Whether to activate variable selection on the predictors as done in
110
+ [1]_.
111
+ theta
112
+ a
113
+ b
114
+ rho
115
+ Hyperparameters of the sparsity prior used for variable selection.
116
+
117
+ The prior distribution on the choice of predictor for each decision rule
118
+ is
119
+
120
+ .. math::
121
+ (s_1, \ldots, s_p) \sim
122
+ \operatorname{Dirichlet}(\mathtt{theta}/p, \ldots, \mathtt{theta}/p).
123
+
124
+ If `theta` is not specified, it's a priori distributed according to
125
+
126
+ .. math::
127
+ \frac{\mathtt{theta}}{\mathtt{theta} + \mathtt{rho}} \sim
128
+ \operatorname{Beta}(\mathtt{a}, \mathtt{b}).
129
+
130
+ If not specified, `rho` is set to the number of predictors p. To tune
131
+ the prior, consider setting a lower `rho` to prefer more sparsity.
132
+ If setting `theta` directly, it should be in the ballpark of p or lower
133
+ as well.
134
+ xinfo
135
+ A matrix with the cutpoints to use to bin each predictor. If not
136
+ specified, it is generated automatically according to `usequants` and
137
+ `numcut`.
138
+
139
+ Each row shall contain a sorted list of cutpoints for a predictor. If
140
+ there are fewer cutpoints than the number of columns in the matrix,
141
+ fill the remaining cells with NaN.
142
+
143
+ `xinfo` shall be a matrix even if `x_train` is a dataframe.
144
+ usequants
50
145
  Whether to use predictor quantiles instead of a uniform grid to bin
51
- predictors.
52
- sigest : float, optional
146
+ predictors. Ignored if `xinfo` is specified.
147
+ rm_const
148
+ How to treat predictors with no associated decision rules (i.e., there
149
+ are no available cutpoints for that predictor). If `True` (default),
150
+ they are ignored. If `False`, an error is raised if there are any. If
151
+ `None`, no check is performed, and the output of the MCMC may not make
152
+ sense if there are predictors without cutpoints. The option `None` is
153
+ provided only to allow jax tracing.
154
+ sigest
53
155
  An estimate of the residual standard deviation on `y_train`, used to set
54
156
  `lamda`. If not specified, it is estimated by linear regression (with
55
157
  intercept, and without taking into account `w`). If `y_train` has fewer
56
158
  than two elements, it is set to 1. If n <= p, it is set to the standard
57
159
  deviation of `y_train`. Ignored if `lamda` is specified.
58
- sigdf : int, default 3
160
+ sigdf
59
161
  The degrees of freedom of the scaled inverse-chisquared prior on the
60
162
  noise variance.
61
- sigquant : float, default 0.9
163
+ sigquant
62
164
  The quantile of the prior on the noise variance that shall match
63
165
  `sigest` to set the scale of the prior. Ignored if `lamda` is specified.
64
- k : float, default 2
166
+ k
65
167
  The inverse scale of the prior standard deviation on the latent mean
66
168
  function, relative to half the observed range of `y_train`. If `y_train`
67
169
  has less than two elements, `k` is ignored and the scale is set to 1.
68
- power : float, default 2
69
- base : float, default 0.95
170
+ power
171
+ base
70
172
  Parameters of the prior on tree node generation. The probability that a
71
173
  node at depth `d` (0-based) is non-terminal is ``base / (1 + d) **
72
174
  power``.
73
- maxdepth : int, default 6
74
- The maximum depth of the trees. This is 1-based, so with the default
75
- ``maxdepth=6``, the depths of the levels range from 0 to 5.
76
- lamda : float, optional
77
- The scale of the prior on the noise variance. If ``lamda==1``, the
78
- prior is an inverse chi-squared scaled to have harmonic mean 1. If
79
- not specified, it is set based on `sigest` and `sigquant`.
80
- offset : float, optional
175
+ lamda
176
+ The prior harmonic mean of the error variance. (The harmonic mean of x
177
+ is 1/mean(1/x).) If not specified, it is set based on `sigest` and
178
+ `sigquant`.
179
+ tau_num
180
+ The numerator in the expression that determines the prior standard
181
+ deviation of leaves. If not specified, defaults to ``(max(y_train) -
182
+ min(y_train)) / 2`` (or 1 if `y_train` has fewer than two elements) for
183
+ continuous regression, and 3 for binary regression.
184
+ offset
81
185
  The prior mean of the latent mean function. If not specified, it is set
82
- to the mean of `y_train`. If `y_train` is empty, it is set to 0.
83
- w : array (n,), optional
186
+ to the mean of `y_train` for continuous regression, and to
187
+ ``Phi^-1(mean(y_train))`` for binary regression. If `y_train` is empty,
188
+ `offset` is set to 0. With binary regression, if `y_train` is all
189
+ `False` or `True`, it is set to ``Phi^-1(1/(n+1))`` or
190
+ ``Phi^-1(n/(n+1))``, respectively.
191
+ w
84
192
  Coefficients that rescale the error standard deviation on each
85
193
  datapoint. Not specifying `w` is equivalent to setting it to 1 for all
86
194
  datapoints. Note: `w` is ignored in the automatic determination of
87
195
  `sigest`, so either the weights should be O(1), or `sigest` should be
88
196
  specified by the user.
89
- ntree : int, default 200
90
- The number of trees used to represent the latent mean function.
91
- numcut : int, default 255
197
+ ntree
198
+ The number of trees used to represent the latent mean function. By
199
+ default 200 for continuous regression and 50 for binary regression.
200
+ numcut
92
201
  If `usequants` is `False`: the exact number of cutpoints used to bin the
93
202
  predictors, ranging between the minimum and maximum observed values
94
203
  (excluded).
@@ -101,50 +210,43 @@ class gbart:
101
210
 
102
211
  Before running the algorithm, the predictors are compressed to the
103
212
  smallest integer type that fits the bin indices, so `numcut` is best set
104
- to the maximum value of an unsigned integer type.
105
- ndpost : int, default 1000
213
+ to the maximum value of an unsigned integer type, like 255.
214
+
215
+ Ignored if `xinfo` is specified.
216
+ ndpost
106
217
  The number of MCMC samples to save, after burn-in.
107
- nskip : int, default 100
218
+ nskip
108
219
  The number of initial MCMC samples to discard as burn-in.
109
- keepevery : int, default 1
110
- The thinning factor for the MCMC samples, after burn-in.
111
- printevery : int, default 100
112
- The number of iterations (including skipped ones) between each log.
113
- seed : int or jax random key, default 0
220
+ keepevery
221
+ The thinning factor for the MCMC samples, after burn-in. By default, 1
222
+ for continuous regression and 10 for binary regression.
223
+ printevery
224
+ The number of iterations (including thinned-away ones) between each log
225
+ line. Set to `None` to disable logging.
226
+
227
+ `printevery` has a few unexpected side effects. On cpu, interrupting
228
+ with ^C halts the MCMC only on the next log. And the total number of
229
+ iterations is a multiple of `printevery`, so if ``nskip + keepevery *
230
+ ndpost`` is not a multiple of `printevery`, some of the last iterations
231
+ will not be saved.
232
+ seed
114
233
  The seed for the random number generator.
115
- initkw : dict
116
- Additional arguments passed to `mcmcstep.init`.
234
+ maxdepth
235
+ The maximum depth of the trees. This is 1-based, so with the default
236
+ ``maxdepth=6``, the depths of the levels range from 0 to 5.
237
+ init_kw
238
+ Additional arguments passed to `bartz.mcmcstep.init`.
239
+ run_mcmc_kw
240
+ Additional arguments passed to `bartz.mcmcloop.run_mcmc`.
117
241
 
118
242
  Attributes
119
243
  ----------
120
- yhat_train : array (ndpost, n)
121
- The conditional posterior mean at `x_train` for each MCMC iteration.
122
- yhat_train_mean : array (n,)
123
- The marginal posterior mean at `x_train`.
124
- yhat_test : array (ndpost, m)
125
- The conditional posterior mean at `x_test` for each MCMC iteration.
126
- yhat_test_mean : array (m,)
127
- The marginal posterior mean at `x_test`.
128
- sigma : array (ndpost,)
129
- The standard deviation of the error.
130
- first_sigma : array (nskip,)
131
- The standard deviation of the error in the burn-in phase.
132
- offset : float
244
+ offset : Float32[Array, '']
133
245
  The prior mean of the latent mean function.
134
- scale : float
135
- The prior standard deviation of the latent mean function.
136
- lamda : float
137
- The prior harmonic mean of the error variance.
138
- sigest : float or None
246
+ sigest : Float32[Array, ''] | None
139
247
  The estimated standard deviation of the error used to set `lamda`.
140
- ntree : int
141
- The number of trees.
142
- maxdepth : int
143
- The maximum depth of the trees.
144
-
145
- Methods
146
- -------
147
- predict
248
+ yhat_test : Float32[Array, 'ndpost m'] | None
249
+ The conditional posterior mean at `x_test` for each MCMC iteration.
148
250
 
149
251
  Notes
150
252
  -----
@@ -156,128 +258,293 @@ class gbart:
156
258
  - If ``usequants=False``, R BART switches to quantiles anyway if there are
157
259
  less predictor values than the required number of bins, while bartz
158
260
  always follows the specification.
261
+ - Some functionality is missing.
159
262
  - The error variance parameter is called `lamda` instead of `lambda`.
160
- - `rm_const` is always `False`.
161
- - The default `numcut` is 255 instead of 100.
162
- - A lot of functionality is missing (variable selection, discrete response).
163
263
  - There are some additional attributes, and some missing.
264
+ - The trees have a maximum depth.
265
+ - `rm_const` refers to predictors without decision rules instead of
266
+ predictors that are constant in `x_train`.
267
+ - If `rm_const=True` and some variables are dropped, the predictors
268
+ matrix/dataframe passed to `predict` should still include them.
164
269
 
270
+ References
271
+ ----------
272
+ .. [1] Linero, Antonio R. (2018). “Bayesian Regression Trees for
273
+ High-Dimensional Prediction and Variable Selection”. In: Journal of the
274
+ American Statistical Association 113.522, pp. 626-636.
275
+ .. [2] Chipman, Hugh A., Edward I. George and Robert E. McCulloch (2010).
276
+ "BART: Bayesian additive regression trees". In: The Annals of Applied
277
+ Statistics 4.1, pp. 266-298.
165
278
  """
166
279
 
280
+ _main_trace: mcmcloop.MainTrace
281
+ _burnin_trace: mcmcloop.BurninTrace
282
+ _mcmc_state: mcmcstep.State
283
+ _splits: Real[Array, 'p max_num_splits']
284
+ _x_train_fmt: Any = field(static=True)
285
+
286
+ ndpost: int = field(static=True)
287
+ offset: Float32[Array, '']
288
+ sigest: Float32[Array, ''] | None = None
289
+ yhat_test: Float32[Array, 'ndpost m'] | None = None
290
+
167
291
  def __init__(
168
292
  self,
169
- x_train,
170
- y_train,
293
+ x_train: Real[Array, 'p n'] | DataFrame,
294
+ y_train: Bool[Array, ' n'] | Float32[Array, ' n'] | Series,
171
295
  *,
172
- x_test=None,
173
- usequants=False,
174
- sigest=None,
175
- sigdf=3,
176
- sigquant=0.9,
177
- k=2,
178
- power=2,
179
- base=0.95,
180
- maxdepth=6,
181
- lamda=None,
182
- offset=None,
183
- w=None,
184
- ntree=200,
185
- numcut=255,
186
- ndpost=1000,
187
- nskip=100,
188
- keepevery=1,
189
- printevery=100,
190
- seed=0,
191
- initkw=None,
296
+ x_test: Real[Array, 'p m'] | DataFrame | None = None,
297
+ type: Literal['wbart', 'pbart'] = 'wbart', # noqa: A002
298
+ sparse: bool = False,
299
+ theta: FloatLike | None = None,
300
+ a: FloatLike = 0.5,
301
+ b: FloatLike = 1.0,
302
+ rho: FloatLike | None = None,
303
+ xinfo: Float[Array, 'p n'] | None = None,
304
+ usequants: bool = False,
305
+ rm_const: bool | None = True,
306
+ sigest: FloatLike | None = None,
307
+ sigdf: FloatLike = 3.0,
308
+ sigquant: FloatLike = 0.9,
309
+ k: FloatLike = 2.0,
310
+ power: FloatLike = 2.0,
311
+ base: FloatLike = 0.95,
312
+ lamda: FloatLike | None = None,
313
+ tau_num: FloatLike | None = None,
314
+ offset: FloatLike | None = None,
315
+ w: Float[Array, ' n'] | None = None,
316
+ ntree: int | None = None,
317
+ numcut: int = 100,
318
+ ndpost: int = 1000,
319
+ nskip: int = 100,
320
+ keepevery: int | None = None,
321
+ printevery: int | None = 100,
322
+ seed: int | Key[Array, ''] = 0,
323
+ maxdepth: int = 6,
324
+ init_kw: dict | None = None,
325
+ run_mcmc_kw: dict | None = None,
192
326
  ):
327
+ # check data and put it in the right format
193
328
  x_train, x_train_fmt = self._process_predictor_input(x_train)
194
- y_train, _ = self._process_response_input(y_train)
329
+ y_train = self._process_response_input(y_train)
195
330
  self._check_same_length(x_train, y_train)
196
331
  if w is not None:
197
- w, _ = self._process_response_input(w)
332
+ w = self._process_response_input(w)
198
333
  self._check_same_length(x_train, w)
199
334
 
335
+ # check data types are correct for continuous/binary regression
336
+ self._check_type_settings(y_train, type, w)
337
+ # from here onwards, the type is determined by y_train.dtype == bool
338
+
339
+ # set defaults that depend on type of regression
340
+ if ntree is None:
341
+ ntree = 50 if y_train.dtype == bool else 200
342
+ if keepevery is None:
343
+ keepevery = 10 if y_train.dtype == bool else 1
344
+
345
+ # process sparsity settings
346
+ theta, a, b, rho = self._process_sparsity_settings(
347
+ x_train, sparse, theta, a, b, rho
348
+ )
349
+
350
+ # process "standardization" settings
200
351
  offset = self._process_offset_settings(y_train, offset)
201
- scale = self._process_scale_settings(y_train, k)
202
- lamda, sigest = self._process_noise_variance_settings(
203
- x_train, y_train, sigest, sigdf, sigquant, lamda, offset
352
+ sigma_mu = self._process_leaf_sdev_settings(y_train, k, ntree, tau_num)
353
+ lamda, sigest = self._process_error_variance_settings(
354
+ x_train, y_train, sigest, sigdf, sigquant, lamda
204
355
  )
205
356
 
206
- splits, max_split = self._determine_splits(x_train, usequants, numcut)
357
+ # determine splits
358
+ splits, max_split = self._determine_splits(x_train, usequants, numcut, xinfo)
207
359
  x_train = self._bin_predictors(x_train, splits)
208
- y_train, lamda_scaled = self._transform_input(y_train, lamda, offset, scale)
209
360
 
210
- mcmc_state = self._setup_mcmc(
361
+ # setup and run mcmc
362
+ initial_state = self._setup_mcmc(
211
363
  x_train,
212
364
  y_train,
365
+ offset,
213
366
  w,
214
367
  max_split,
215
- lamda_scaled,
368
+ lamda,
369
+ sigma_mu,
216
370
  sigdf,
217
371
  power,
218
372
  base,
219
373
  maxdepth,
220
374
  ntree,
221
- initkw,
375
+ init_kw,
376
+ rm_const,
377
+ theta,
378
+ a,
379
+ b,
380
+ rho,
222
381
  )
223
382
  final_state, burnin_trace, main_trace = self._run_mcmc(
224
- mcmc_state, ndpost, nskip, keepevery, printevery, seed
383
+ initial_state,
384
+ ndpost,
385
+ nskip,
386
+ keepevery,
387
+ printevery,
388
+ seed,
389
+ run_mcmc_kw,
390
+ sparse,
225
391
  )
226
392
 
227
- sigma = self._extract_sigma(main_trace, scale)
228
- first_sigma = self._extract_sigma(burnin_trace, scale)
229
-
230
- self.offset = offset
231
- self.scale = scale
232
- self.lamda = lamda
393
+ # set public attributes
394
+ self.offset = final_state.offset # from the state because of buffer donation
395
+ self.ndpost = ndpost
233
396
  self.sigest = sigest
234
- self.ntree = ntree
235
- self.maxdepth = maxdepth
236
- self.sigma = sigma
237
- self.first_sigma = first_sigma
238
397
 
239
- self._x_train_fmt = x_train_fmt
240
- self._splits = splits
398
+ # set private attributes
241
399
  self._main_trace = main_trace
400
+ self._burnin_trace = burnin_trace
242
401
  self._mcmc_state = final_state
402
+ self._splits = splits
403
+ self._x_train_fmt = x_train_fmt
243
404
 
405
+ # predict at test points
244
406
  if x_test is not None:
245
- yhat_test = self.predict(x_test)
246
- self.yhat_test = yhat_test
247
- self.yhat_test_mean = yhat_test.mean(axis=0)
407
+ self.yhat_test = self.predict(x_test)
408
+
409
+ @cached_property
410
+ def prob_test(self) -> Float32[Array, 'ndpost m'] | None:
411
+ """The posterior probability of y being True at `x_test` for each MCMC iteration."""
412
+ if self.yhat_test is None or self._mcmc_state.y.dtype != bool:
413
+ return None
414
+ else:
415
+ return ndtr(self.yhat_test)
416
+
417
+ @cached_property
418
+ def prob_test_mean(self) -> Float32[Array, ' m'] | None:
419
+ """The marginal posterior probability of y being True at `x_test`."""
420
+ if self.prob_test is None:
421
+ return None
422
+ else:
423
+ return self.prob_test.mean(axis=0)
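The probability properties (`prob_test`, `prob_test_mean`, and the training counterparts below) are the quantities to report for binary regression; a hedged sketch with synthetic data, import path assumed as in the earlier example:

    import numpy as np
    from bartz.BART import gbart

    rng = np.random.default_rng(1)
    p, n, m = 3, 200, 50
    x_train = rng.standard_normal((p, n)).astype(np.float32)
    x_test = rng.standard_normal((p, m)).astype(np.float32)
    y_train = x_train[0] + 0.5 * rng.standard_normal(n) > 0   # boolean responses

    fit = gbart(x_train, y_train, x_test=x_test, type='pbart', seed=1)
    probs = fit.prob_test_mean   # posterior P(y=True | x) at x_test, shape (m,)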
424
+
425
+ @cached_property
426
+ def prob_train(self) -> Float32[Array, 'ndpost n'] | None:
427
+ """The posterior probability of y being True at `x_train` for each MCMC iteration."""
428
+ if self._mcmc_state.y.dtype == bool:
429
+ return ndtr(self.yhat_train)
430
+ else:
431
+ return None
432
+
433
+ @cached_property
434
+ def prob_train_mean(self) -> Float32[Array, ' n'] | None:
435
+ """The marginal posterior probability of y being True at `x_train`."""
436
+ if self.prob_train is None:
437
+ return None
438
+ else:
439
+ return self.prob_train.mean(axis=0)
440
+
441
+ @cached_property
442
+ def sigma(self) -> Float32[Array, ' nskip+ndpost'] | None:
443
+ """The standard deviation of the error, including burn-in samples."""
444
+ if self._burnin_trace.sigma2 is None:
445
+ return None
446
+ else:
447
+ assert self._main_trace.sigma2 is not None
448
+ return jnp.sqrt(
449
+ jnp.concatenate([self._burnin_trace.sigma2, self._main_trace.sigma2])
450
+ )
451
+
452
+ @cached_property
453
+ def sigma_mean(self) -> Float32[Array, ''] | None:
454
+ """The mean of `sigma`, only over the post-burnin samples."""
455
+ if self.sigma is None:
456
+ return None
457
+ else:
458
+ return self.sigma[len(self.sigma) - self.ndpost :].mean(axis=0)
459
+
460
+ @cached_property
461
+ def varcount(self) -> Int32[Array, 'ndpost p']:
462
+ """Histogram of predictor usage for decision rules in the trees."""
463
+ return mcmcloop.compute_varcount(
464
+ self._mcmc_state.forest.max_split.size, self._main_trace
465
+ )
466
+
467
+ @cached_property
468
+ def varcount_mean(self) -> Float32[Array, ' p']:
469
+ """Average of `varcount` across MCMC iterations."""
470
+ return self.varcount.mean(axis=0)
471
+
472
+ @cached_property
473
+ def varprob(self) -> Float32[Array, 'ndpost p']:
474
+ """Posterior samples of the probability of choosing each predictor for a decision rule."""
475
+ varprob = self._main_trace.varprob
476
+ if varprob is None:
477
+ max_split = self._mcmc_state.forest.max_split
478
+ p = max_split.size
479
+ peff = jnp.count_nonzero(max_split)
480
+ varprob = jnp.where(max_split, 1 / peff, 0)
481
+ varprob = jnp.broadcast_to(varprob, (self.ndpost, p))
482
+ return varprob
483
+
484
+ @cached_property
485
+ def varprob_mean(self) -> Float32[Array, ' p']:
486
+ """The marginal posterior probability of each predictor being chosen for a decision rule."""
487
+ return self.varprob.mean(axis=0)
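With `sparse=True`, `varcount_mean` and `varprob_mean` summarize which predictors the trees actually use; a hedged sketch of ranking predictors by posterior inclusion probability (synthetic data, import path assumed as above):

    import numpy as np
    from bartz.BART import gbart

    rng = np.random.default_rng(2)
    p, n = 20, 300
    x_train = rng.standard_normal((p, n)).astype(np.float32)
    y_train = (2.0 * x_train[0] - x_train[1]).astype(np.float32)  # only 2 informative predictors

    fit = gbart(x_train, y_train, sparse=True, seed=2)
    ranking = np.argsort(-np.asarray(fit.varprob_mean))  # predictor indices, most used first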
488
+
489
+ @cached_property
490
+ def yhat_test_mean(self) -> Float32[Array, ' m'] | None:
491
+ """The marginal posterior mean at `x_test`.
492
+
493
+ Not defined for binary regression because it's error-prone; typically
494
+ the right quantity to consider is `prob_test_mean`.
495
+ """
496
+ if self.yhat_test is None or self._mcmc_state.y.dtype == bool:
497
+ return None
498
+ else:
499
+ return self.yhat_test.mean(axis=0)
500
+
501
+ @cached_property
502
+ def yhat_train(self) -> Float32[Array, 'ndpost n']:
503
+ """The conditional posterior mean at `x_train` for each MCMC iteration."""
504
+ x_train = self._mcmc_state.X
505
+ return self._predict(x_train)
248
506
 
249
- @functools.cached_property
250
- def yhat_train(self):
251
- x_train = self._mcmc_state['X']
252
- yhat_train = self._predict(self._main_trace, x_train)
253
- return self._transform_output(yhat_train, self.offset, self.scale)
507
+ @cached_property
508
+ def yhat_train_mean(self) -> Float32[Array, ' n'] | None:
509
+ """The marginal posterior mean at `x_train`.
254
510
 
255
- @functools.cached_property
256
- def yhat_train_mean(self):
257
- return self.yhat_train.mean(axis=0)
511
+ Not defined for binary regression because it's error-prone; typically
512
+ the right quantity to consider is `prob_train_mean`.
513
+ """
514
+ if self._mcmc_state.y.dtype == bool:
515
+ return None
516
+ else:
517
+ return self.yhat_train.mean(axis=0)
258
518
 
259
- def predict(self, x_test):
519
+ def predict(
520
+ self, x_test: Real[Array, 'p m'] | DataFrame
521
+ ) -> Float32[Array, 'ndpost m']:
260
522
  """
261
523
  Compute the posterior mean at `x_test` for each MCMC iteration.
262
524
 
263
525
  Parameters
264
526
  ----------
265
- x_test : array (p, m) or DataFrame
527
+ x_test
266
528
  The test predictors.
267
529
 
268
530
  Returns
269
531
  -------
270
- yhat_test : array (ndpost, m)
271
- The conditional posterior mean at `x_test` for each MCMC iteration.
532
+ The conditional posterior mean at `x_test` for each MCMC iteration.
533
+
534
+ Raises
535
+ ------
536
+ ValueError
537
+ If `x_test` has a different format than `x_train`.
272
538
  """
273
539
  x_test, x_test_fmt = self._process_predictor_input(x_test)
274
- self._check_compatible_formats(x_test_fmt, self._x_train_fmt)
540
+ if x_test_fmt != self._x_train_fmt:
541
+ msg = f'Input format mismatch: {x_test_fmt=} != x_train_fmt={self._x_train_fmt!r}'
542
+ raise ValueError(msg)
275
543
  x_test = self._bin_predictors(x_test, self._splits)
276
- yhat_test = self._predict(self._main_trace, x_test)
277
- return self._transform_output(yhat_test, self.offset, self.scale)
544
+ return self._predict(x_test)
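Continuing the continuous-regression sketch given after the class docstring (illustrative, not part of the diff): `x_test` must use the same layout as `x_train`, otherwise the ValueError above is raised.

    x_new = rng.standard_normal((p, 10)).astype(np.float32)  # same (p, m) array layout as x_train
    draws = fit.predict(x_new)      # shape (ndpost, 10), one row per posterior sample
    point = draws.mean(axis=0)      # posterior-mean prediction at the new points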
278
545
 
279
546
  @staticmethod
280
- def _process_predictor_input(x):
547
+ def _process_predictor_input(x) -> tuple[Shaped[Array, 'p n'], Any]:
281
548
  if hasattr(x, 'columns'):
282
549
  fmt = dict(kind='dataframe', columns=x.columns)
283
550
  x = x.to_numpy().T
@@ -288,19 +555,12 @@ class gbart:
288
555
  return x, fmt
289
556
 
290
557
  @staticmethod
291
- def _check_compatible_formats(fmt1, fmt2):
292
- assert fmt1 == fmt2
293
-
294
- @staticmethod
295
- def _process_response_input(y):
558
+ def _process_response_input(y) -> Shaped[Array, ' n']:
296
559
  if hasattr(y, 'to_numpy'):
297
- fmt = dict(kind='series', name=y.name)
298
560
  y = y.to_numpy()
299
- else:
300
- fmt = dict(kind='array')
301
561
  y = jnp.asarray(y)
302
562
  assert y.ndim == 1
303
- return y, fmt
563
+ return y
304
564
 
305
565
  @staticmethod
306
566
  def _check_same_length(x1, x2):
@@ -308,18 +568,29 @@ class gbart:
308
568
  assert get_length(x1) == get_length(x2)
309
569
 
310
570
  @staticmethod
311
- def _process_noise_variance_settings(
312
- x_train, y_train, sigest, sigdf, sigquant, lamda, offset
313
- ):
314
- if lamda is not None:
571
+ def _process_error_variance_settings(
572
+ x_train, y_train, sigest, sigdf, sigquant, lamda
573
+ ) -> tuple[Float32[Array, ''] | None, ...]:
574
+ if y_train.dtype == bool:
575
+ if sigest is not None:
576
+ msg = 'Let `sigest=None` for binary regression'
577
+ raise ValueError(msg)
578
+ if lamda is not None:
579
+ msg = 'Let `lamda=None` for binary regression'
580
+ raise ValueError(msg)
581
+ return None, None
582
+ elif lamda is not None:
583
+ if sigest is not None:
584
+ msg = 'Let `sigest=None` if `lamda` is specified'
585
+ raise ValueError(msg)
315
586
  return lamda, None
316
587
  else:
317
588
  if sigest is not None:
318
- sigest2 = sigest * sigest
589
+ sigest2 = jnp.square(sigest)
319
590
  elif y_train.size < 2:
320
591
  sigest2 = 1
321
592
  elif y_train.size <= x_train.shape[0]:
322
- sigest2 = jnp.var(y_train - offset)
593
+ sigest2 = jnp.var(y_train)
323
594
  else:
324
595
  x_centered = x_train.T - x_train.mean(axis=1)
325
596
  y_centered = y_train - y_train.mean()
@@ -329,182 +600,214 @@ class gbart:
329
600
  dof = len(y_train) - rank
330
601
  sigest2 = chisq / dof
331
602
  alpha = sigdf / 2
332
- invchi2 = jaxext.scipy.stats.invgamma.ppf(sigquant, alpha) / 2
603
+ invchi2 = invgamma.ppf(sigquant, alpha) / 2
333
604
  invchi2rid = invchi2 * sigdf
334
605
  return sigest2 / invchi2rid, jnp.sqrt(sigest2)
335
606
 
336
607
  @staticmethod
337
- def _process_offset_settings(y_train, offset):
608
+ def _check_type_settings(y_train, type, w): # noqa: A002
609
+ match type:
610
+ case 'wbart':
611
+ if y_train.dtype != jnp.float32:
612
+ msg = (
613
+ 'Continuous regression requires y_train.dtype=float32,'
614
+ f' got {y_train.dtype=} instead.'
615
+ )
616
+ raise TypeError(msg)
617
+ case 'pbart':
618
+ if w is not None:
619
+ msg = 'Binary regression does not support weights, set `w=None`'
620
+ raise ValueError(msg)
621
+ if y_train.dtype != bool:
622
+ msg = (
623
+ 'Binary regression requires y_train.dtype=bool,'
624
+ f' got {y_train.dtype=} instead.'
625
+ )
626
+ raise TypeError(msg)
627
+ case _:
628
+ msg = f'Invalid {type=}'
629
+ raise ValueError(msg)
630
+
631
+ @staticmethod
632
+ def _process_sparsity_settings(
633
+ x_train: Real[Array, 'p n'],
634
+ sparse: bool,
635
+ theta: FloatLike | None,
636
+ a: FloatLike,
637
+ b: FloatLike,
638
+ rho: FloatLike | None,
639
+ ) -> (
640
+ tuple[None, None, None, None]
641
+ | tuple[FloatLike, None, None, None]
642
+ | tuple[None, FloatLike, FloatLike, FloatLike]
643
+ ):
644
+ if not sparse:
645
+ return None, None, None, None
646
+ elif theta is not None:
647
+ return theta, None, None, None
648
+ else:
649
+ if rho is None:
650
+ p, _ = x_train.shape
651
+ rho = float(p)
652
+ return None, a, b, rho
653
+
654
+ @staticmethod
655
+ def _process_offset_settings(
656
+ y_train: Float32[Array, ' n'] | Bool[Array, ' n'],
657
+ offset: float | Float32[Any, ''] | None,
658
+ ) -> Float32[Array, '']:
338
659
  if offset is not None:
339
- return offset
660
+ return jnp.asarray(offset)
340
661
  elif y_train.size < 1:
341
- return 0
662
+ return jnp.array(0.0)
342
663
  else:
343
- return y_train.mean()
664
+ mean = y_train.mean()
344
665
 
345
- @staticmethod
346
- def _process_scale_settings(y_train, k):
347
- if y_train.size < 2:
348
- return 1
666
+ if y_train.dtype == bool:
667
+ bound = 1 / (1 + y_train.size)
668
+ mean = jnp.clip(mean, bound, 1 - bound)
669
+ return ndtri(mean)
349
670
  else:
350
- return (y_train.max() - y_train.min()) / (2 * k)
671
+ return mean
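A worked instance of the clipping above (a sketch; `ndtri` is the standard-normal quantile Phi^-1 imported at the top of this file): with 9 observations that are all `True`, the empirical mean 1 is clipped to 1 - 1/(1+9) = 0.9, so the offset becomes Phi^-1(0.9) ≈ 1.28, the ``Phi^-1(n/(n+1))`` rule stated in the class docstring.

    import jax.numpy as jnp
    from bartz.jaxext.scipy.special import ndtri

    n = 9
    bound = 1 / (1 + n)
    mean = jnp.clip(jnp.array(1.0), bound, 1 - bound)  # all-True responses -> clipped to 0.9
    offset = ndtri(mean)                               # ~1.2816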
351
672
 
352
673
  @staticmethod
353
- def _determine_splits(x_train, usequants, numcut):
354
- if usequants:
674
+ def _process_leaf_sdev_settings(
675
+ y_train: Float32[Array, ' n'] | Bool[Array, ' n'],
676
+ k: float,
677
+ ntree: int,
678
+ tau_num: FloatLike | None,
679
+ ):
680
+ if tau_num is None:
681
+ if y_train.dtype == bool:
682
+ tau_num = 3.0
683
+ elif y_train.size < 2:
684
+ tau_num = 1.0
685
+ else:
686
+ tau_num = (y_train.max() - y_train.min()) / 2
687
+
688
+ return tau_num / (k * math.sqrt(ntree))
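A quick numeric check of the formula above (made-up numbers, not from the source): with the defaults `k=2` and `ntree=200`, a response range of 10 gives `tau_num = 5` and a leaf prior standard deviation of about 0.18.

    import math

    tau_num = 10.0 / 2                           # default numerator: half the observed range
    sigma_mu = tau_num / (2.0 * math.sqrt(200))  # ~0.177, prior sd of each leaf value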
689
+
690
+ @staticmethod
691
+ def _determine_splits(
692
+ x_train: Real[Array, 'p n'],
693
+ usequants: bool,
694
+ numcut: int,
695
+ xinfo: Float[Array, 'p n'] | None,
696
+ ) -> tuple[Real[Array, 'p m'], UInt[Array, ' p']]:
697
+ if xinfo is not None:
698
+ if xinfo.ndim != 2 or xinfo.shape[0] != x_train.shape[0]:
699
+ msg = f'{xinfo.shape=} different from expected ({x_train.shape[0]}, *)'
700
+ raise ValueError(msg)
701
+ return prepcovars.parse_xinfo(xinfo)
702
+ elif usequants:
355
703
  return prepcovars.quantilized_splits_from_matrix(x_train, numcut + 1)
356
704
  else:
357
705
  return prepcovars.uniform_splits_from_matrix(x_train, numcut + 1)
358
706
 
359
707
  @staticmethod
360
- def _bin_predictors(x, splits):
708
+ def _bin_predictors(x, splits) -> UInt[Array, 'p n']:
361
709
  return prepcovars.bin_predictors(x, splits)
362
710
 
363
- @staticmethod
364
- def _transform_input(y, lamda, offset, scale):
365
- y = (y - offset) / scale
366
- lamda = lamda / (scale * scale)
367
- return y, lamda
368
-
369
711
  @staticmethod
370
712
  def _setup_mcmc(
371
- x_train,
372
- y_train,
373
- w,
374
- max_split,
375
- lamda,
376
- sigdf,
377
- power,
378
- base,
379
- maxdepth,
380
- ntree,
381
- initkw,
713
+ x_train: Real[Array, 'p n'],
714
+ y_train: Float32[Array, ' n'] | Bool[Array, ' n'],
715
+ offset: Float32[Array, ''],
716
+ w: Float[Array, ' n'] | None,
717
+ max_split: UInt[Array, ' p'],
718
+ lamda: Float32[Array, ''] | None,
719
+ sigma_mu: FloatLike,
720
+ sigdf: FloatLike,
721
+ power: FloatLike,
722
+ base: FloatLike,
723
+ maxdepth: int,
724
+ ntree: int,
725
+ init_kw: dict[str, Any] | None,
726
+ rm_const: bool | None,
727
+ theta: FloatLike | None,
728
+ a: FloatLike | None,
729
+ b: FloatLike | None,
730
+ rho: FloatLike | None,
382
731
  ):
383
732
  depth = jnp.arange(maxdepth - 1)
384
733
  p_nonterminal = base / (1 + depth).astype(float) ** power
385
- sigma2_alpha = sigdf / 2
386
- sigma2_beta = lamda * sigma2_alpha
734
+
735
+ if y_train.dtype == bool:
736
+ sigma2_alpha = None
737
+ sigma2_beta = None
738
+ else:
739
+ sigma2_alpha = sigdf / 2
740
+ sigma2_beta = lamda * sigma2_alpha
741
+
387
742
  kw = dict(
388
743
  X=x_train,
389
- y=y_train,
744
+ # copy y_train because it's going to be donated in the mcmc loop
745
+ y=jnp.array(y_train),
746
+ offset=offset,
390
747
  error_scale=w,
391
748
  max_split=max_split,
392
749
  num_trees=ntree,
393
750
  p_nonterminal=p_nonterminal,
751
+ sigma_mu2=jnp.square(sigma_mu),
394
752
  sigma2_alpha=sigma2_alpha,
395
753
  sigma2_beta=sigma2_beta,
754
+ min_points_per_decision_node=10,
396
755
  min_points_per_leaf=5,
756
+ theta=theta,
757
+ a=a,
758
+ b=b,
759
+ rho=rho,
397
760
  )
398
- if initkw is not None:
399
- kw.update(initkw)
761
+
762
+ if rm_const is None:
763
+ kw.update(filter_splitless_vars=False)
764
+ elif rm_const:
765
+ kw.update(filter_splitless_vars=True)
766
+ else:
767
+ n_empty = jnp.count_nonzero(max_split == 0)
768
+ if n_empty:
769
+ msg = f'There are {n_empty}/{max_split.size} predictors without decision rules'
770
+ raise ValueError(msg)
771
+ kw.update(filter_splitless_vars=False)
772
+
773
+ if init_kw is not None:
774
+ kw.update(init_kw)
775
+
400
776
  return mcmcstep.init(**kw)
401
777
 
402
778
  @staticmethod
403
- def _run_mcmc(mcmc_state, ndpost, nskip, keepevery, printevery, seed):
779
+ def _run_mcmc(
780
+ mcmc_state: mcmcstep.State,
781
+ ndpost: int,
782
+ nskip: int,
783
+ keepevery: int,
784
+ printevery: int | None,
785
+ seed: int | Integer[Array, ''] | Key[Array, ''],
786
+ run_mcmc_kw: dict | None,
787
+ sparse: bool,
788
+ ):
789
+ # prepare random generator seed
404
790
  if isinstance(seed, jax.Array) and jnp.issubdtype(
405
791
  seed.dtype, jax.dtypes.prng_key
406
792
  ):
407
- key = seed
793
+ key = seed.copy()
794
+ # copy because the inner loop in run_mcmc will donate the buffer
408
795
  else:
409
796
  key = jax.random.key(seed)
410
- callback = mcmcloop.make_simple_print_callback(printevery)
411
- return mcmcloop.run_mcmc(key, mcmc_state, nskip, ndpost, keepevery, callback)
412
797
 
413
- @staticmethod
414
- def _predict(trace, x):
415
- return mcmcloop.evaluate_trace(trace, x)
416
-
417
- @staticmethod
418
- def _transform_output(y, offset, scale):
419
- return offset + scale * y
420
-
421
- @staticmethod
422
- def _extract_sigma(trace, scale):
423
- return scale * jnp.sqrt(trace['sigma2'])
424
-
425
- def _show_tree(self, i_sample, i_tree, print_all=False):
426
- from . import debug
427
-
428
- trace = self._main_trace
429
- leaf_tree = trace['leaf_trees'][i_sample, i_tree]
430
- var_tree = trace['var_trees'][i_sample, i_tree]
431
- split_tree = trace['split_trees'][i_sample, i_tree]
432
- debug.print_tree(leaf_tree, var_tree, split_tree, print_all)
433
-
434
- def _sigma_harmonic_mean(self, prior=False):
435
- bart = self._mcmc_state
436
- if prior:
437
- alpha = bart['sigma2_alpha']
438
- beta = bart['sigma2_beta']
439
- else:
440
- resid = bart['resid']
441
- alpha = bart['sigma2_alpha'] + resid.size / 2
442
- norm2 = jnp.dot(
443
- resid, resid, preferred_element_type=bart['sigma2_beta'].dtype
798
+ # prepare arguments
799
+ kw = dict(n_burn=nskip, n_skip=keepevery, inner_loop_length=printevery)
800
+ kw.update(
801
+ mcmcloop.make_default_callback(
802
+ dot_every=None if printevery is None or printevery == 1 else 1,
803
+ report_every=printevery,
804
+ sparse_on_at=nskip // 2 if sparse else None,
444
805
  )
445
- beta = bart['sigma2_beta'] + norm2 / 2
446
- sigma2 = beta / alpha
447
- return jnp.sqrt(sigma2) * self.scale
448
-
449
- def _compare_resid(self):
450
- bart = self._mcmc_state
451
- resid1 = bart['resid']
452
- yhat = grove.evaluate_forest(
453
- bart['X'],
454
- bart['leaf_trees'],
455
- bart['var_trees'],
456
- bart['split_trees'],
457
- jnp.float32,
458
806
  )
459
- resid2 = bart['y'] - yhat
460
- return resid1, resid2
461
-
462
- def _avg_acc(self):
463
- trace = self._main_trace
464
-
465
- def acc(prefix):
466
- acc = trace[f'{prefix}_acc_count']
467
- prop = trace[f'{prefix}_prop_count']
468
- return acc.sum() / prop.sum()
469
-
470
- return acc('grow'), acc('prune')
471
-
472
- def _avg_prop(self):
473
- trace = self._main_trace
474
-
475
- def prop(prefix):
476
- return trace[f'{prefix}_prop_count'].sum()
477
-
478
- pgrow = prop('grow')
479
- pprune = prop('prune')
480
- total = pgrow + pprune
481
- return pgrow / total, pprune / total
482
-
483
- def _avg_move(self):
484
- agrow, aprune = self._avg_acc()
485
- pgrow, pprune = self._avg_prop()
486
- return agrow * pgrow, aprune * pprune
487
-
488
- def _depth_distr(self):
489
- from . import debug
490
-
491
- trace = self._main_trace
492
- split_trees = trace['split_trees']
493
- return debug.trace_depth_distr(split_trees)
494
-
495
- def _points_per_leaf_distr(self):
496
- from . import debug
497
-
498
- return debug.trace_points_per_leaf_distr(
499
- self._main_trace, self._mcmc_state['X']
500
- )
501
-
502
- def _check_trees(self):
503
- from . import debug
807
+ if run_mcmc_kw is not None:
808
+ kw.update(run_mcmc_kw)
504
809
 
505
- return debug.check_trace(self._main_trace, self._mcmc_state)
810
+ return mcmcloop.run_mcmc(key, mcmc_state, ndpost, **kw)
506
811
 
507
- def _tree_goes_bad(self):
508
- bad = self._check_trees().astype(bool)
509
- bad_before = jnp.pad(bad[:-1], [(1, 0), (0, 0)])
510
- return bad & ~bad_before
812
+ def _predict(self, x):
813
+ return mcmcloop.evaluate_trace(self._main_trace, x)