PyPI - bartz - Versions diffs - 0.0__py3-none-any.whl → 0.1.0__py3-none-any.whl - Mend

bartz 0.0__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

bartz/__init__.py +1 -1
bartz/_version.py +1 -0
bartz/debug.py +9 -27
bartz/grove.py +71 -118
bartz/interface.py +29 -32
bartz/mcmcloop.py +17 -8
bartz/mcmcstep.py +379 -427
{bartz-0.0.dist-info → bartz-0.1.0.dist-info}/METADATA +8 -7
bartz-0.1.0.dist-info/RECORD +13 -0
bartz-0.0.dist-info/RECORD +0 -12
{bartz-0.0.dist-info → bartz-0.1.0.dist-info}/LICENSE +0 -0
{bartz-0.0.dist-info → bartz-0.1.0.dist-info}/WHEEL +0 -0

bartz/__init__.py CHANGED Viewed

@@ -28,7 +28,7 @@ A jax implementation of BART
 See the manual at https://gattocrucco.github.io/bartz/docs
 """
-__version__ = '0.0'
+from ._version import __version__
 from .interface import BART

bartz/_version.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = '0.1.0'

bartz/debug.py CHANGED Viewed

@@ -7,22 +7,6 @@ from jax import lax
 from . import grove
 from . import mcmcstep
-def trace_evaluate_trees(bart, X):
-    """
-    Evaluate all trees, for all samples, at all x. Out axes:
-        0: mcmc sample
-        1: tree
-        2: X
-    """
-    def loop(_, bart):
-        return None, evaluate_all_trees(X, bart['leaf_trees'], bart['var_trees'], bart['split_trees'])
-    _, y = lax.scan(loop, None, bart)
-    return y
-@functools.partial(jax.vmap, in_axes=(None, 0, 0, 0)) # vectorize over forest
-def evaluate_all_trees(X, leaf_trees, var_trees, split_trees):
-    return grove.evaluate_tree_vmap_x(X, leaf_trees, var_trees, split_trees, jnp.float32)
 def print_tree(leaf_tree, var_tree, split_tree, print_all=False):
     tee = '├──'
@@ -65,15 +49,11 @@ def print_tree(leaf_tree, var_tree, split_tree, print_all=False):
         else:
             link = ' '
-        if print_all:
-            max_number = len(leaf_tree) - 1
-            ndigits = len(str(max_number))
-            number = str(index).rjust(ndigits)
-            number = f' {number} '
-        else:
-            number = ''
+        max_number = len(leaf_tree) - 1
+        ndigits = len(str(max_number))
+        number = str(index).rjust(ndigits)
-        print(f'{number}{indent}{first_indent}{link}{node_str}')
+        print(f' {number} {indent}{first_indent}{link}{node_str}')
         indent += next_indent
         unused = unused or is_leaf
@@ -101,8 +81,10 @@ def trace_depth_distr(split_trees_trace):
     return jax.vmap(forest_depth_distr)(split_trees_trace)
 def points_per_leaf_distr(var_tree, split_tree, X):
-    dummy = jnp.ones(X.shape[1], jnp.uint8)
-    _, count_tree = mcmcstep.agg_values(X, var_tree, split_tree, dummy, dummy.dtype)
+    traverse_tree = jax.vmap(grove.traverse_tree, in_axes=(1, None, None))
+    indices = traverse_tree(X, var_tree, split_tree)
+    count_tree = jnp.zeros(2 * split_tree.size, dtype=grove.minimal_unsigned_dtype(indices.size))
+    count_tree = count_tree.at[indices].add(1)
     is_leaf = grove.is_actual_leaf(split_tree, add_bottom_level=True).view(jnp.uint8)
     return jnp.bincount(count_tree, is_leaf, length=X.shape[1] + 1)
@@ -129,7 +111,7 @@ def check_sizes(leaf_tree, var_tree, split_tree, max_split):
     return leaf_tree.size == 2 * var_tree.size == 2 * split_tree.size
 def check_unused_node(leaf_tree, var_tree, split_tree, max_split):
-    return (leaf_tree[0] == 0) & (var_tree[0] == 0) & (split_tree[0] == 0)
+    return (var_tree[0] == 0) & (split_tree[0] == 0)
 def check_leaf_values(leaf_tree, var_tree, split_tree, max_split):
     return jnp.all(jnp.isfinite(leaf_tree))

bartz/grove.py CHANGED Viewed

@@ -28,13 +28,15 @@ Functions to create and manipulate binary trees.
 A tree is represented with arrays as a heap. The root node is at index 1. The children nodes of a node at index :math:`i` are at indices :math:`2i` (left child) and :math:`2i + 1` (right child). The array element at index 0 is unused.
-A decision tree is represented by tree arrays: 'leaf', 'var', and 'split'. The 'leaf' array contains the values in the leaves. The 'var' array contains the axes along which the decision nodes operate. The 'split' array contains the decision boundaries.
+A decision tree is represented by tree arrays: 'leaf', 'var', and 'split'.
-Whether a node is a leaf is indicated by the corresponding 'split' element being 0.
+The 'leaf' array contains the values in the leaves.
-Since the nodes at the bottom can only be leaves and not decision nodes, the 'var' and 'split' arrays have half the length of the 'leaf' array.
+The 'var' array contains the axes along which the decision nodes operate.
+The 'split' array contains the decision boundaries. The boundaries are open on the right, i.e., a point belongs to the left child iff x < split. Whether a node is a leaf is indicated by the corresponding 'split' element being 0.
-The unused array element at index 0 is always fixed to 0 by convention.
+Since the nodes at the bottom can only be leaves and not decision nodes, the 'var' and 'split' arrays have half the length of the 'leaf' array.
 """
@@ -42,6 +44,7 @@ import functools
 import math
 import jax
 from jax import numpy as jnp
 from jax import lax
@@ -63,24 +66,18 @@ def make_tree(depth, dtype):
     -------
     tree : array
         An array of zeroes with shape (2 ** depth,).
-    Notes
-    -----
-    The tree is represented as a heap, with the root node at index 1, and the
-    children of the node at index i at indices 2 * i and 2 * i + 1. The element
-    at index 0 is unused.
     """
     return jnp.zeros(2 ** depth, dtype)
 def tree_depth(tree):
     """
-    Return the maximum depth of a binary tree created by `make_tree`.
+    Return the maximum depth of a tree.
     Parameters
     ----------
     tree : array
-        A binary tree created by `make_tree`. If the array is ND, the tree
-        structure is assumed to be along the last axis.
+        A tree created by `make_tree`. If the array is ND, the tree structure is
+        assumed to be along the last axis.
     Returns
     -------
@@ -89,120 +86,97 @@ def tree_depth(tree):
     """
     return int(round(math.log2(tree.shape[-1])))
-def evaluate_tree(X, leaf_trees, var_trees, split_trees, out_dtype):
+def traverse_tree(x, var_tree, split_tree):
     """
-    Evaluate a decision tree or forest.
+    Find the leaf where a point falls into.
     Parameters
     ----------
-    X : array (p,)
+    x : array (p,)
         The coordinates to evaluate the tree at.
-    leaf_trees : array (n,) or (m, n)
-        The leaf values of the tree or forest. If the input is a forest, the
-        first axis is the tree index, and the values are summed.
-    var_trees : array (n,) or (m, n)
-        The variable indices of the tree or forest. Each index is in [0, p) and
-        indicates which value of `X` to consider.
-    split_trees : array (n,) or (m, n)
-        The split values of the tree or forest. Leaf nodes are indicated by the
-        condition `split == 0`. If non-zero, the node has children, and its left
-        children is assigned points which satisfy `x < split`.
-    out_dtype : dtype
-        The dtype of the output.
+    var_tree : array (2 ** (d - 1),)
+        The decision axes of the tree.
+    split_tree : array (2 ** (d - 1),)
+        The decision boundaries of the tree.
     Returns
     -------
-    out : scalar
-        The value of the tree or forest at the given point.
+    index : int
+        The index of the leaf.
     """
-    is_forest = leaf_trees.ndim == 2
-    if is_forest:
-        m, _ = leaf_trees.shape
-        forest_shape = m,
-        tree_index = jnp.arange(m, dtype=minimal_unsigned_dtype(m - 1)),
-    else:
-        forest_shape = ()
-        tree_index = ()
     carry = (
-        jnp.zeros(forest_shape, bool),
-        jnp.zeros((), out_dtype),
-        jnp.ones(forest_shape, minimal_unsigned_dtype(leaf_trees.shape[-1] - 1))
+        jnp.zeros((), bool),
+        jnp.ones((), minimal_unsigned_dtype(2 * var_tree.size - 1)),
     )
     def loop(carry, _):
-        leaf_found, out, node_index = carry
-        is_leaf = split_trees.at[tree_index + (node_index,)].get(mode='fill', fill_value=0) == 0
-        leaf_value = leaf_trees[tree_index + (node_index,)]
-        if is_forest:
-            leaf_sum = jnp.sum(leaf_value, where=is_leaf) # TODO set dtype to large float
-                # alternative: dot(is_leaf, leaf_value):
-                # - maybe faster
-                # - maybe less accurate
-                # - fucked by nans
-        else:
-            leaf_sum = jnp.where(is_leaf, leaf_value, 0)
-        out += leaf_sum
-        leaf_found |= is_leaf
-        split = split_trees.at[tree_index + (node_index,)].get(mode='fill', fill_value=0)
-        var = var_trees.at[tree_index + (node_index,)].get(mode='fill', fill_value=0)
-        x = X[var]
+        leaf_found, index = carry
+        split = split_tree.at[index].get(mode='fill', fill_value=0)
+        var = var_tree.at[index].get(mode='fill', fill_value=0)
-        node_index <<= 1
-        node_index += x >= split
-        node_index = jnp.where(leaf_found, 0, node_index)
+        leaf_found |= split_tree.at[index].get(mode='fill', fill_value=0) == 0
+        child_index = (index << 1) + (x[var] >= split)
+        index = jnp.where(leaf_found, index, child_index)
-        carry = leaf_found, out, node_index
-        return carry, _
+        return (leaf_found, index), None
-    depth = tree_depth(leaf_trees)
-    (_, out, _), _ = lax.scan(loop, carry, None, depth)
-    return out
+        # TODO
+        # - unroll (how much? 5?)
+        # - separate and special-case the last iteration
-def minimal_unsigned_dtype(max_value):
-    """
-    Return the smallest unsigned integer dtype that can represent a given
-    maximum value.
-    """
-    if max_value < 2 ** 8:
-        return jnp.uint8
-    if max_value < 2 ** 16:
-        return jnp.uint16
-    if max_value < 2 ** 32:
-        return jnp.uint32
-    return jnp.uint64
+    depth = 1 + tree_depth(var_tree)
+    (_, index), _ = lax.scan(loop, carry, None, depth)
+    return index
-@functools.partial(jaxext.vmap_nodoc, in_axes=(1, None, None, None, None), out_axes=0)
-def evaluate_tree_vmap_x(X, leaf_trees, var_trees, split_trees, out_dtype):
+def evaluate_forest(X, leaf_trees, var_trees, split_trees, dtype):
     """
-    Evaluate a decision tree or forest over multiple points.
+    Evaluate a ensemble of trees at an array of points.
     Parameters
     ----------
     X : array (p, n)
-        The points to evaluate the tree at.
-    leaf_trees : array (n,) or (m, n)
+        The coordinates to evaluate the trees at.
+    leaf_trees : (m, 2 ** d)
         The leaf values of the tree or forest. If the input is a forest, the
         first axis is the tree index, and the values are summed.
-    var_trees : array (n,) or (m, n)
-        The variable indices of the tree or forest. Each index is in [0, p) and
-        indicates which value of `X` to consider.
-    split_trees : array (n,) or (m, n)
-        The split values of the tree or forest. Leaf nodes are indicated by the
-        condition `split == 0`. If non-zero, the node has children, and its left
-        children is assigned points which satisfy `x < split`.
-    out_dtype : dtype
+    var_trees : array (m, 2 ** (d - 1))
+        The decision axes of the trees.
+    split_trees : array (m, 2 ** (d - 1))
+        The decision boundaries of the trees.
+    dtype : dtype
         The dtype of the output.
     Returns
     -------
-    out : (n,)
-        The value of the tree or forest at each point.
+    out : array (n,)
+        The sum of the values of the trees at the points in `X`.
     """
-    return evaluate_tree(X, leaf_trees, var_trees, split_trees, out_dtype)
+    indices = _traverse_forest(X, var_trees, split_trees)
+    ntree, _ = leaf_trees.shape
+    tree_index = jnp.arange(ntree, dtype=minimal_unsigned_dtype(ntree - 1))[:, None]
+    leaves = leaf_trees[tree_index, indices]
+    return jnp.sum(leaves, axis=0, dtype=dtype)
+        # this sum suggests to swap the vmaps, but I think it's better for X copying to keep it that way
+@functools.partial(jax.vmap, in_axes=(None, 0, 0))
+@functools.partial(jax.vmap, in_axes=(1, None, None))
+def _traverse_forest(X, var_trees, split_trees):
+    return traverse_tree(X, var_trees, split_trees)
+def minimal_unsigned_dtype(max_value):
+    """
+    Return the smallest unsigned integer dtype that can represent a given
+    maximum value.
+    """
+    if max_value < 2 ** 8:
+        return jnp.uint8
+    if max_value < 2 ** 16:
+        return jnp.uint16
+    if max_value < 2 ** 32:
+        return jnp.uint32
+    return jnp.uint64
 def is_actual_leaf(split_tree, *, add_bottom_level=False):
     """
@@ -239,7 +213,7 @@ def is_leaves_parent(split_tree):
     Parameters
     ----------
     split_tree : int array (2 ** (d - 1),)
-        The splitting points of the tree.
+        The decision boundaries of the tree.
     Returns
     -------
@@ -279,24 +253,3 @@ def tree_depths(tree_length):
         depths.append(depth - 1)
     depths[0] = 0
     return jnp.array(depths, minimal_unsigned_dtype(max(depths)))
-def index_depth(index, tree_length):
-    """
-    Return the depth of a node in a binary tree.
-    Parameters
-    ----------
-    index : int
-        The index of the node.
-    tree_length : int
-        The length of the tree array, i.e., 2 ** d.
-    Returns
-    -------
-    depth : int
-        The depth of the node. The root node (index 1) has depth 0. The depth is
-        the position of the most significant non-zero bit in the index. If
-        ``index == 0``, return -1.
-    """
-    depths = tree_depths(tree_length)
-    return depths[index]

bartz/interface.py CHANGED Viewed

@@ -38,7 +38,7 @@ class BART:
     Nonparametric regression with Bayesian Additive Regression Trees (BART).
     Regress `y_train` on `x_train` with a latent mean function represented as
-    a sum of decision trees. The inference is carried out by estimating the
+    a sum of decision trees. The inference is carried out by sampling the
     posterior distribution of the tree ensemble with an MCMC.
     Parameters
@@ -86,7 +86,7 @@ class BART:
         predictor is binned such that its distribution in `x_train` is
         approximately uniform across bins. The number of bins is at most the
         number of unique values appearing in `x_train`, or ``numcut + 1``.
-        Before running the algorithm, the predictors are compressed to th
+        Before running the algorithm, the predictors are compressed to the
         smallest integer type that fits the bin indices, so `numcut` is best set
         to the maximum value of an unsigned integer type.
     ndpost : int, default 1000
@@ -102,14 +102,6 @@ class BART:
     Attributes
     ----------
-    offset : float
-        The prior mean of the latent mean function.
-    scale : float
-        The prior standard deviation of the latent mean function.
-    lamda : float
-        The prior harmonic mean of the error variance.
-    ntree : int
-        The number of trees.
     yhat_train : array (ndpost, n)
         The conditional posterior mean at `x_train` for each MCMC iteration.
     yhat_train_mean : array (n,)
@@ -122,6 +114,18 @@ class BART:
         The standard deviation of the error.
     first_sigma : array (nskip,)
         The standard deviation of the error in the burn-in phase.
+    offset : float
+        The prior mean of the latent mean function.
+    scale : float
+        The prior standard deviation of the latent mean function.
+    lamda : float
+        The prior harmonic mean of the error variance.
+    sigest : float or None
+        The estimated standard deviation of the error used to set `lamda`.
+    ntree : int
+        The number of trees.
+    maxdepth : int
+        The maximum depth of the trees.
     Methods
     -------
@@ -166,17 +170,17 @@ class BART:
         y_train, y_train_fmt = self._process_response_input(y_train)
         self._check_same_length(x_train, y_train)
-        lamda = self._process_noise_variance_settings(x_train, y_train, sigest, sigdf, sigquant, lamda)
         offset = self._process_offset_settings(y_train, offset)
         scale = self._process_scale_settings(y_train, k)
+        lamda, sigest = self._process_noise_variance_settings(x_train, y_train, sigest, sigdf, sigquant, lamda, offset)
         splits, max_split = self._determine_splits(x_train, numcut)
         x_train = self._bin_predictors(x_train, splits)
         y_train = self._transform_input(y_train, offset, scale)
-        lamda = lamda / scale
+        lamda_scaled = lamda / (scale * scale)
-        mcmc_state = self._setup_mcmc(x_train, y_train, max_split, lamda, sigdf, power, base, maxdepth, ntree)
+        mcmc_state = self._setup_mcmc(x_train, y_train, max_split, lamda_scaled, sigdf, power, base, maxdepth, ntree)
         final_state, burnin_trace, main_trace = self._run_mcmc(mcmc_state, ndpost, nskip, keepevery, printevery, seed)
         sigma = self._extract_sigma(main_trace, scale)
@@ -184,8 +188,10 @@ class BART:
         self.offset = offset
         self.scale = scale
-        self.lamda = lamda * scale
+        self.lamda = lamda
+        self.sigest = sigest
         self.ntree = ntree
+        self.maxdepth = maxdepth
         self.sigma = sigma
         self.first_sigma = first_sigma
@@ -261,25 +267,25 @@ class BART:
         assert get_length(x1) == get_length(x2)
     @staticmethod
-    def _process_noise_variance_settings(x_train, y_train, sigest, sigdf, sigquant, lamda):
+    def _process_noise_variance_settings(x_train, y_train, sigest, sigdf, sigquant, lamda, offset):
         if lamda is not None:
-            return lamda
+            return lamda, None
         else:
             if sigest is not None:
                 sigest2 = sigest * sigest
             elif y_train.size < 2:
                 sigest2 = 1
             elif y_train.size <= x_train.shape[0]:
-                sigest2 = jnp.var(y_train)
+                sigest2 = jnp.var(y_train - offset)
             else:
-                _, chisq, rank, _ = jnp.linalg.lstsq(x_train.T, y_train)
+                _, chisq, rank, _ = jnp.linalg.lstsq(x_train.T, y_train - offset)
                 chisq = chisq.squeeze(0)
                 dof = len(y_train) - rank
                 sigest2 = chisq / dof
             alpha = sigdf / 2
             invchi2 = jaxext.scipy.stats.invgamma.ppf(sigquant, alpha) / 2
             invchi2rid = invchi2 * sigdf
-            return sigest2 / invchi2rid
+            return sigest2 / invchi2rid, jnp.sqrt(sigest2)
     @staticmethod
     def _process_offset_settings(y_train, offset):
@@ -315,7 +321,7 @@ class BART:
         p_nonterminal = base / (1 + depth).astype(float) ** power
         sigma2_alpha = sigdf / 2
         sigma2_beta = lamda * sigma2_alpha
-        return mcmcstep.make_bart(
+        return mcmcstep.init(
             X=x_train,
             y=y_train,
             max_split=max_split,
@@ -348,13 +354,6 @@ class BART:
         return scale * jnp.sqrt(trace['sigma2'])
-    def _predict_debug(self, x_test):
-        from . import debug
-        x_test, x_test_fmt = self._process_predictor_input(x_test)
-        self._check_compatible_formats(x_test_fmt, self._x_train_fmt)
-        x_test = self._bin_predictors(x_test, self._splits)
-        return debug.trace_evaluate_trees(self._main_trace, x_test)
     def _show_tree(self, i_sample, i_tree, print_all=False):
         from . import debug
         trace = self._main_trace
@@ -379,7 +378,7 @@ class BART:
     def _compare_resid(self):
         bart = self._mcmc_state
         resid1 = bart['resid']
-        yhat = grove.evaluate_tree_vmap_x(bart['X'], bart['leaf_trees'], bart['var_trees'], bart['split_trees'], jnp.float32)
+        yhat = grove.evaluate_forest(bart['X'], bart['leaf_trees'], bart['var_trees'], bart['split_trees'], jnp.float32)
         resid2 = bart['y'] - yhat
         return resid1, resid2
@@ -421,7 +420,5 @@ class BART:
     def _tree_goes_bad(self):
         bad = self._check_trees().astype(bool)
-        bad_before = bad[:-1]
-        bad_after = bad[1:]
-        goes_bad = bad_after & ~bad_before
-        return jnp.pad(goes_bad, [(1, 0), (0, 0)])
+        bad_before = jnp.pad(bad[:-1], [(1, 0), (0, 0)])
+        return bad & ~bad_before

bartz/mcmcloop.py CHANGED Viewed

@@ -100,15 +100,21 @@ def run_mcmc(bart, n_burn, n_save, n_skip, callback, key):
     def inner_loop(carry, _, tracelist, burnin):
         bart, i_total, i_skip, key = carry
         key, subkey = random.split(key)
-        bart = mcmcstep.mcmc_step(bart, subkey)
+        bart = mcmcstep.step(bart, subkey)
         callback(bart=bart, burnin=burnin, i_total=i_total, i_skip=i_skip, **callback_kw)
         output = {key: bart[key] for key in tracelist}
         return (bart, i_total + 1, i_skip + 1, key), output
-    # TODO avoid invoking this altogether if burnin is 0 to shorten compilation time & size
-    carry = bart, 0, 0, key
-    burnin_loop = functools.partial(inner_loop, tracelist=tracelist_burnin, burnin=True)
-    (bart, i_total, _, key), burnin_trace = lax.scan(burnin_loop, carry, None, n_burn)
+    if n_burn > 0:
+        carry = bart, 0, 0, key
+        burnin_loop = functools.partial(inner_loop, tracelist=tracelist_burnin, burnin=True)
+        (bart, i_total, _, key), burnin_trace = lax.scan(burnin_loop, carry, None, n_burn)
+    else:
+        i_total = 0
+        burnin_trace = {
+            key: jnp.empty((0,) + bart[key].shape, bart[key].dtype)
+            for key in tracelist_burnin
+        }
     def outer_loop(carry, _):
         bart, i_total, key = carry
@@ -148,14 +154,17 @@ def make_simple_print_callback(printevery):
         prune_prop = bart['prune_prop_count'] / prop_total
         grow_acc = bart['grow_acc_count'] / bart['grow_prop_count']
         prune_acc = bart['prune_acc_count'] / bart['prune_prop_count']
-        n_total = n_burn + n_save
+        n_total = n_burn + n_save * n_skip
         debug.callback(simple_print_callback_impl, burnin, i_total, n_total, grow_prop, grow_acc, prune_prop, prune_acc, printevery)
     return callback
 def simple_print_callback_impl(burnin, i_total, n_total, grow_prop, grow_acc, prune_prop, prune_acc, printevery):
     if (i_total + 1) % printevery == 0:
         burnin_flag = ' (burnin)' if burnin else ''
-        print(f'Iteration {i_total + 1:4d}/{n_total:d} '
+        total_str = str(n_total)
+        ndigits = len(total_str)
+        i_str = str(i_total + 1).rjust(ndigits)
+        print(f'Iteration {i_str}/{total_str} '
             f'P_grow={grow_prop:.2f} P_prune={prune_prop:.2f} '
             f'A_grow={grow_acc:.2f} A_prune={prune_acc:.2f}{burnin_flag}')
@@ -177,6 +186,6 @@ def evaluate_trace(trace, X):
         The predictions for each iteration of the MCMC.
     """
     def loop(_, state):
-        return None, grove.evaluate_tree_vmap_x(X, state['leaf_trees'], state['var_trees'], state['split_trees'], jnp.float32)
+        return None, grove.evaluate_forest(X, state['leaf_trees'], state['var_trees'], state['split_trees'], jnp.float32)
     _, y = lax.scan(loop, None, trace)
     return y

bartz 0.0__py3-none-any.whl → 0.1.0__py3-none-any.whl

bartz 0.0__py3-none-any.whl → 0.1.0__py3-none-any.whl