PyPI - bartz - Versions diffs - 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl - Mend

bartz 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

bartz/.DS_Store +0 -0
bartz/BART/__init__.py +27 -0
bartz/BART/_gbart.py +522 -0
bartz/__init__.py +6 -4
bartz/_interface.py +937 -0
bartz/_profiler.py +318 -0
bartz/_version.py +1 -1
bartz/debug.py +1217 -82
bartz/grove.py +205 -103
bartz/jaxext/__init__.py +287 -0
bartz/jaxext/_autobatch.py +444 -0
bartz/jaxext/scipy/__init__.py +25 -0
bartz/jaxext/scipy/special.py +239 -0
bartz/jaxext/scipy/stats.py +36 -0
bartz/mcmcloop.py +662 -314
bartz/mcmcstep/__init__.py +35 -0
bartz/mcmcstep/_moves.py +904 -0
bartz/mcmcstep/_state.py +1114 -0
bartz/mcmcstep/_step.py +1603 -0
bartz/prepcovars.py +140 -44
bartz/testing/__init__.py +29 -0
bartz/testing/_dgp.py +442 -0
{bartz-0.6.0.dist-info → bartz-0.8.0.dist-info}/METADATA +18 -13
bartz-0.8.0.dist-info/RECORD +25 -0
{bartz-0.6.0.dist-info → bartz-0.8.0.dist-info}/WHEEL +1 -1
bartz/BART.py +0 -603
bartz/jaxext.py +0 -423
bartz/mcmcstep.py +0 -2335
bartz-0.6.0.dist-info/RECORD +0 -13

bartz/grove.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # bartz/src/bartz/grove.py
 #
-# Copyright (c) 2024-2025, Giacomo Petrillo
+# Copyright (c) 2024-2026, The Bartz Contributors
 #
 # This file is part of bartz.
 #
@@ -22,93 +22,122 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
-"""
+"""Functions to create and manipulate binary decision trees."""
-Functions to create and manipulate binary trees.
+import math
+from functools import partial
+from typing import Protocol
-A tree is represented with arrays as a heap. The root node is at index 1. The children nodes of a node at index :math:`i` are at indices :math:`2i` (left child) and :math:`2i + 1` (right child). The array element at index 0 is unused.
+from jax import jit, lax, vmap
+from jax import numpy as jnp
+from jaxtyping import Array, Bool, DTypeLike, Float32, Int32, Shaped, UInt
-A decision tree is represented by tree arrays: 'leaf', 'var', and 'split'.
+try:
+    from numpy.lib.array_utils import normalize_axis_tuple  # numpy 2
+except ImportError:
+    from numpy.core.numeric import normalize_axis_tuple  # numpy 1
-The 'leaf' array contains the values in the leaves.
+from bartz.jaxext import minimal_unsigned_dtype, vmap_nodoc
-The 'var' array contains the axes along which the decision nodes operate.
-The 'split' array contains the decision boundaries. The boundaries are open on the right, i.e., a point belongs to the left child iff x < split. Whether a node is a leaf is indicated by the corresponding 'split' element being 0. Unused nodes also have split set to 0.
+class TreeHeaps(Protocol):
+    """A protocol for dataclasses that represent trees.
-Since the nodes at the bottom can only be leaves and not decision nodes, the 'var' and 'split' arrays have half the length of the 'leaf' array.
+    A tree is represented with arrays as a heap. The root node is at index 1.
+    The children nodes of a node at index :math:`i` are at indices :math:`2i`
+    (left child) and :math:`2i + 1` (right child). The array element at index 0
+    is unused.
-"""
+    Since the nodes at the bottom can only be leaves and not decision nodes,
+    `var_tree` and `split_tree` are half as long as `leaf_tree`.
-import functools
-import math
+    Arrays may have additional initial axes to represent multiple trees.
+    """
-import jax
-from jax import lax
-from jax import numpy as jnp
+    leaf_tree: (
+        Float32[Array, '*batch_shape 2**d'] | Float32[Array, '*batch_shape k 2**d']
+    )
+    """The values in the leaves of the trees. This array can be dirty, i.e.,
+    unused nodes can have whatever value. It may have an additional axis
+    for multivariate leaves."""
-from . import jaxext
+    var_tree: UInt[Array, '*batch_shape 2**(d-1)']
+    """The axes along which the decision nodes operate. This array can be dirty,
+    except for the always-unused node at index 0, which must be set to 0."""
+    split_tree: UInt[Array, '*batch_shape 2**(d-1)']
+    """The decision boundaries of the trees. The boundaries are open on the
+    right, i.e., a point belongs to the left child iff x < split. Whether a
+    node is a leaf is indicated by the corresponding 'split' element being
+    0. Unused nodes also have split set to 0. This array can't be dirty."""
-def make_tree(depth, dtype):
+def make_tree(
+    depth: int, dtype: DTypeLike, batch_shape: tuple[int, ...] = ()
+) -> Shaped[Array, '*batch_shape 2**{depth}']:
     """
     Make an array to represent a binary tree.
     Parameters
     ----------
-    depth : int
+    depth
         The maximum depth of the tree. Depth 1 means that there is only a root
         node.
-    dtype : dtype
+    dtype
         The dtype of the array.
+    batch_shape
+        The leading shape of the array, to represent multiple trees and/or
+        multivariate trees.
     Returns
     -------
-    tree : array
-        An array of zeroes with shape (2 ** depth,).
+    An array of zeroes with the appropriate shape.
     """
-    return jnp.zeros(2**depth, dtype)
+    shape = (*batch_shape, 2**depth)
+    return jnp.zeros(shape, dtype)
-def tree_depth(tree):
+def tree_depth(tree: Shaped[Array, '*batch_shape 2**d']) -> int:
     """
     Return the maximum depth of a tree.
     Parameters
     ----------
-    tree : array
+    tree
         A tree created by `make_tree`. If the array is ND, the tree structure is
         assumed to be along the last axis.
     Returns
     -------
-    depth : int
-        The maximum depth of the tree.
+    The maximum depth of the tree.
     """
-    return int(round(math.log2(tree.shape[-1])))
+    return round(math.log2(tree.shape[-1]))
-def traverse_tree(x, var_tree, split_tree):
+def traverse_tree(
+    x: UInt[Array, ' p'],
+    var_tree: UInt[Array, ' 2**(d-1)'],
+    split_tree: UInt[Array, ' 2**(d-1)'],
+) -> UInt[Array, '']:
     """
     Find the leaf where a point falls into.
     Parameters
     ----------
-    x : array (p,)
+    x
         The coordinates to evaluate the tree at.
-    var_tree : array (2 ** (d - 1),)
+    var_tree
         The decision axes of the tree.
-    split_tree : array (2 ** (d - 1),)
+    split_tree
         The decision boundaries of the tree.
     Returns
     -------
-    index : int
-        The index of the leaf.
+    The index of the leaf.
     """
     carry = (
         jnp.zeros((), bool),
-        jnp.ones((), jaxext.minimal_unsigned_dtype(2 * var_tree.size - 1)),
+        jnp.ones((), minimal_unsigned_dtype(2 * var_tree.size - 1)),
     )
     def loop(carry, _):
@@ -128,111 +157,132 @@ def traverse_tree(x, var_tree, split_tree):
     return index
-@functools.partial(jaxext.vmap_nodoc, in_axes=(None, 0, 0))
-@functools.partial(jaxext.vmap_nodoc, in_axes=(1, None, None))
-def traverse_forest(X, var_trees, split_trees):
+@jit
+@partial(jnp.vectorize, excluded=(0,), signature='(hts),(hts)->(n)')
+@partial(vmap_nodoc, in_axes=(1, None, None))
+def traverse_forest(
+    X: UInt[Array, 'p n'],
+    var_trees: UInt[Array, '*forest_shape 2**(d-1)'],
+    split_trees: UInt[Array, '*forest_shape 2**(d-1)'],
+) -> UInt[Array, '*forest_shape n']:
     """
-    Find the leaves where points fall into.
+    Find the leaves where points fall into for each tree in a set.
     Parameters
     ----------
-    X : array (p, n)
+    X
         The coordinates to evaluate the trees at.
-    var_trees : array (m, 2 ** (d - 1))
+    var_trees
         The decision axes of the trees.
-    split_trees : array (m, 2 ** (d - 1))
+    split_trees
         The decision boundaries of the trees.
     Returns
     -------
-    indices : array (m, n)
-        The indices of the leaves.
+    The indices of the leaves.
     """
     return traverse_tree(X, var_trees, split_trees)
-def evaluate_forest(X, leaf_trees, var_trees, split_trees, dtype=None, sum_trees=True):
+@partial(jit, static_argnames=('sum_batch_axis',))
+def evaluate_forest(
+    X: UInt[Array, 'p n'],
+    trees: TreeHeaps,
+    *,
+    sum_batch_axis: int | tuple[int, ...] = (),
+) -> (
+    Float32[Array, '*reduced_batch_size n'] | Float32[Array, '*reduced_batch_size k n']
+):
     """
-    Evaluate a ensemble of trees at an array of points.
+    Evaluate an ensemble of trees at an array of points.
     Parameters
     ----------
-    X : array (p, n)
+    X
         The coordinates to evaluate the trees at.
-    leaf_trees : array (m, 2 ** d)
-        The leaf values of the tree or forest. If the input is a forest, the
-        first axis is the tree index, and the values are summed.
-    var_trees : array (m, 2 ** (d - 1))
-        The decision axes of the trees.
-    split_trees : array (m, 2 ** (d - 1))
-        The decision boundaries of the trees.
-    dtype : dtype, optional
-        The dtype of the output. Ignored if `sum_trees` is `False`.
-    sum_trees : bool, default True
-        Whether to sum the values across trees.
+    trees
+        The trees.
+    sum_batch_axis
+        The batch axes to sum over. By default, no summation is performed.
+        Note that negative indices count from the end of the batch dimensions;
+        the core dimensions n and k can't be summed over by this function.
     Returns
     -------
-    out : array (n,) or (m, n)
-        The (sum of) the values of the trees at the points in `X`.
+    The (sum of) the values of the trees at the points in `X`.
     """
-    indices = traverse_forest(X, var_trees, split_trees)
-    ntree, _ = leaf_trees.shape
-    tree_index = jnp.arange(ntree, dtype=jaxext.minimal_unsigned_dtype(ntree - 1))
-    leaves = leaf_trees[tree_index[:, None], indices]
-    if sum_trees:
-        return jnp.sum(leaves, axis=0, dtype=dtype)
-    # this sum suggests to swap the vmaps, but I think it's better for X
-    # copying to keep it that way
-    else:
-        return leaves
-def is_actual_leaf(split_tree, *, add_bottom_level=False):
+    indices: UInt[Array, '*forest_shape n']
+    indices = traverse_forest(X, trees.var_tree, trees.split_tree)
+    is_mv = trees.leaf_tree.ndim != trees.var_tree.ndim
+    bc_indices: UInt[Array, '*forest_shape n 1'] | UInt[Array, '*forest_shape 1 n 1']
+    bc_indices = indices[..., None, :, None] if is_mv else indices[..., None]
+    bc_leaf_tree: (
+        Float32[Array, '*forest_shape 1 tree_size']
+        | Float32[Array, '*forest_shape k 1 tree_size']
+    )
+    bc_leaf_tree = (
+        trees.leaf_tree[..., :, None, :] if is_mv else trees.leaf_tree[..., None, :]
+    )
+    bc_leaves: (
+        Float32[Array, '*forest_shape n 1'] | Float32[Array, '*forest_shape k n 1']
+    )
+    bc_leaves = jnp.take_along_axis(bc_leaf_tree, bc_indices, -1)
+    leaves: Float32[Array, '*forest_shape n'] | Float32[Array, '*forest_shape k n']
+    leaves = jnp.squeeze(bc_leaves, -1)
+    axis = normalize_axis_tuple(sum_batch_axis, trees.var_tree.ndim - 1)
+    return jnp.sum(leaves, axis=axis)
+def is_actual_leaf(
+    split_tree: UInt[Array, ' 2**(d-1)'], *, add_bottom_level: bool = False
+) -> Bool[Array, ' 2**(d-1)'] | Bool[Array, ' 2**d']:
     """
     Return a mask indicating the leaf nodes in a tree.
     Parameters
     ----------
-    split_tree : int array (2 ** (d - 1),)
+    split_tree
         The splitting points of the tree.
-    add_bottom_level : bool, default False
+    add_bottom_level
         If True, the bottom level of the tree is also considered.
     Returns
     -------
-    is_actual_leaf : bool array (2 ** (d - 1) or 2 ** d,)
-        The mask indicating the leaf nodes. The length is doubled if
-        `add_bottom_level` is True.
+    The mask marking the leaf nodes. Length doubled if `add_bottom_level` is True.
     """
     size = split_tree.size
     is_leaf = split_tree == 0
     if add_bottom_level:
         size *= 2
         is_leaf = jnp.concatenate([is_leaf, jnp.ones_like(is_leaf)])
-    index = jnp.arange(size, dtype=jaxext.minimal_unsigned_dtype(size - 1))
+    index = jnp.arange(size, dtype=minimal_unsigned_dtype(size - 1))
     parent_index = index >> 1
     parent_nonleaf = split_tree[parent_index].astype(bool)
     parent_nonleaf = parent_nonleaf.at[1].set(True)
     return is_leaf & parent_nonleaf
-def is_leaves_parent(split_tree):
+def is_leaves_parent(split_tree: UInt[Array, ' 2**(d-1)']) -> Bool[Array, ' 2**(d-1)']:
     """
     Return a mask indicating the nodes with leaf (and only leaf) children.
     Parameters
     ----------
-    split_tree : int array (2 ** (d - 1),)
+    split_tree
         The decision boundaries of the tree.
     Returns
     -------
-    is_leaves_parent : bool array (2 ** (d - 1),)
-        The mask indicating which nodes have leaf children.
+    The mask indicating which nodes have leaf children.
     """
     index = jnp.arange(
-        split_tree.size, dtype=jaxext.minimal_unsigned_dtype(2 * split_tree.size - 1)
+        split_tree.size, dtype=minimal_unsigned_dtype(2 * split_tree.size - 1)
     )
     left_index = index << 1  # left child
     right_index = left_index + 1  # right child
@@ -243,45 +293,50 @@ def is_leaves_parent(split_tree):
     # the 0-th item has split == 0, so it's not counted
-def tree_depths(tree_length):
+def tree_depths(tree_size: int) -> Int32[Array, ' {tree_size}']:
     """
     Return the depth of each node in a binary tree.
     Parameters
     ----------
-    tree_length : int
+    tree_size
         The length of the tree array, i.e., 2 ** d.
     Returns
     -------
-    depth : array (tree_length,)
-        The depth of each node. The root node (index 1) has depth 0. The depth
-        is the position of the most significant non-zero bit in the index. The
-        first element (the unused node) is marked as depth 0.
+    The depth of each node.
+    Notes
+    -----
+    The root node (index 1) has depth 0. The depth is the position of the most
+    significant non-zero bit in the index. The first element (the unused node)
+    is marked as depth 0.
     """
     depths = []
     depth = 0
-    for i in range(tree_length):
+    for i in range(tree_size):
         if i == 2**depth:
             depth += 1
         depths.append(depth - 1)
     depths[0] = 0
-    return jnp.array(depths, jaxext.minimal_unsigned_dtype(max(depths)))
+    return jnp.array(depths, minimal_unsigned_dtype(max(depths)))
-def is_used(split_tree):
+@partial(jnp.vectorize, signature='(half_tree_size)->(tree_size)')
+def is_used(
+    split_tree: UInt[Array, '*batch_shape 2**(d-1)'],
+) -> Bool[Array, '*batch_shape 2**d']:
     """
     Return a mask indicating the used nodes in a tree.
     Parameters
     ----------
-    split_tree : int array (2 ** (d - 1),)
+    split_tree
         The decision boundaries of the tree.
     Returns
     -------
-    is_used : bool array (2 ** d,)
-        A mask indicating which nodes are actually used.
+    A mask indicating which nodes are actually used.
     """
     internal_node = split_tree.astype(bool)
     internal_node = jnp.concatenate([internal_node, jnp.zeros_like(internal_node)])
@@ -289,22 +344,69 @@ def is_used(split_tree):
     return internal_node | actual_leaf
-def forest_fill(split_trees):
+@jit
+def forest_fill(split_tree: UInt[Array, '*batch_shape 2**(d-1)']) -> Float32[Array, '']:
     """
     Return the fraction of used nodes in a set of trees.
     Parameters
     ----------
-    split_trees : array (m, 2 ** (d - 1),)
+    split_tree
         The decision boundaries of the trees.
     Returns
     -------
-    fill : float
-        The number of tree nodes in the forest over the maximum number that
-        could be stored in the arrays.
+    Number of tree nodes over the maximum number that could be stored.
     """
-    m, _ = split_trees.shape
-    used = jax.vmap(is_used)(split_trees)
+    used = is_used(split_tree)
     count = jnp.count_nonzero(used)
-    return count / (used.size - m)
+    batch_size = split_tree.size // split_tree.shape[-1]
+    return count / (used.size - batch_size)
+@partial(jit, static_argnames=('p', 'sum_batch_axis'))
+def var_histogram(
+    p: int,
+    var_tree: UInt[Array, '*batch_shape 2**(d-1)'],
+    split_tree: UInt[Array, '*batch_shape 2**(d-1)'],
+    *,
+    sum_batch_axis: int | tuple[int, ...] = (),
+) -> Int32[Array, '*reduced_batch_shape {p}']:
+    """
+    Count how many times each variable appears in a tree.
+    Parameters
+    ----------
+    p
+        The number of variables (the maximum value that can occur in `var_tree`
+        is ``p - 1``).
+    var_tree
+        The decision axes of the tree.
+    split_tree
+        The decision boundaries of the tree.
+    sum_batch_axis
+        The batch axes to sum over. By default, no summation is performed. Note
+        that negative indices count from the end of the batch dimensions; the
+        core dimension p can't be summed over by this function.
+    Returns
+    -------
+    The histogram(s) of the variables used in the tree.
+    """
+    is_internal = split_tree.astype(bool)
+    def scatter_add(
+        var_tree: UInt[Array, '*summed_batch_axes half_tree_size'],
+        is_internal: Bool[Array, '*summed_batch_axes half_tree_size'],
+    ) -> Int32[Array, ' p']:
+        return jnp.zeros(p, int).at[var_tree].add(is_internal)
+    # vmap scatter_add over non-batched dims
+    batch_ndim = var_tree.ndim - 1
+    axes = normalize_axis_tuple(sum_batch_axis, batch_ndim)
+    for i in reversed(range(batch_ndim)):
+        neg_i = i - var_tree.ndim
+        if i not in axes:
+            scatter_add = vmap(scatter_add, in_axes=neg_i)
+    return scatter_add(var_tree, is_internal)

bartz 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

bartz 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl