PyPI - bartz - Versions diffs - 0.4.1__py3-none-any.whl → 0.6.0__py3-none-any.whl - Mend

bartz 0.4.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

bartz/.DS_Store +0 -0
bartz/BART.py +266 -113
bartz/__init__.py +4 -12
bartz/_version.py +1 -1
bartz/debug.py +42 -16
bartz/grove.py +62 -12
bartz/jaxext.py +111 -37
bartz/mcmcloop.py +419 -105
bartz/mcmcstep.py +1528 -760
bartz/prepcovars.py +25 -10
{bartz-0.4.1.dist-info → bartz-0.6.0.dist-info}/METADATA +14 -16
bartz-0.6.0.dist-info/RECORD +13 -0
bartz-0.6.0.dist-info/WHEEL +4 -0
bartz-0.4.1.dist-info/LICENSE +0 -21
bartz-0.4.1.dist-info/RECORD +0 -13
bartz-0.4.1.dist-info/WHEEL +0 -4

bartz/debug.py CHANGED Viewed

@@ -1,21 +1,19 @@
 import functools
 import jax
-from jax import numpy as jnp
 from jax import lax
+from jax import numpy as jnp
-from . import grove
-from . import mcmcstep
-from . import jaxext
+from . import grove, jaxext
-def print_tree(leaf_tree, var_tree, split_tree, print_all=False):
+def print_tree(leaf_tree, var_tree, split_tree, print_all=False):
     tee = '├──'
     corner = '└──'
     join = '│  '
     space = '   '
     down = '┐'
-    bottom = '╢' # '┨' #
+    bottom = '╢'  # '┨' #
     def traverse_tree(index, depth, indent, first_indent, next_indent, unused):
         if index >= len(leaf_tree):
@@ -58,7 +56,7 @@ def print_tree(leaf_tree, var_tree, split_tree, print_all=False):
         indent += next_indent
         unused = unused or is_leaf
         if unused and not print_all:
             return
@@ -67,58 +65,80 @@ def print_tree(leaf_tree, var_tree, split_tree, print_all=False):
     traverse_tree(1, 0, '', '', '', False)
 def tree_actual_depth(split_tree):
     is_leaf = grove.is_actual_leaf(split_tree, add_bottom_level=True)
     depth = grove.tree_depths(is_leaf.size)
     depth = jnp.where(is_leaf, depth, 0)
     return jnp.max(depth)
 def forest_depth_distr(split_trees):
     depth = grove.tree_depth(split_trees) + 1
     depths = jax.vmap(tree_actual_depth)(split_trees)
     return jnp.bincount(depths, length=depth)
 def trace_depth_distr(split_trees_trace):
     return jax.vmap(forest_depth_distr)(split_trees_trace)
 def points_per_leaf_distr(var_tree, split_tree, X):
     traverse_tree = jax.vmap(grove.traverse_tree, in_axes=(1, None, None))
     indices = traverse_tree(X, var_tree, split_tree)
-    count_tree = jnp.zeros(2 * split_tree.size, dtype=jaxext.minimal_unsigned_dtype(indices.size))
+    count_tree = jnp.zeros(
+        2 * split_tree.size, dtype=jaxext.minimal_unsigned_dtype(indices.size)
+    )
     count_tree = count_tree.at[indices].add(1)
     is_leaf = grove.is_actual_leaf(split_tree, add_bottom_level=True).view(jnp.uint8)
     return jnp.bincount(count_tree, is_leaf, length=X.shape[1] + 1)
 def forest_points_per_leaf_distr(bart, X):
     distr = jnp.zeros(X.shape[1] + 1, int)
     trees = bart['var_trees'], bart['split_trees']
     def loop(distr, tree):
         return distr + points_per_leaf_distr(*tree, X), None
     distr, _ = lax.scan(loop, distr, trees)
     return distr
 def trace_points_per_leaf_distr(bart, X):
     def loop(_, bart):
         return None, forest_points_per_leaf_distr(bart, X)
     _, distr = lax.scan(loop, None, bart)
     return distr
 def check_types(leaf_tree, var_tree, split_tree, max_split):
     expected_var_dtype = jaxext.minimal_unsigned_dtype(max_split.size - 1)
     expected_split_dtype = max_split.dtype
-    return var_tree.dtype == expected_var_dtype and split_tree.dtype == expected_split_dtype
+    return (
+        var_tree.dtype == expected_var_dtype
+        and split_tree.dtype == expected_split_dtype
+    )
 def check_sizes(leaf_tree, var_tree, split_tree, max_split):
     return leaf_tree.size == 2 * var_tree.size == 2 * split_tree.size
 def check_unused_node(leaf_tree, var_tree, split_tree, max_split):
     return (var_tree[0] == 0) & (split_tree[0] == 0)
 def check_leaf_values(leaf_tree, var_tree, split_tree, max_split):
     return jnp.all(jnp.isfinite(leaf_tree))
 def check_stray_nodes(leaf_tree, var_tree, split_tree, max_split):
-    index = jnp.arange(2 * split_tree.size, dtype=jaxext.minimal_unsigned_dtype(2 * split_tree.size - 1))
+    index = jnp.arange(
+        2 * split_tree.size,
+        dtype=jaxext.minimal_unsigned_dtype(2 * split_tree.size - 1),
+    )
     parent_index = index >> 1
     is_not_leaf = split_tree.at[index].get(mode='fill', fill_value=0) != 0
     parent_is_leaf = split_tree[parent_index] == 0
@@ -126,6 +146,7 @@ def check_stray_nodes(leaf_tree, var_tree, split_tree, max_split):
     stray = stray.at[1].set(False)
     return ~jnp.any(stray)
 check_functions = [
     check_types,
     check_sizes,
@@ -134,6 +155,7 @@ check_functions = [
     check_stray_nodes,
 ]
 def check_tree(leaf_tree, var_tree, split_tree, max_split):
     error_type = jaxext.minimal_unsigned_dtype(2 ** len(check_functions) - 1)
     error = error_type(0)
@@ -144,15 +166,19 @@ def check_tree(leaf_tree, var_tree, split_tree, max_split):
         error |= bit
     return error
 def describe_error(error):
-    return [
-        func.__name__
-        for i, func in enumerate(check_functions)
-        if error & (1 << i)
-    ]
+    return [func.__name__ for i, func in enumerate(check_functions) if error & (1 << i)]
 check_forest = jax.vmap(check_tree, in_axes=(0, 0, 0, None))
 @functools.partial(jax.vmap, in_axes=(0, None))
 def check_trace(trace, state):
-    return check_forest(trace['leaf_trees'], trace['var_trees'], trace['split_trees'], state['max_split'])
+    return check_forest(
+        trace['leaf_trees'],
+        trace['var_trees'],
+        trace['split_trees'],
+        state.max_split,
+    )

bartz/grove.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # bartz/src/bartz/grove.py
 #
-# Copyright (c) 2024, Giacomo Petrillo
+# Copyright (c) 2024-2025, Giacomo Petrillo
 #
 # This file is part of bartz.
 #
@@ -34,7 +34,7 @@ The 'leaf' array contains the values in the leaves.
 The 'var' array contains the axes along which the decision nodes operate.
-The 'split' array contains the decision boundaries. The boundaries are open on the right, i.e., a point belongs to the left child iff x < split. Whether a node is a leaf is indicated by the corresponding 'split' element being 0.
+The 'split' array contains the decision boundaries. The boundaries are open on the right, i.e., a point belongs to the left child iff x < split. Whether a node is a leaf is indicated by the corresponding 'split' element being 0. Unused nodes also have split set to 0.
 Since the nodes at the bottom can only be leaves and not decision nodes, the 'var' and 'split' arrays have half the length of the 'leaf' array.
@@ -44,11 +44,12 @@ import functools
 import math
 import jax
-from jax import numpy as jnp
 from jax import lax
+from jax import numpy as jnp
 from . import jaxext
 def make_tree(depth, dtype):
     """
     Make an array to represent a binary tree.
@@ -66,7 +67,8 @@ def make_tree(depth, dtype):
     tree : array
         An array of zeroes with shape (2 ** depth,).
     """
-    return jnp.zeros(2 ** depth, dtype)
+    return jnp.zeros(2**depth, dtype)
 def tree_depth(tree):
     """
@@ -85,6 +87,7 @@ def tree_depth(tree):
     """
     return int(round(math.log2(tree.shape[-1])))
 def traverse_tree(x, var_tree, split_tree):
     """
     Find the leaf where a point falls into.
@@ -103,7 +106,6 @@ def traverse_tree(x, var_tree, split_tree):
     index : int
         The index of the leaf.
     """
     carry = (
         jnp.zeros((), bool),
         jnp.ones((), jaxext.minimal_unsigned_dtype(2 * var_tree.size - 1)),
@@ -125,6 +127,7 @@ def traverse_tree(x, var_tree, split_tree):
     (_, index), _ = lax.scan(loop, carry, None, depth, unroll=16)
     return index
 @functools.partial(jaxext.vmap_nodoc, in_axes=(None, 0, 0))
 @functools.partial(jaxext.vmap_nodoc, in_axes=(1, None, None))
 def traverse_forest(X, var_trees, split_trees):
@@ -147,6 +150,7 @@ def traverse_forest(X, var_trees, split_trees):
     """
     return traverse_tree(X, var_trees, split_trees)
 def evaluate_forest(X, leaf_trees, var_trees, split_trees, dtype=None, sum_trees=True):
     """
     Evaluate a ensemble of trees at an array of points.
@@ -178,11 +182,12 @@ def evaluate_forest(X, leaf_trees, var_trees, split_trees, dtype=None, sum_trees
     leaves = leaf_trees[tree_index[:, None], indices]
     if sum_trees:
         return jnp.sum(leaves, axis=0, dtype=dtype)
-            # this sum suggests to swap the vmaps, but I think it's better for X
-            # copying to keep it that way
+    # this sum suggests to swap the vmaps, but I think it's better for X
+    # copying to keep it that way
     else:
         return leaves
 def is_actual_leaf(split_tree, *, add_bottom_level=False):
     """
     Return a mask indicating the leaf nodes in a tree.
@@ -211,6 +216,7 @@ def is_actual_leaf(split_tree, *, add_bottom_level=False):
     parent_nonleaf = parent_nonleaf.at[1].set(True)
     return is_leaf & parent_nonleaf
 def is_leaves_parent(split_tree):
     """
     Return a mask indicating the nodes with leaf (and only leaf) children.
@@ -225,14 +231,17 @@ def is_leaves_parent(split_tree):
     is_leaves_parent : bool array (2 ** (d - 1),)
         The mask indicating which nodes have leaf children.
     """
-    index = jnp.arange(split_tree.size, dtype=jaxext.minimal_unsigned_dtype(2 * split_tree.size - 1))
-    left_index = index << 1 # left child
-    right_index = left_index + 1 # right child
+    index = jnp.arange(
+        split_tree.size, dtype=jaxext.minimal_unsigned_dtype(2 * split_tree.size - 1)
+    )
+    left_index = index << 1  # left child
+    right_index = left_index + 1  # right child
     left_leaf = split_tree.at[left_index].get(mode='fill', fill_value=0) == 0
     right_leaf = split_tree.at[right_index].get(mode='fill', fill_value=0) == 0
     is_not_leaf = split_tree.astype(bool)
     return is_not_leaf & left_leaf & right_leaf
-        # the 0-th item has split == 0, so it's not counted
+    # the 0-th item has split == 0, so it's not counted
 def tree_depths(tree_length):
     """
@@ -253,8 +262,49 @@ def tree_depths(tree_length):
     depths = []
     depth = 0
     for i in range(tree_length):
-        if i == 2 ** depth:
+        if i == 2**depth:
             depth += 1
         depths.append(depth - 1)
     depths[0] = 0
     return jnp.array(depths, jaxext.minimal_unsigned_dtype(max(depths)))
+def is_used(split_tree):
+    """
+    Return a mask indicating the used nodes in a tree.
+    Parameters
+    ----------
+    split_tree : int array (2 ** (d - 1),)
+        The decision boundaries of the tree.
+    Returns
+    -------
+    is_used : bool array (2 ** d,)
+        A mask indicating which nodes are actually used.
+    """
+    internal_node = split_tree.astype(bool)
+    internal_node = jnp.concatenate([internal_node, jnp.zeros_like(internal_node)])
+    actual_leaf = is_actual_leaf(split_tree, add_bottom_level=True)
+    return internal_node | actual_leaf
+def forest_fill(split_trees):
+    """
+    Return the fraction of used nodes in a set of trees.
+    Parameters
+    ----------
+    split_trees : array (m, 2 ** (d - 1),)
+        The decision boundaries of the trees.
+    Returns
+    -------
+    fill : float
+        The number of tree nodes in the forest over the maximum number that
+        could be stored in the arrays.
+    """
+    m, _ = split_trees.shape
+    used = jax.vmap(is_used)(split_trees)
+    count = jnp.count_nonzero(used)
+    return count / (used.size - m)

bartz/jaxext.py CHANGED Viewed

@@ -22,60 +22,74 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
+"""Additions to jax."""
 import functools
 import math
 import warnings
-from scipy import special
 import jax
+from jax import lax, random, tree_util
 from jax import numpy as jnp
-from jax import tree_util
-from jax import lax
+from scipy import special
 def float_type(*args):
-    """
-    Determine the jax floating point result type given operands/types.
-    """
+    """Determine the jax floating point result type given operands/types."""
     t = jnp.result_type(*args)
     return jnp.sin(jnp.empty(0, t)).dtype
-def castto(func, type):
+def _castto(func, type):
     @functools.wraps(func)
     def newfunc(*args, **kw):
         return func(*args, **kw).astype(type)
     return newfunc
 class scipy:
+    """Mockup of the :external:py:mod:`scipy` module."""
     class special:
+        """Mockup of the :external:py:mod:`scipy.special` module."""
-        @functools.wraps(special.gammainccinv)
+        @staticmethod
         def gammainccinv(a, y):
+            """Survival function inverse of the Gamma(a, 1) distribution."""
             a = jnp.asarray(a)
             y = jnp.asarray(y)
             shape = jnp.broadcast_shapes(a.shape, y.shape)
             dtype = float_type(a.dtype, y.dtype)
             dummy = jax.ShapeDtypeStruct(shape, dtype)
-            ufunc = castto(special.gammainccinv, dtype)
+            ufunc = _castto(special.gammainccinv, dtype)
             return jax.pure_callback(ufunc, dummy, a, y, vmap_method='expand_dims')
     class stats:
+        """Mockup of the :external:py:mod:`scipy.stats` module."""
         class invgamma:
+            """Class that represents the distribution InvGamma(a, 1)."""
+            @staticmethod
             def ppf(q, a):
+                """Percentile point function."""
                 return 1 / scipy.special.gammainccinv(a, q)
-@functools.wraps(jax.vmap)
 def vmap_nodoc(fun, *args, **kw):
     """
-    Version of `jax.vmap` that preserves the docstring of the input function.
+    Acts like `jax.vmap` but preserves the docstring of the function unchanged.
+    This is useful if the docstring already takes into account that the
+    arguments have additional axes due to vmap.
     """
     doc = fun.__doc__
     fun = jax.vmap(fun, *args, **kw)
     fun.__doc__ = doc
     return fun
 def huge_value(x):
     """
     Return the maximum value that can be stored in `x`.
@@ -95,23 +109,23 @@ def huge_value(x):
     else:
         return jnp.inf
-def minimal_unsigned_dtype(max_value):
-    """
-    Return the smallest unsigned integer dtype that can represent a given
-    maximum value (inclusive).
-    """
-    if max_value < 2 ** 8:
+def minimal_unsigned_dtype(value):
+    """Return the smallest unsigned integer dtype that can represent `value`."""
+    if value < 2**8:
         return jnp.uint8
-    if max_value < 2 ** 16:
+    if value < 2**16:
         return jnp.uint16
-    if max_value < 2 ** 32:
+    if value < 2**32:
         return jnp.uint32
     return jnp.uint64
 def signed_to_unsigned(int_dtype):
     """
-    Map a signed integer type to its unsigned counterpart. Unsigned types are
-    passed through.
+    Map a signed integer type to its unsigned counterpart.
+    Unsigned types are passed through.
     """
     assert jnp.issubdtype(int_dtype, jnp.integer)
     if jnp.issubdtype(int_dtype, jnp.unsignedinteger):
@@ -125,12 +139,12 @@ def signed_to_unsigned(int_dtype):
     if int_dtype == jnp.int64:
         return jnp.uint64
 def ensure_unsigned(x):
-    """
-    If x has signed integer type, cast it to the unsigned dtype of the same size.
-    """
+    """If x has signed integer type, cast it to the unsigned dtype of the same size."""
     return x.astype(signed_to_unsigned(x.dtype))
 @functools.partial(jax.jit, static_argnums=(1,))
 def unique(x, size, fill_value):
     """
@@ -158,15 +172,18 @@ def unique(x, size, fill_value):
     if size == 0:
         return jnp.empty(0, x.dtype), 0
     x = jnp.sort(x)
     def loop(carry, x):
         i_out, i_in, last, out = carry
         i_out = jnp.where(x == last, i_out, i_out + 1)
         out = out.at[i_out].set(x)
         return (i_out, i_in + 1, x, out), None
     carry = 0, 0, x[0], jnp.full(size, fill_value, x.dtype)
     (actual_length, _, _, out), _ = jax.lax.scan(loop, carry, x[:size])
     return out, actual_length + 1
 def autobatch(func, max_io_nbytes, in_axes=0, out_axes=0, return_nbatches=False):
     """
     Batch a function such that each batch is smaller than a threshold.
@@ -203,6 +220,7 @@ def autobatch(func, max_io_nbytes, in_axes=0, out_axes=0, return_nbatches=False)
     def check_no_nones(axes, tree):
         def check_not_none(_, axis):
             assert axis is not None
         tree_util.tree_map(check_not_none, tree, axes)
     def extract_size(axes, tree):
@@ -211,6 +229,7 @@ def autobatch(func, max_io_nbytes, in_axes=0, out_axes=0, return_nbatches=False)
                 return None
             else:
                 return x.shape[axis]
         sizes = tree_util.tree_map(get_size, tree, axes)
         sizes, _ = tree_util.tree_flatten(sizes)
         assert all(s == sizes[0] for s in sizes)
@@ -219,6 +238,7 @@ def autobatch(func, max_io_nbytes, in_axes=0, out_axes=0, return_nbatches=False)
     def sum_nbytes(tree):
         def nbytes(x):
             return math.prod(x.shape) * x.dtype.itemsize
         return tree_util.tree_reduce(lambda size, x: size + nbytes(x), tree, 0)
     def next_divisor_small(dividend, min_divisor):
@@ -247,6 +267,7 @@ def autobatch(func, max_io_nbytes, in_axes=0, out_axes=0, return_nbatches=False)
                 return None
             else:
                 return x
         return tree_util.tree_map(pull_nonbatched, tree, axes), tree
     def push_nonbatched(axes, tree, original_tree):
@@ -255,32 +276,38 @@ def autobatch(func, max_io_nbytes, in_axes=0, out_axes=0, return_nbatches=False)
                 return original_x
             else:
                 return x
         return tree_util.tree_map(push_nonbatched, original_tree, tree, axes)
     def move_axes_out(axes, tree):
         def move_axis_out(x, axis):
             return jnp.moveaxis(x, axis, 0)
         return tree_util.tree_map(move_axis_out, tree, axes)
     def move_axes_in(axes, tree):
         def move_axis_in(x, axis):
             return jnp.moveaxis(x, 0, axis)
         return tree_util.tree_map(move_axis_in, tree, axes)
     def batch(tree, nbatches):
         def batch(x):
             return x.reshape((nbatches, x.shape[0] // nbatches) + x.shape[1:])
         return tree_util.tree_map(batch, tree)
     def unbatch(tree):
         def unbatch(x):
             return x.reshape((x.shape[0] * x.shape[1],) + x.shape[2:])
         return tree_util.tree_map(unbatch, tree)
     def check_same(tree1, tree2):
         def check_same(x1, x2):
             assert x1.shape == x2.shape
             assert x1.dtype == x2.dtype
         tree_util.tree_map(check_same, tree1, tree2)
     initial_in_axes = in_axes
@@ -300,7 +327,9 @@ def autobatch(func, max_io_nbytes, in_axes=0, out_axes=0, return_nbatches=False)
         args, nonbatched_args = pull_nonbatched(in_axes, args)
         total_nbytes = sum_nbytes((args, example_result))
-        min_nbatches = total_nbytes // max_io_nbytes + bool(total_nbytes % max_io_nbytes)
+        min_nbatches = total_nbytes // max_io_nbytes + bool(
+            total_nbytes % max_io_nbytes
+        )
         min_nbatches = max(1, min_nbatches)
         nbatches = next_divisor(size, min_nbatches)
         assert 1 <= nbatches <= max(1, size)
@@ -310,7 +339,9 @@ def autobatch(func, max_io_nbytes, in_axes=0, out_axes=0, return_nbatches=False)
         batch_nbytes = total_nbytes // nbatches
         if batch_nbytes > max_io_nbytes:
             assert size == nbatches
-            warnings.warn(f'batch_nbytes = {batch_nbytes} > max_io_nbytes = {max_io_nbytes}')
+            warnings.warn(
+                f'batch_nbytes = {batch_nbytes} > max_io_nbytes = {max_io_nbytes}'
+            )
         def loop(_, args):
             args = move_axes_in(in_axes, args)
@@ -333,17 +364,60 @@ def autobatch(func, max_io_nbytes, in_axes=0, out_axes=0, return_nbatches=False)
     return batched_func
-@tree_util.register_pytree_node_class
-class LeafDict(dict):
-    """ dictionary that acts as a leaf in jax pytrees, to store compile-time
-    values """
-    def tree_flatten(self):
-        return (), self
+class split:
+    """
+    Split a key into `num` keys.
-    @classmethod
-    def tree_unflatten(cls, aux_data, children):
-        return aux_data
+    Parameters
+    ----------
+    key : jax.dtypes.prng_key array
+        The key to split.
+    num : int
+        The number of keys to split into.
+    """
-    def __repr__(self):
-        return f'{__class__.__name__}({super().__repr__()})'
+    def __init__(self, key, num=2):
+        self._keys = random.split(key, num)
+    def __len__(self):
+        return self._keys.size
+    def pop(self, shape=None):
+        """
+        Pop one or more keys from the list.
+        Parameters
+        ----------
+        shape : int or tuple of int, optional
+            The shape of the keys to pop. If `None`, a single key is popped.
+            If an integer, that many keys are popped. If a tuple, the keys are
+            reshaped to that shape.
+        Returns
+        -------
+        keys : jax.dtypes.prng_key array
+            The popped keys.
+        Raises
+        ------
+        IndexError
+            If `shape` is larger than the number of keys left in the list.
+        Notes
+        -----
+        The keys are popped from the beginning of the list, so for example
+        ``list(keys.pop(2))`` is equivalent to ``[keys.pop(), keys.pop()]``.
+        """
+        if shape is None:
+            shape = ()
+        elif not isinstance(shape, tuple):
+            shape = (shape,)
+        size_to_pop = math.prod(shape)
+        if size_to_pop > self._keys.size:
+            raise IndexError(
+                f'Cannot pop {size_to_pop} keys from {self._keys.size} keys'
+            )
+        popped_keys = self._keys[:size_to_pop]
+        self._keys = self._keys[size_to_pop:]
+        return popped_keys.reshape(shape)

bartz 0.4.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

bartz 0.4.1__py3-none-any.whl → 0.6.0__py3-none-any.whl