xax 0.3.4-py3-none-any.whl → 0.3.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xax/__init__.py +4 -1
- xax/nn/attention.py +144 -92
- xax/nn/embeddings.py +10 -10
- xax/nn/geom.py +5 -5
- xax/nn/ssm.py +6 -6
- xax/task/mixins/train.py +6 -1
- xax/utils/pytree.py +13 -0
- {xax-0.3.4.dist-info → xax-0.3.6.dist-info}/METADATA +1 -1
- {xax-0.3.4.dist-info → xax-0.3.6.dist-info}/RECORD +13 -13
- {xax-0.3.4.dist-info → xax-0.3.6.dist-info}/WHEEL +0 -0
- {xax-0.3.4.dist-info → xax-0.3.6.dist-info}/entry_points.txt +0 -0
- {xax-0.3.4.dist-info → xax-0.3.6.dist-info}/licenses/LICENSE +0 -0
- {xax-0.3.4.dist-info → xax-0.3.6.dist-info}/top_level.txt +0 -0
xax/__init__.py
CHANGED
@@ -12,7 +12,7 @@ and running the update script:
     python -m scripts.update_api --inplace
 """
 
-__version__ = "0.3.4"
+__version__ = "0.3.6"
 
 # This list shouldn't be modified by hand; instead, run the update script.
 __all__ = [
@@ -136,6 +136,7 @@ __all__ = [
     "compute_nan_ratio",
     "flatten_array",
     "flatten_pytree",
+    "get_pytree_mapping",
     "get_pytree_param_count",
     "pytree_has_nans",
     "reshuffle_pytree",
@@ -323,6 +324,7 @@ NAME_MAP: dict[str, str] = {
     "compute_nan_ratio": "utils.pytree",
     "flatten_array": "utils.pytree",
     "flatten_pytree": "utils.pytree",
+    "get_pytree_mapping": "utils.pytree",
     "get_pytree_param_count": "utils.pytree",
     "pytree_has_nans": "utils.pytree",
     "reshuffle_pytree": "utils.pytree",
@@ -509,6 +511,7 @@ if IMPORT_ALL or TYPE_CHECKING:
         compute_nan_ratio,
         flatten_array,
         flatten_pytree,
+        get_pytree_mapping,
         get_pytree_param_count,
         pytree_has_nans,
         reshuffle_pytree,
xax/nn/attention.py
CHANGED
@@ -5,6 +5,8 @@ supporting a fixed-size context window and caching that can be used to train
 transformers which can be unrolled with a fixed-length cache.
 """
 
+import math
+import warnings
 from typing import NotRequired, TypedDict
 
 import chex
@@ -13,6 +15,8 @@ import jax
 import jax.numpy as jnp
 from jaxtyping import Array, PRNGKeyArray
 
+from xax.utils.jax import scan as xax_scan
+
 
 class RotaryEmbedding(eqx.Module):
     """Rotary Position Embedding (RoPE) for transformer attention.
@@ -22,8 +26,8 @@ class RotaryEmbedding(eqx.Module):
     https://arxiv.org/abs/2104.09864
     """
 
-    head_dim: int = eqx.
-    base: float = eqx.
+    head_dim: int = eqx.field()
+    base: float = eqx.field()
 
     def __init__(
         self,
@@ -125,15 +129,15 @@ class TransformerCache(TypedDict):
 class SelfAttentionBlock(eqx.Module):
     """Self-attention block using jax.nn.dot_product_attention."""
 
-    q_proj: eqx.nn.Linear
-    k_proj: eqx.nn.Linear
-    v_proj: eqx.nn.Linear
-    output_proj: eqx.nn.Linear
-    rotary_emb: RotaryEmbedding | None
-    num_heads: int = eqx.
-    head_dim: int = eqx.
-    causal: bool = eqx.
-
+    q_proj: eqx.nn.Linear = eqx.field()
+    k_proj: eqx.nn.Linear = eqx.field()
+    v_proj: eqx.nn.Linear = eqx.field()
+    output_proj: eqx.nn.Linear = eqx.field()
+    rotary_emb: RotaryEmbedding | None = eqx.field()
+    num_heads: int = eqx.field()
+    head_dim: int = eqx.field()
+    causal: bool = eqx.field()
+    local_window_size: int | None = eqx.field()
 
     def __init__(
         self,
@@ -169,8 +173,12 @@ class SelfAttentionBlock(eqx.Module):
         else:
             self.rotary_emb = None
 
+        if context_length is not None and not causal:
+            warnings.warn("context_length is set but causal is False; overriding causal to True", stacklevel=2)
+            causal = True
+
         self.causal = causal
-        self.
+        self.local_window_size = None if context_length is None else context_length - 1
 
     @property
     def embed_dim(self) -> int:
@@ -195,28 +203,44 @@ class SelfAttentionBlock(eqx.Module):
         Returns:
             Cache with fixed-length k and v tensors
         """
-        if self.
+        if self.local_window_size is None:
             raise ValueError("context_length must be set for caching")
 
         # Create fixed-length cache
-        k_cache = jnp.zeros((self.
-        v_cache = jnp.zeros((self.
+        k_cache = jnp.zeros((self.local_window_size, self.num_heads, self.head_dim), dtype=dtype)
+        v_cache = jnp.zeros((self.local_window_size, self.num_heads, self.head_dim), dtype=dtype)
 
         return {"k": k_cache, "v": v_cache, "position": 0}
 
-    def init_mask(
-
-
-
-
-
+    def init_mask(
+        self,
+        seq_len: int,
+        add_cache: bool = False,
+        batch_dim: bool = False,
+    ) -> Array:
+        """Initialize the attention matrix mask.
 
-
-
-
-
+        Args:
+            seq_len: The length of the sequence
+            add_cache: Whether to add the cache to the mask
+            batch_dim: Whether to add a batch dimension to the mask
 
-
+        Returns:
+            The attention matrix mask of shape (bsz, 1, seq_len, seq_len + cache_len)
+            if batch_dim is True, otherwise (seq_len, seq_len + cache_len).
+        """
+        t, s, o = seq_len, seq_len, 0
+        if add_cache:
+            if self.local_window_size is None:
+                raise ValueError("local_window_size must be set for caching")
+            s += self.local_window_size
+            o -= self.local_window_size
+        mask = jnp.tril(jnp.ones((t, s), dtype=jnp.bool_), k=-o)
+        if self.local_window_size is not None:
+            neg_mask = ~jnp.tril(jnp.ones((t, s), dtype=jnp.bool_), k=-(self.local_window_size + 1 + o))
+            mask = mask & neg_mask
+        mask = mask.reshape(1, 1, t, s) if batch_dim else mask.reshape(t, s)
+        return mask
 
     def forward(
         self,
@@ -229,7 +253,8 @@ class SelfAttentionBlock(eqx.Module):
 
         Args:
             x_tn: Input tensor of shape (seq_len, embed_dim)
-            mask: Optional mask
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: The cached key and value tensors (fixed-length)
 
         Returns:
@@ -263,25 +288,36 @@ class SelfAttentionBlock(eqx.Module):
             v_cache = cache["v"]
             k = jnp.concatenate([k_cache, k], axis=0)
             v = jnp.concatenate([v_cache, v], axis=0)
+
             new_position = cache["position"] + seq_len
 
         else:
            new_position = seq_len
 
-
-            q,
-
-
-            mask=mask
-
-
+        if seq_len == 1:
+            attn_output = jax.nn.dot_product_attention(q, k, v)
+
+        elif mask is not None:
+            attn_output = jax.nn.dot_product_attention(q, k, v, mask=mask)
+
+        elif cache is not None:
+            raise NotImplementedError("For training with a cache, provide a mask instead.")
+
+        else:
+            attn_output = jax.nn.dot_product_attention(
+                q,
+                k,
+                v,
+                is_causal=self.causal,
+                local_window_size=(self.local_window_size, 0) if self.local_window_size is not None else None,
+            )
 
         attn_output = self._combine_heads(attn_output)
         output = jax.vmap(self.output_proj)(attn_output)
 
-        if self.
-            k = k[-
-            v = v[-
+        if self.local_window_size is not None:
+            k = k[-self.local_window_size :]
+            v = v[-self.local_window_size :]
 
         return output, {"k": k, "v": v, "position": new_position}
 
@@ -294,8 +330,8 @@ class CrossAttentionBlock(eqx.Module):
     v_proj: eqx.nn.Linear
     output_proj: eqx.nn.Linear
     rotary_emb: RotaryEmbedding | None
-    num_heads: int = eqx.
-    head_dim: int = eqx.
+    num_heads: int = eqx.field()
+    head_dim: int = eqx.field()
 
     def __init__(
         self,
@@ -352,7 +388,6 @@ class CrossAttentionBlock(eqx.Module):
         *,
         kv_sn: Array | None = None,
         cache: AttentionCache | None = None,
-        mask: Array | None = None,
     ) -> tuple[Array, AttentionCache]:
         """Apply cross-attention.
 
@@ -362,7 +397,6 @@ class CrossAttentionBlock(eqx.Module):
                 If not provided, then `cache` must be provided.
             cache: The cached key and value tensors. If not provided, then
                 `kv_sn` must be provided.
-            mask: Optional mask tensor
 
         Returns:
             The output tensor of shape (q_seq_len, embed_dim)
@@ -404,7 +438,7 @@ class CrossAttentionBlock(eqx.Module):
             q_rot,
             k_rot,
             v,
-
+            scale=1.0 / math.sqrt(self.head_dim),
             is_causal=False,
         )
 
@@ -424,10 +458,10 @@ class TransformerBlock(eqx.Module):
     layer_norm1: eqx.nn.LayerNorm
     layer_norm2: eqx.nn.LayerNorm
     layer_norm3: eqx.nn.LayerNorm | None
-    num_heads: int = eqx.
-    head_dim: int = eqx.
-    causal: bool = eqx.
-    context_length: int | None = eqx.
+    num_heads: int = eqx.field()
+    head_dim: int = eqx.field()
+    causal: bool = eqx.field()
+    context_length: int | None = eqx.field()
 
     def __init__(
         self,
@@ -500,16 +534,24 @@ class TransformerBlock(eqx.Module):
             cache["cross_attn"] = self.cross_attn.init_cache(kv_sn=context_sn)
         return cache
 
-    def init_mask(
-
+    def init_mask(
+        self,
+        seq_len: int,
+        add_cache: bool = False,
+        batch_dim: bool = False,
+    ) -> Array:
+        return self.self_attn.init_mask(
+            seq_len,
+            add_cache=add_cache,
+            batch_dim=batch_dim,
+        )
 
     def forward(
         self,
         x_tn: Array,
         *,
         context_sn: Array | None = None,
-
-        cross_mask: Array | None = None,
+        mask: Array | None = None,
         cache: AttentionCacheDict | None = None,
     ) -> tuple[Array, AttentionCacheDict]:
         """Apply transformer block.
@@ -517,8 +559,8 @@ class TransformerBlock(eqx.Module):
         Args:
             x_tn: Input tensor of shape (seq_len, embed_dim)
             context_sn: Optional context for cross-attention
-
-
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -531,7 +573,7 @@ class TransformerBlock(eqx.Module):
 
         attn_output, self_attn_cache = self.self_attn.forward(
             x_tn=norm_x,
-            mask=
+            mask=mask,
             cache=None if cache is None else cache["self_attn"],
         )
         updated_cache: AttentionCacheDict = {"self_attn": self_attn_cache}
@@ -547,7 +589,6 @@ class TransformerBlock(eqx.Module):
             cross_attn_output, updated_cache["cross_attn"] = self.cross_attn.forward(
                 q_tn=norm_x,
                 kv_sn=context_sn,
-                mask=cross_mask,
                 cache=None if cache is None else cache.get("cross_attn"),
             )
 
@@ -564,9 +605,9 @@ class TransformerBlock(eqx.Module):
 class TransformerStack(eqx.Module):
     """A stack of transformer blocks."""
 
-    layers:
-    num_layers: int = eqx.
-    causal: bool = eqx.
+    layers: tuple[TransformerBlock, ...]
+    num_layers: int = eqx.field()
+    causal: bool = eqx.field()
 
     def __init__(
         self,
@@ -584,7 +625,7 @@ class TransformerStack(eqx.Module):
     ) -> None:
         keys = jax.random.split(key, num_layers)
 
-        self.layers =
+        self.layers = tuple(
             TransformerBlock(
                 embed_dim=embed_dim,
                 num_heads=num_heads,
@@ -597,7 +638,7 @@ class TransformerStack(eqx.Module):
                 rotary_base=rotary_base,
             )
             for i in range(num_layers)
-
+        )
 
         self.num_layers = num_layers
         self.causal = causal
@@ -609,16 +650,24 @@ class TransformerStack(eqx.Module):
             cache[f"layer_{i}"] = layer.init_cache(dtype=dtype, context_sn=x_tn)
         return {"layers": cache}
 
-    def init_mask(
-
+    def init_mask(
+        self,
+        seq_len: int,
+        add_cache: bool = False,
+        batch_dim: bool = False,
+    ) -> Array:
+        return self.layers[0].init_mask(
+            seq_len,
+            add_cache=add_cache,
+            batch_dim=batch_dim,
+        )
 
     def forward(
         self,
         x_tn: Array,
         *,
         context_sn: Array | None = None,
-
-        cross_mask: Array | None = None,
+        mask: Array | None = None,
         cache: TransformerCache | None = None,
     ) -> tuple[Array, TransformerCache]:
         """Apply transformer stack.
@@ -626,8 +675,8 @@ class TransformerStack(eqx.Module):
         Args:
             x_tn: Input tensor of shape (seq_len, embed_dim)
             context_sn: Optional context for cross-attention
-
-
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -647,8 +696,7 @@ class TransformerStack(eqx.Module):
             x_tn, updated_cache["layers"][f"layer_{i}"] = layer.forward(
                 x_tn,
                 context_sn=context_sn,
-
-                cross_mask=cross_mask,
+                mask=mask,
                 cache=layer_cache,
             )
 
@@ -660,9 +708,9 @@ class Transformer(eqx.Module):
     layers: TransformerStack
     output_layer: eqx.nn.Linear | None
     layer_norm: eqx.nn.LayerNorm
-    embed_dim: int = eqx.
-    causal: bool = eqx.
-    context_length: int | None = eqx.
+    embed_dim: int = eqx.field()
+    causal: bool = eqx.field()
+    context_length: int | None = eqx.field()
 
     def __init__(
         self,
@@ -713,8 +761,17 @@ class Transformer(eqx.Module):
         """Initialize cache for the input."""
         return self.layers.init_cache(dtype=dtype, x_tn=x_tn)
 
-    def init_mask(
-
+    def init_mask(
+        self,
+        seq_len: int,
+        add_cache: bool = False,
+        batch_dim: bool = False,
+    ) -> Array:
+        return self.layers.init_mask(
+            seq_len,
+            add_cache=add_cache,
+            batch_dim=batch_dim,
+        )
 
     def encode(
         self,
@@ -727,7 +784,8 @@ class Transformer(eqx.Module):
 
         Args:
             x: Input token indices of shape (seq_len)
-            mask: Optional
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -737,11 +795,7 @@ class Transformer(eqx.Module):
         x_embedded = jax.vmap(self.token_embedding)(x)
 
         # Apply transformer stack
-        x_embedded, updated_cache = self.layers.forward(
-            x_embedded,
-            self_mask=mask,
-            cache=cache,
-        )
+        x_embedded, updated_cache = self.layers.forward(x_embedded, mask=mask, cache=cache)
 
         # Apply final layer norm
         output = jax.vmap(self.layer_norm)(x_embedded)
@@ -753,8 +807,7 @@ class Transformer(eqx.Module):
         x_t: Array,
         context_s: Array,
         *,
-
-        cross_mask: Array | None = None,
+        mask: Array | None = None,
         cache: TransformerCache | None = None,
     ) -> tuple[Array, TransformerCache]:
         """Decode with self-attention and cross-attention.
@@ -763,8 +816,8 @@ class Transformer(eqx.Module):
             x_t: Input token indices, shape (seq_len)
             context_s: Context from encoder (token indices or embedded),
                 shape (context_len, embed_dim)
-
-
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -780,8 +833,7 @@ class Transformer(eqx.Module):
         x_embedded, updated_cache = self.layers.forward(
             x_embedded,
             context_sn=context_embedded,
-
-            cross_mask=cross_mask,
+            mask=mask,
            cache=cache,
         )
 
@@ -801,7 +853,8 @@ class Transformer(eqx.Module):
 
         Args:
             x: Input token indices of shape (seq_len)
-            mask: Optional
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -809,11 +862,7 @@ class Transformer(eqx.Module):
         """
         chex.assert_rank(x, 1)
 
-        output, updated_cache = self.encode(
-            x,
-            mask=mask,
-            cache=cache,
-        )
+        output, updated_cache = self.encode(x, mask=mask, cache=cache)
 
         # Apply output layer if it exists
         if self.output_layer is not None:
@@ -832,6 +881,7 @@ class Transformer(eqx.Module):
         temperature: float = 1.0,
         top_k: int | None = None,
         key: PRNGKeyArray | None = None,
+        jit_level: int | None = None,
     ) -> Array:
         """Generate a sequence autoregressively with KV caching.
 
@@ -841,6 +891,7 @@ class Transformer(eqx.Module):
             temperature: Sampling temperature
             top_k: Optional top-k sampling parameter
             key: PRNG key for sampling
+            jit_level: JIT level for the scan function
 
         Returns:
             Generated sequence of shape (prompt_len + max_len,)
@@ -856,7 +907,8 @@ class Transformer(eqx.Module):
 
         # Initialize cache with prompt
         cache = self.init_cache()
-
+        mask = self.init_mask(prompt_len, add_cache=True, batch_dim=False)
+        _, cache = self.encode(prompt_seq, cache=cache, mask=mask)
 
         # Define scan function for autoregressive generation
         def scan_fn(
@@ -884,5 +936,5 @@ class Transformer(eqx.Module):
             return (new_output_seq, pos + 1, new_cache, rng), next_token
 
         init_carry = (output_seq, prompt_len - 1, cache, key)
-        (final_seq, _, _, _), _ =
+        (final_seq, _, _, _), _ = xax_scan(scan_fn, init_carry, length=max_len, jit_level=jit_level)
         return final_seq
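To summarize the new masking scheme: `local_window_size` is set to `context_length - 1`, and `init_mask` builds a banded causal mask in which position i may attend to positions i - local_window_size through i; with `add_cache=True` the mask gains `local_window_size` extra columns on the left so it lines up with the fixed-length KV cache, which is how the generation path now primes the cache (`init_mask(prompt_len, add_cache=True)` followed by `encode(..., mask=mask)`). A standalone sketch of the same banding logic, with an illustrative helper name and example values (not part of the package):

import jax.numpy as jnp

def banded_causal_mask(seq_len: int, local_window_size: int | None, add_cache: bool = False) -> jnp.ndarray:
    # Mirrors the mask construction added to SelfAttentionBlock.init_mask above.
    t, s, o = seq_len, seq_len, 0
    if add_cache:
        assert local_window_size is not None, "caching requires a fixed window"
        s += local_window_size  # extra columns for the cached keys/values
        o -= local_window_size  # shift the band so the cache sits to the left
    mask = jnp.tril(jnp.ones((t, s), dtype=jnp.bool_), k=-o)
    if local_window_size is not None:
        # Drop everything more than local_window_size positions in the past.
        mask = mask & ~jnp.tril(jnp.ones((t, s), dtype=jnp.bool_), k=-(local_window_size + 1 + o))
    return mask

# Row i attends to columns max(0, i - 2) .. i, i.e. a window of 3 positions.
print(banded_causal_mask(4, local_window_size=2).astype(jnp.int32))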
xax/nn/embeddings.py
CHANGED
@@ -33,10 +33,10 @@ class LearnedPositionalEmbeddings(eqx.Module):
         learnable: Whether the embeddings are learnable.
     """
 
-    max_tsz: int = eqx.field(
-    embed_dim: int = eqx.field(
-    learnable: bool = eqx.field(
-    embeddings_tc: Array
+    max_tsz: int = eqx.field()
+    embed_dim: int = eqx.field()
+    learnable: bool = eqx.field()
+    embeddings_tc: Array = eqx.field()
 
     def __init__(
         self,
@@ -74,10 +74,10 @@ class SinusoidalEmbeddings(eqx.Module):
         base: The base for the sinusoidal embeddings.
     """
 
-    base: int = eqx.field(
-    max_tsz: int | None = eqx.field(
-    embed_dim: int | None = eqx.field(
-    embeddings_tc: Array | None
+    base: int = eqx.field()
+    max_tsz: int | None = eqx.field()
+    embed_dim: int | None = eqx.field()
+    embeddings_tc: Array | None = eqx.field()
 
     def __init__(
         self,
@@ -91,8 +91,8 @@ class SinusoidalEmbeddings(eqx.Module):
         self.max_tsz = max_tsz
         self.embed_dim = embed_dim
         self.base = base
+        self.embeddings_tc = None
 
-        self.embeddings_tc: Array | None = None
         if learnable:
             assert max_tsz is not None, "Learnable parameters require `max_tsz` to be set"
             assert embed_dim is not None, "Learnable parameters require `embed_dim` to be set"
@@ -192,7 +192,7 @@ class RotaryEmbeddings(eqx.Module):
         base: The base for the sinusoidal embeddings.
     """
 
-    base: int = eqx.field(
+    base: int = eqx.field()
 
     def __init__(self, base: int = 10_000) -> None:
         """Defines a rotary embeddings module.
xax/nn/geom.py
CHANGED
@@ -207,7 +207,7 @@ def quat_to_rotmat(quat: Array, eps: float = 1e-6) -> Array:
 
 def normalize(v: jnp.ndarray, axis: int = -1, eps: float = 1e-8) -> jnp.ndarray:
     norm = jnp.linalg.norm(v, axis=axis, keepdims=True)
-    return v / jnp.clip(norm,
+    return v / jnp.clip(norm, min=eps)
 
 
 def rotation6d_to_rotation_matrix(r6d: jnp.ndarray) -> jnp.ndarray:
@@ -299,28 +299,28 @@ def rotation_matrix_to_quat(rotation_matrix: Array, eps: float = 1e-6) -> Array:
     trace = m00 + m11 + m22
 
     # Case 0: trace is positive
-    s0 = jnp.sqrt(jnp.clip(trace + 1.0,
+    s0 = jnp.sqrt(jnp.clip(trace + 1.0, min=0.0)) * 2.0  # S = 4 * qw
     w0 = 0.25 * s0
     x0 = (m21 - m12) / jnp.where(s0 < eps, 1.0, s0)
     y0 = (m02 - m20) / jnp.where(s0 < eps, 1.0, s0)
     z0 = (m10 - m01) / jnp.where(s0 < eps, 1.0, s0)
 
     # Case 1: m00 is the largest diagonal term
-    s1 = jnp.sqrt(jnp.clip(1.0 + m00 - m11 - m22,
+    s1 = jnp.sqrt(jnp.clip(1.0 + m00 - m11 - m22, min=0.0)) * 2.0  # S = 4 * qx
     w1 = (m21 - m12) / jnp.where(s1 < eps, 1.0, s1)
     x1 = 0.25 * s1
     y1 = (m01 + m10) / jnp.where(s1 < eps, 1.0, s1)
     z1 = (m02 + m20) / jnp.where(s1 < eps, 1.0, s1)
 
     # Case 2: m11 is the largest diagonal term
-    s2 = jnp.sqrt(jnp.clip(1.0 + m11 - m00 - m22,
+    s2 = jnp.sqrt(jnp.clip(1.0 + m11 - m00 - m22, min=0.0)) * 2.0  # S = 4 * qy
     w2 = (m02 - m20) / jnp.where(s2 < eps, 1.0, s2)
     x2 = (m01 + m10) / jnp.where(s2 < eps, 1.0, s2)
     y2 = 0.25 * s2
     z2 = (m12 + m21) / jnp.where(s2 < eps, 1.0, s2)
 
     # Case 3: m22 is the largest diagonal term
-    s3 = jnp.sqrt(jnp.clip(1.0 + m22 - m00 - m11,
+    s3 = jnp.sqrt(jnp.clip(1.0 + m22 - m00 - m11, min=0.0)) * 2.0  # S = 4 * qz
     w3 = (m10 - m01) / jnp.where(s3 < eps, 1.0, s3)
     x3 = (m02 + m20) / jnp.where(s3 < eps, 1.0, s3)
     y3 = (m12 + m21) / jnp.where(s3 < eps, 1.0, s3)
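These hunks pass the lower clip bound as `jnp.clip(..., min=...)`. The guard pattern itself is simple: the norm (or radicand) is clamped from below so zero-length vectors and negative round-off do not produce NaNs. A minimal sketch mirroring the updated `normalize` (example inputs are illustrative):

import jax.numpy as jnp

def normalize(v: jnp.ndarray, axis: int = -1, eps: float = 1e-8) -> jnp.ndarray:
    # Same guard as xax.nn.geom.normalize: the norm is clamped to at least eps
    # before dividing, so an all-zero vector maps to zeros instead of NaNs.
    norm = jnp.linalg.norm(v, axis=axis, keepdims=True)
    return v / jnp.clip(norm, min=eps)

print(normalize(jnp.array([3.0, 4.0])))  # [0.6 0.8]
print(normalize(jnp.zeros(3)))           # [0. 0. 0.] rather than NaN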
xax/nn/ssm.py
CHANGED
@@ -222,12 +222,12 @@ class DiscreteDiagSSMBlock(DiagSSMBlock):
 
 
 class SSM(eqx.Module):
-    vocab_embedding: eqx.nn.Embedding
-    output_layer: eqx.nn.Linear
-    blocks: list[BaseSSMBlock]
-    num_layers: int = eqx.
-    hidden_size: int = eqx.
-    skip_connections: bool = eqx.
+    vocab_embedding: eqx.nn.Embedding = eqx.field()
+    output_layer: eqx.nn.Linear = eqx.field()
+    blocks: list[BaseSSMBlock] = eqx.field()
+    num_layers: int = eqx.field()
+    hidden_size: int = eqx.field()
+    skip_connections: bool = eqx.field()
 
     def __init__(
         self,
xax/task/mixins/train.py
CHANGED
@@ -40,7 +40,12 @@ from xax.core.state import Phase, State
 from xax.nn.functions import set_random_seed
 from xax.nn.parallel import is_master
 from xax.task.mixins.artifacts import ArtifactsConfig, ArtifactsMixin
-from xax.task.mixins.checkpointing import
+from xax.task.mixins.checkpointing import (
+    CheckpointingConfig,
+    CheckpointingMixin,
+    CheckpointPart,
+    load_ckpt,
+)
 from xax.task.mixins.data_loader import DataloadersConfig, DataloadersMixin
 from xax.task.mixins.logger import LoggerConfig, LoggerMixin
 from xax.task.mixins.runnable import RunnableConfig, RunnableMixin
xax/utils/pytree.py
CHANGED
@@ -253,3 +253,16 @@ def tuple_insert(t: tuple[T, ...], index: int, value: T) -> tuple[T, ...]:
     mut = list(t)
     mut[index] = value
     return tuple(mut)
+
+
+def get_pytree_mapping(pytree: PyTree) -> dict[str, Array]:
+    leaves: dict[str, Array] = {}
+
+    def _get_leaf(path: tuple, x: PyTree) -> None:
+        if isinstance(x, jnp.ndarray):
+            # Convert path tuple to string, e.g. (1, 'a', 2) -> '1/a/2'
+            path_str = "/".join(str(p) for p in path)
+            leaves[path_str] = x
+
+    jax.tree.map_with_path(_get_leaf, pytree)
+    return leaves
{xax-0.3.4.dist-info → xax-0.3.6.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-xax/__init__.py,sha256=
+xax/__init__.py,sha256=9i6UlrAP1wLDh1lod-4ETWll4pcADIir_Tk3O6OvH7g,16336
 xax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/requirements-dev.txt,sha256=qkscNkFzWd1S5fump-AKH53rR65v2x5FmboFdy_kKvs,128
 xax/requirements.txt,sha256=6qY-84e-sTmlfJNrSjwONQKqzAn5h8G_oGIhnhmfSr4,302
@@ -8,14 +8,14 @@ xax/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/core/conf.py,sha256=d7Dp_GwKnaxtkztlSrJSM_LR0UYJX_FWTtceIWCBkxc,5138
 xax/core/state.py,sha256=_gtINsRc310Bu_HuIYsDoOKTZa6DgU2tz0IOKkdnY9Q,3813
 xax/nn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-xax/nn/attention.py,sha256=
-xax/nn/embeddings.py,sha256=
+xax/nn/attention.py,sha256=m6yEoRqf7-wLgrEltaR6CxF_Cody0MaNtAkuKk39qJI,31176
+xax/nn/embeddings.py,sha256=8tAuAPdkVj-U5IwtRZKHA0WYMFRbpCuwyAxcChdKhbE,11784
 xax/nn/functions.py,sha256=bA5kJYzMtFM8eUqBC086i355zJMAO7k_vPFNSDBI9-s,2814
-xax/nn/geom.py,sha256=
+xax/nn/geom.py,sha256=c9K52vLm-V-15CRqMNx0OmqsWfb3PHQxXW4OSx9kCAk,10635
 xax/nn/losses.py,sha256=Q_NVnm5n4UPBvp5nI_1aUptfXnqFYoUeFwySiyvopHg,272
 xax/nn/metrics.py,sha256=zuvPXlRQczBTLHD4ilNGmZaiq6Yie3rxCMq6JkI_kos,3154
 xax/nn/parallel.py,sha256=fnTiT7MsG7eQrJvqwjIz2Ifo3P27TuxIJzmpGYSa_dQ,4608
-xax/nn/ssm.py,sha256=
+xax/nn/ssm.py,sha256=qSBv_FobnaFA5jt87OF5P2q5ih6sj4SlehhEhEFaPjA,10766
 xax/task/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/task/base.py,sha256=i6FRJ75aqlekWkzJNRWDUEX7P514pUjLVuxjhX1GBgw,8198
 xax/task/logger.py,sha256=Bmhl4mv08Aq49ZyX6BdjPIsPJK28e8s3mVFatM4IY2Q,41060
@@ -42,7 +42,7 @@ xax/task/mixins/logger.py,sha256=6oXsJJyNUx6YT3q58FVXMZBUpMgjVkGre6BXFN20cVI,280
 xax/task/mixins/process.py,sha256=hqDEsMp_SL6ee97iq26-G0g49OcWZZaX82JD4F22eJU,1781
 xax/task/mixins/runnable.py,sha256=pcLrYc_TycZUY9zZim05Skc2FWk3IZKFnu6p3UDMonM,1966
 xax/task/mixins/step_wrapper.py,sha256=-Yu5Nft2CRw1JvZt6J_94SM1vqX8fk08IDK95Pmd2ew,1648
-xax/task/mixins/train.py,sha256=
+xax/task/mixins/train.py,sha256=bjBoigTCjbq9H4hcqIO32irHBc9rC2zkgXrnGNI2RtI,33266
 xax/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/utils/debugging.py,sha256=OtUdu-3tQsQtik0Q9UM-SNV46IbPjwrAfZcywzoB5d4,1940
 xax/utils/experiments.py,sha256=5k5hPYSaVjzoR_nm2Q3DAHMMYi3Bcp3N3PAQbwZq7Gg,29830
@@ -51,7 +51,7 @@ xax/utils/jaxpr.py,sha256=H7pWl48ROXIB1-ZPWYfOn-ou3EBMxYWIwc_A0reJQoo,2333
 xax/utils/logging.py,sha256=Kkyma_LJXqrN2HTQ214gRP_9ih3_bKk115MWC60lQWM,6656
 xax/utils/numpy.py,sha256=_jOXVi-d2AtJnRftPkRK5MDMzsU8slgw-Jjv4GRm6ns,1197
 xax/utils/profile.py,sha256=-aFdWpgYFvBsBZXSLL4zXrFe3zzsDqzmx4q5f2WOtpQ,1628
-xax/utils/pytree.py,sha256=
+xax/utils/pytree.py,sha256=cLZRSd5xc-DqcbRfWnBy87pAiUU5fT8U4CHoLi_i_v4,9642
 xax/utils/tensorboard.py,sha256=P0oIFvX2Qts1H4lkpizhRIpQdD0MNppVMeut0Z94yCs,19878
 xax/utils/text.py,sha256=xS02aSzdywl3KIaNSpKWcxdd37oYlUJtu9wIjkc1wVc,10654
 xax/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -59,9 +59,9 @@ xax/utils/data/collate.py,sha256=Rd9vMomr_S_zCa_Hi4dO-8ntzAfVwndIUtuXFA3iNcc,706
 xax/utils/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/utils/types/frozen_dict.py,sha256=ebtHENhyUzSjyJTlbMaLtcckQIJ7EtgJiok_40TJZpo,4689
 xax/utils/types/hashable_array.py,sha256=l5iIcFmkYzfGeaZmcSoeFkthFASqM8xJYK3AXhZQYwc,992
-xax-0.3.
-xax-0.3.
-xax-0.3.
-xax-0.3.
-xax-0.3.
-xax-0.3.
+xax-0.3.6.dist-info/licenses/LICENSE,sha256=HCN2bImAzUOXldAZZI7JZ9PYq6OwMlDAP_PpX1HnuN0,1071
+xax-0.3.6.dist-info/METADATA,sha256=PI1onOBOY7vwwjDdg_fDoQIDSQ6tyUfwDK3nPnE_fcE,1246
+xax-0.3.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+xax-0.3.6.dist-info/entry_points.txt,sha256=uRC6rx5ce0bf-FblJaZSBMxxKFfMyoWTf8OWbBmLSe8,61
+xax-0.3.6.dist-info/top_level.txt,sha256=g4Au_r2XhvZ-lTybviH-Fh9g0zF4DAYHYxPue1-xbs8,4
+xax-0.3.6.dist-info/RECORD,,
{xax-0.3.4.dist-info → xax-0.3.6.dist-info}/WHEEL
File without changes
{xax-0.3.4.dist-info → xax-0.3.6.dist-info}/entry_points.txt
File without changes
{xax-0.3.4.dist-info → xax-0.3.6.dist-info}/licenses/LICENSE
File without changes
{xax-0.3.4.dist-info → xax-0.3.6.dist-info}/top_level.txt
File without changes