xax 0.3.4__tar.gz → 0.3.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {xax-0.3.4/xax.egg-info → xax-0.3.9}/PKG-INFO +1 -1
  2. {xax-0.3.4 → xax-0.3.9}/xax/__init__.py +13 -1
  3. {xax-0.3.4 → xax-0.3.9}/xax/nn/attention.py +144 -92
  4. xax-0.3.9/xax/nn/distributions.py +181 -0
  5. {xax-0.3.4 → xax-0.3.9}/xax/nn/embeddings.py +10 -10
  6. {xax-0.3.4 → xax-0.3.9}/xax/nn/geom.py +5 -5
  7. {xax-0.3.4 → xax-0.3.9}/xax/nn/ssm.py +6 -6
  8. {xax-0.3.4 → xax-0.3.9}/xax/task/mixins/data_loader.py +7 -2
  9. {xax-0.3.4 → xax-0.3.9}/xax/task/mixins/train.py +51 -58
  10. {xax-0.3.4 → xax-0.3.9}/xax/utils/pytree.py +13 -0
  11. {xax-0.3.4 → xax-0.3.9/xax.egg-info}/PKG-INFO +1 -1
  12. {xax-0.3.4 → xax-0.3.9}/xax.egg-info/SOURCES.txt +1 -0
  13. {xax-0.3.4 → xax-0.3.9}/LICENSE +0 -0
  14. {xax-0.3.4 → xax-0.3.9}/MANIFEST.in +0 -0
  15. {xax-0.3.4 → xax-0.3.9}/README.md +0 -0
  16. {xax-0.3.4 → xax-0.3.9}/pyproject.toml +0 -0
  17. {xax-0.3.4 → xax-0.3.9}/setup.cfg +0 -0
  18. {xax-0.3.4 → xax-0.3.9}/setup.py +0 -0
  19. {xax-0.3.4 → xax-0.3.9}/xax/cli/__init__.py +0 -0
  20. {xax-0.3.4 → xax-0.3.9}/xax/cli/edit_config.py +0 -0
  21. {xax-0.3.4 → xax-0.3.9}/xax/core/__init__.py +0 -0
  22. {xax-0.3.4 → xax-0.3.9}/xax/core/conf.py +0 -0
  23. {xax-0.3.4 → xax-0.3.9}/xax/core/state.py +0 -0
  24. {xax-0.3.4 → xax-0.3.9}/xax/nn/__init__.py +0 -0
  25. {xax-0.3.4 → xax-0.3.9}/xax/nn/functions.py +0 -0
  26. {xax-0.3.4 → xax-0.3.9}/xax/nn/losses.py +0 -0
  27. {xax-0.3.4 → xax-0.3.9}/xax/nn/metrics.py +0 -0
  28. {xax-0.3.4 → xax-0.3.9}/xax/nn/parallel.py +0 -0
  29. {xax-0.3.4 → xax-0.3.9}/xax/py.typed +0 -0
  30. {xax-0.3.4 → xax-0.3.9}/xax/requirements-dev.txt +0 -0
  31. {xax-0.3.4 → xax-0.3.9}/xax/requirements.txt +0 -0
  32. {xax-0.3.4 → xax-0.3.9}/xax/task/__init__.py +0 -0
  33. {xax-0.3.4 → xax-0.3.9}/xax/task/base.py +0 -0
  34. {xax-0.3.4 → xax-0.3.9}/xax/task/launchers/__init__.py +0 -0
  35. {xax-0.3.4 → xax-0.3.9}/xax/task/launchers/base.py +0 -0
  36. {xax-0.3.4 → xax-0.3.9}/xax/task/launchers/cli.py +0 -0
  37. {xax-0.3.4 → xax-0.3.9}/xax/task/launchers/single_process.py +0 -0
  38. {xax-0.3.4 → xax-0.3.9}/xax/task/logger.py +0 -0
  39. {xax-0.3.4 → xax-0.3.9}/xax/task/loggers/__init__.py +0 -0
  40. {xax-0.3.4 → xax-0.3.9}/xax/task/loggers/callback.py +0 -0
  41. {xax-0.3.4 → xax-0.3.9}/xax/task/loggers/json.py +0 -0
  42. {xax-0.3.4 → xax-0.3.9}/xax/task/loggers/state.py +0 -0
  43. {xax-0.3.4 → xax-0.3.9}/xax/task/loggers/stdout.py +0 -0
  44. {xax-0.3.4 → xax-0.3.9}/xax/task/loggers/tensorboard.py +0 -0
  45. {xax-0.3.4 → xax-0.3.9}/xax/task/mixins/__init__.py +0 -0
  46. {xax-0.3.4 → xax-0.3.9}/xax/task/mixins/artifacts.py +0 -0
  47. {xax-0.3.4 → xax-0.3.9}/xax/task/mixins/checkpointing.py +0 -0
  48. {xax-0.3.4 → xax-0.3.9}/xax/task/mixins/compile.py +0 -0
  49. {xax-0.3.4 → xax-0.3.9}/xax/task/mixins/cpu_stats.py +0 -0
  50. {xax-0.3.4 → xax-0.3.9}/xax/task/mixins/gpu_stats.py +0 -0
  51. {xax-0.3.4 → xax-0.3.9}/xax/task/mixins/logger.py +0 -0
  52. {xax-0.3.4 → xax-0.3.9}/xax/task/mixins/process.py +0 -0
  53. {xax-0.3.4 → xax-0.3.9}/xax/task/mixins/runnable.py +0 -0
  54. {xax-0.3.4 → xax-0.3.9}/xax/task/mixins/step_wrapper.py +0 -0
  55. {xax-0.3.4 → xax-0.3.9}/xax/task/script.py +0 -0
  56. {xax-0.3.4 → xax-0.3.9}/xax/task/task.py +0 -0
  57. {xax-0.3.4 → xax-0.3.9}/xax/utils/__init__.py +0 -0
  58. {xax-0.3.4 → xax-0.3.9}/xax/utils/data/__init__.py +0 -0
  59. {xax-0.3.4 → xax-0.3.9}/xax/utils/data/collate.py +0 -0
  60. {xax-0.3.4 → xax-0.3.9}/xax/utils/debugging.py +0 -0
  61. {xax-0.3.4 → xax-0.3.9}/xax/utils/experiments.py +0 -0
  62. {xax-0.3.4 → xax-0.3.9}/xax/utils/jax.py +0 -0
  63. {xax-0.3.4 → xax-0.3.9}/xax/utils/jaxpr.py +0 -0
  64. {xax-0.3.4 → xax-0.3.9}/xax/utils/logging.py +0 -0
  65. {xax-0.3.4 → xax-0.3.9}/xax/utils/numpy.py +0 -0
  66. {xax-0.3.4 → xax-0.3.9}/xax/utils/profile.py +0 -0
  67. {xax-0.3.4 → xax-0.3.9}/xax/utils/tensorboard.py +0 -0
  68. {xax-0.3.4 → xax-0.3.9}/xax/utils/text.py +0 -0
  69. {xax-0.3.4 → xax-0.3.9}/xax/utils/types/__init__.py +0 -0
  70. {xax-0.3.4 → xax-0.3.9}/xax/utils/types/frozen_dict.py +0 -0
  71. {xax-0.3.4 → xax-0.3.9}/xax/utils/types/hashable_array.py +0 -0
  72. {xax-0.3.4 → xax-0.3.9}/xax.egg-info/dependency_links.txt +0 -0
  73. {xax-0.3.4 → xax-0.3.9}/xax.egg-info/entry_points.txt +0 -0
  74. {xax-0.3.4 → xax-0.3.9}/xax.egg-info/requires.txt +0 -0
  75. {xax-0.3.4 → xax-0.3.9}/xax.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xax
-Version: 0.3.4
+Version: 0.3.9
 Summary: A library for fast Jax experimentation
 Home-page: https://github.com/kscalelabs/xax
 Author: Benjamin Bolte
xax/__init__.py
@@ -12,7 +12,7 @@ and running the update script:
 python -m scripts.update_api --inplace
 """
 
-__version__ = "0.3.4"
+__version__ = "0.3.9"
 
 # This list shouldn't be modified by hand; instead, run the update script.
 __all__ = [
@@ -31,6 +31,10 @@ __all__ = [
     "TransformerBlock",
     "TransformerCache",
     "TransformerStack",
+    "Categorical",
+    "Distribution",
+    "MixtureOfGaussians",
+    "Normal",
     "FourierEmbeddings",
     "IdentityPositionalEmbeddings",
     "LearnedPositionalEmbeddings",
@@ -136,6 +140,7 @@ __all__ = [
     "compute_nan_ratio",
     "flatten_array",
     "flatten_pytree",
+    "get_pytree_mapping",
     "get_pytree_param_count",
     "pytree_has_nans",
     "reshuffle_pytree",
@@ -218,6 +223,10 @@ NAME_MAP: dict[str, str] = {
     "TransformerBlock": "nn.attention",
     "TransformerCache": "nn.attention",
     "TransformerStack": "nn.attention",
+    "Categorical": "nn.distributions",
+    "Distribution": "nn.distributions",
+    "MixtureOfGaussians": "nn.distributions",
+    "Normal": "nn.distributions",
     "FourierEmbeddings": "nn.embeddings",
     "IdentityPositionalEmbeddings": "nn.embeddings",
     "LearnedPositionalEmbeddings": "nn.embeddings",
@@ -323,6 +332,7 @@ NAME_MAP: dict[str, str] = {
     "compute_nan_ratio": "utils.pytree",
     "flatten_array": "utils.pytree",
     "flatten_pytree": "utils.pytree",
+    "get_pytree_mapping": "utils.pytree",
     "get_pytree_param_count": "utils.pytree",
     "pytree_has_nans": "utils.pytree",
     "reshuffle_pytree": "utils.pytree",
@@ -403,6 +413,7 @@ if IMPORT_ALL or TYPE_CHECKING:
         TransformerCache,
         TransformerStack,
     )
+    from xax.nn.distributions import Categorical, Distribution, MixtureOfGaussians, Normal
     from xax.nn.embeddings import (
         EmbeddingKind,
         FourierEmbeddings,
@@ -509,6 +520,7 @@ if IMPORT_ALL or TYPE_CHECKING:
         compute_nan_ratio,
         flatten_array,
         flatten_pytree,
+        get_pytree_mapping,
         get_pytree_param_count,
         pytree_has_nans,
         reshuffle_pytree,
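
The new distribution classes and the pytree helper are registered in both __all__ and NAME_MAP, so they resolve from the top-level package as well as from their defining modules. A minimal import sketch, assuming only what this diff shows (the constructor signatures of the new classes are not visible here):

    # Illustrative only: names come from the __all__ / NAME_MAP entries above.
    import xax

    print(xax.NAME_MAP["Normal"])              # "nn.distributions"
    print(xax.NAME_MAP["get_pytree_mapping"])  # "utils.pytree"

    # Equivalent direct imports:
    from xax.nn.distributions import Categorical, Distribution, MixtureOfGaussians, Normal
    from xax.utils.pytree import get_pytree_mapping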
xax/nn/attention.py
@@ -5,6 +5,8 @@ supporting a fixed-size context window and caching that can be used to train
 transformers which can be unrolled with a fixed-length cache.
 """
 
+import math
+import warnings
 from typing import NotRequired, TypedDict
 
 import chex
@@ -13,6 +15,8 @@ import jax
 import jax.numpy as jnp
 from jaxtyping import Array, PRNGKeyArray
 
+from xax.utils.jax import scan as xax_scan
+
 
 class RotaryEmbedding(eqx.Module):
     """Rotary Position Embedding (RoPE) for transformer attention.
@@ -22,8 +26,8 @@ class RotaryEmbedding(eqx.Module):
     https://arxiv.org/abs/2104.09864
     """
 
-    head_dim: int = eqx.static_field()
-    base: float = eqx.static_field()
+    head_dim: int = eqx.field()
+    base: float = eqx.field()
 
     def __init__(
         self,
@@ -125,15 +129,15 @@ class TransformerCache(TypedDict):
 class SelfAttentionBlock(eqx.Module):
     """Self-attention block using jax.nn.dot_product_attention."""
 
-    q_proj: eqx.nn.Linear
-    k_proj: eqx.nn.Linear
-    v_proj: eqx.nn.Linear
-    output_proj: eqx.nn.Linear
-    rotary_emb: RotaryEmbedding | None
-    num_heads: int = eqx.static_field()
-    head_dim: int = eqx.static_field()
-    causal: bool = eqx.static_field()
-    context_length: int | None = eqx.static_field()
+    q_proj: eqx.nn.Linear = eqx.field()
+    k_proj: eqx.nn.Linear = eqx.field()
+    v_proj: eqx.nn.Linear = eqx.field()
+    output_proj: eqx.nn.Linear = eqx.field()
+    rotary_emb: RotaryEmbedding | None = eqx.field()
+    num_heads: int = eqx.field()
+    head_dim: int = eqx.field()
+    causal: bool = eqx.field()
+    local_window_size: int | None = eqx.field()
 
     def __init__(
         self,
@@ -169,8 +173,12 @@ class SelfAttentionBlock(eqx.Module):
         else:
             self.rotary_emb = None
 
+        if context_length is not None and not causal:
+            warnings.warn("context_length is set but causal is False; overriding causal to True", stacklevel=2)
+            causal = True
+
         self.causal = causal
-        self.context_length = context_length
+        self.local_window_size = None if context_length is None else context_length - 1
 
     @property
     def embed_dim(self) -> int:
@@ -195,28 +203,44 @@ class SelfAttentionBlock(eqx.Module):
         Returns:
             Cache with fixed-length k and v tensors
         """
-        if self.context_length is None:
+        if self.local_window_size is None:
             raise ValueError("context_length must be set for caching")
 
         # Create fixed-length cache
-        k_cache = jnp.zeros((self.context_length - 1, self.num_heads, self.head_dim), dtype=dtype)
-        v_cache = jnp.zeros((self.context_length - 1, self.num_heads, self.head_dim), dtype=dtype)
+        k_cache = jnp.zeros((self.local_window_size, self.num_heads, self.head_dim), dtype=dtype)
+        v_cache = jnp.zeros((self.local_window_size, self.num_heads, self.head_dim), dtype=dtype)
 
         return {"k": k_cache, "v": v_cache, "position": 0}
 
-    def init_mask(self, seq_len: int, with_cache: bool = True) -> Array:
-        in_dim, out_dim = seq_len, seq_len
-        if with_cache:
-            if self.context_length is None:
-                raise ValueError("context_length must be set for caching")
-            in_dim = in_dim + self.context_length - 1
+    def init_mask(
+        self,
+        seq_len: int,
+        add_cache: bool = False,
+        batch_dim: bool = False,
+    ) -> Array:
+        """Initialize the attention matrix mask.
 
-        mask = jnp.tril(jnp.ones((in_dim, out_dim)))
-        if self.context_length is not None:
-            neg_mask = 1 - jnp.tril(jnp.ones((in_dim, out_dim)), -self.context_length)
-            mask = mask * neg_mask
+        Args:
+            seq_len: The length of the sequence
+            add_cache: Whether to add the cache to the mask
+            batch_dim: Whether to add a batch dimension to the mask
 
-        return mask.astype(jnp.bool_).transpose()
+        Returns:
+            The attention matrix mask of shape (bsz, 1, seq_len, seq_len + cache_len)
+            if batch_dim is True, otherwise (seq_len, seq_len + cache_len).
+        """
+        t, s, o = seq_len, seq_len, 0
+        if add_cache:
+            if self.local_window_size is None:
+                raise ValueError("local_window_size must be set for caching")
+            s += self.local_window_size
+            o -= self.local_window_size
+        mask = jnp.tril(jnp.ones((t, s), dtype=jnp.bool_), k=-o)
+        if self.local_window_size is not None:
+            neg_mask = ~jnp.tril(jnp.ones((t, s), dtype=jnp.bool_), k=-(self.local_window_size + 1 + o))
+            mask = mask & neg_mask
+        mask = mask.reshape(1, 1, t, s) if batch_dim else mask.reshape(t, s)
+        return mask
 
     def forward(
         self,
@@ -229,7 +253,8 @@ class SelfAttentionBlock(eqx.Module):
 
         Args:
             x_tn: Input tensor of shape (seq_len, embed_dim)
-            mask: Optional mask tensor
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: The cached key and value tensors (fixed-length)
 
         Returns:
@@ -263,25 +288,36 @@ class SelfAttentionBlock(eqx.Module):
             v_cache = cache["v"]
             k = jnp.concatenate([k_cache, k], axis=0)
             v = jnp.concatenate([v_cache, v], axis=0)
+
             new_position = cache["position"] + seq_len
 
         else:
             new_position = seq_len
 
-        attn_output = jax.nn.dot_product_attention(
-            q,
-            k,
-            v,
-            mask=mask,
-            is_causal=self.causal and mask is None,
-        )
+        if seq_len == 1:
+            attn_output = jax.nn.dot_product_attention(q, k, v)
+
+        elif mask is not None:
+            attn_output = jax.nn.dot_product_attention(q, k, v, mask=mask)
+
+        elif cache is not None:
+            raise NotImplementedError("For training with a cache, provide a mask instead.")
+
+        else:
+            attn_output = jax.nn.dot_product_attention(
+                q,
+                k,
+                v,
+                is_causal=self.causal,
+                local_window_size=(self.local_window_size, 0) if self.local_window_size is not None else None,
+            )
 
         attn_output = self._combine_heads(attn_output)
         output = jax.vmap(self.output_proj)(attn_output)
 
-        if self.context_length is not None:
-            k = k[-(self.context_length - 1) :]
-            v = v[-(self.context_length - 1) :]
+        if self.local_window_size is not None:
+            k = k[-self.local_window_size :]
+            v = v[-self.local_window_size :]
 
         return output, {"k": k, "v": v, "position": new_position}
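
The maskless branch above relies on the sliding-window support built into jax.nn.dot_product_attention. A small sketch with illustrative shapes, under the assumption (consistent with how the diff uses it) that local_window_size=(left, right) lets each query see at most left keys behind it and right keys ahead:

    import jax
    import jax.numpy as jnp

    T, H, D, window = 6, 2, 8, 3    # window == context_length - 1
    q = k = v = jax.random.normal(jax.random.PRNGKey(0), (1, T, H, D))

    # Explicit banded mask, broadcastable to (batch, heads, q_len, kv_len).
    i, j = jnp.arange(T)[:, None], jnp.arange(T)[None, :]
    banded = (j <= i) & (j >= i - window)
    out_masked = jax.nn.dot_product_attention(q, k, v, mask=banded[None, None])

    # Same pattern expressed via is_causal + local_window_size, as in the new branch.
    out_local = jax.nn.dot_product_attention(
        q, k, v, is_causal=True, local_window_size=(window, 0)
    )
    print(jnp.allclose(out_masked, out_local, atol=1e-5))  # expected: True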
 
@@ -294,8 +330,8 @@ class CrossAttentionBlock(eqx.Module):
     v_proj: eqx.nn.Linear
     output_proj: eqx.nn.Linear
     rotary_emb: RotaryEmbedding | None
-    num_heads: int = eqx.static_field()
-    head_dim: int = eqx.static_field()
+    num_heads: int = eqx.field()
+    head_dim: int = eqx.field()
 
     def __init__(
         self,
@@ -352,7 +388,6 @@ class CrossAttentionBlock(eqx.Module):
         *,
         kv_sn: Array | None = None,
         cache: AttentionCache | None = None,
-        mask: Array | None = None,
     ) -> tuple[Array, AttentionCache]:
         """Apply cross-attention.
 
@@ -362,7 +397,6 @@ class CrossAttentionBlock(eqx.Module):
                 If not provided, then `cache` must be provided.
             cache: The cached key and value tensors. If not provided, then
                 `kv_sn` must be provided.
-            mask: Optional mask tensor
 
         Returns:
             The output tensor of shape (q_seq_len, embed_dim)
@@ -404,7 +438,7 @@ class CrossAttentionBlock(eqx.Module):
             q_rot,
             k_rot,
             v,
-            mask=mask,
+            scale=1.0 / math.sqrt(self.head_dim),
             is_causal=False,
         )
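
With the cross-attention mask argument gone, the only explicit tuning left on this call is the scale. jax.nn.dot_product_attention already defaults to 1/sqrt(head_dim) when scale is None, so passing it explicitly should match the default numerically; a quick check with illustrative shapes:

    import math

    import jax
    import jax.numpy as jnp

    S, H, D = 5, 2, 16
    q = k = v = jax.random.normal(jax.random.PRNGKey(1), (1, S, H, D))

    out_default = jax.nn.dot_product_attention(q, k, v, is_causal=False)
    out_explicit = jax.nn.dot_product_attention(q, k, v, scale=1.0 / math.sqrt(D), is_causal=False)
    print(jnp.allclose(out_default, out_explicit))  # expected: True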
 
@@ -424,10 +458,10 @@ class TransformerBlock(eqx.Module):
     layer_norm1: eqx.nn.LayerNorm
     layer_norm2: eqx.nn.LayerNorm
     layer_norm3: eqx.nn.LayerNorm | None
-    num_heads: int = eqx.static_field()
-    head_dim: int = eqx.static_field()
-    causal: bool = eqx.static_field()
-    context_length: int | None = eqx.static_field()
+    num_heads: int = eqx.field()
+    head_dim: int = eqx.field()
+    causal: bool = eqx.field()
+    context_length: int | None = eqx.field()
 
     def __init__(
         self,
@@ -500,16 +534,24 @@ class TransformerBlock(eqx.Module):
             cache["cross_attn"] = self.cross_attn.init_cache(kv_sn=context_sn)
         return cache
 
-    def init_mask(self, seq_len: int, with_cache: bool = True) -> Array:
-        return self.self_attn.init_mask(seq_len, with_cache=with_cache)
+    def init_mask(
+        self,
+        seq_len: int,
+        add_cache: bool = False,
+        batch_dim: bool = False,
+    ) -> Array:
+        return self.self_attn.init_mask(
+            seq_len,
+            add_cache=add_cache,
+            batch_dim=batch_dim,
+        )
 
     def forward(
         self,
         x_tn: Array,
         *,
         context_sn: Array | None = None,
-        self_mask: Array | None = None,
-        cross_mask: Array | None = None,
+        mask: Array | None = None,
         cache: AttentionCacheDict | None = None,
     ) -> tuple[Array, AttentionCacheDict]:
         """Apply transformer block.
@@ -517,8 +559,8 @@ class TransformerBlock(eqx.Module):
         Args:
             x_tn: Input tensor of shape (seq_len, embed_dim)
             context_sn: Optional context for cross-attention
-            self_mask: Mask for self-attention
-            cross_mask: Mask for cross-attention
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -531,7 +573,7 @@ class TransformerBlock(eqx.Module):
 
         attn_output, self_attn_cache = self.self_attn.forward(
             x_tn=norm_x,
-            mask=self_mask,
+            mask=mask,
             cache=None if cache is None else cache["self_attn"],
         )
         updated_cache: AttentionCacheDict = {"self_attn": self_attn_cache}
@@ -547,7 +589,6 @@ class TransformerBlock(eqx.Module):
             cross_attn_output, updated_cache["cross_attn"] = self.cross_attn.forward(
                 q_tn=norm_x,
                 kv_sn=context_sn,
-                mask=cross_mask,
                 cache=None if cache is None else cache.get("cross_attn"),
             )
 
@@ -564,9 +605,9 @@ class TransformerBlock(eqx.Module):
 class TransformerStack(eqx.Module):
     """A stack of transformer blocks."""
 
-    layers: list[TransformerBlock]
-    num_layers: int = eqx.static_field()
-    causal: bool = eqx.static_field()
+    layers: tuple[TransformerBlock, ...]
+    num_layers: int = eqx.field()
+    causal: bool = eqx.field()
 
     def __init__(
         self,
@@ -584,7 +625,7 @@ class TransformerStack(eqx.Module):
     ) -> None:
         keys = jax.random.split(key, num_layers)
 
-        self.layers = [
+        self.layers = tuple(
             TransformerBlock(
                 embed_dim=embed_dim,
                 num_heads=num_heads,
@@ -597,7 +638,7 @@ class TransformerStack(eqx.Module):
                 rotary_base=rotary_base,
             )
             for i in range(num_layers)
-        ]
+        )
 
         self.num_layers = num_layers
         self.causal = causal
@@ -609,16 +650,24 @@ class TransformerStack(eqx.Module):
             cache[f"layer_{i}"] = layer.init_cache(dtype=dtype, context_sn=x_tn)
         return {"layers": cache}
 
-    def init_mask(self, seq_len: int, with_cache: bool = True) -> Array:
-        return self.layers[0].init_mask(seq_len, with_cache=with_cache)
+    def init_mask(
+        self,
+        seq_len: int,
+        add_cache: bool = False,
+        batch_dim: bool = False,
+    ) -> Array:
+        return self.layers[0].init_mask(
+            seq_len,
+            add_cache=add_cache,
+            batch_dim=batch_dim,
+        )
 
     def forward(
         self,
         x_tn: Array,
         *,
         context_sn: Array | None = None,
-        self_mask: Array | None = None,
-        cross_mask: Array | None = None,
+        mask: Array | None = None,
         cache: TransformerCache | None = None,
     ) -> tuple[Array, TransformerCache]:
         """Apply transformer stack.
@@ -626,8 +675,8 @@ class TransformerStack(eqx.Module):
         Args:
             x_tn: Input tensor of shape (seq_len, embed_dim)
             context_sn: Optional context for cross-attention
-            self_mask: Mask for self-attention
-            cross_mask: Mask for cross-attention
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -647,8 +696,7 @@ class TransformerStack(eqx.Module):
             x_tn, updated_cache["layers"][f"layer_{i}"] = layer.forward(
                 x_tn,
                 context_sn=context_sn,
-                self_mask=self_mask,
-                cross_mask=cross_mask,
+                mask=mask,
                 cache=layer_cache,
             )
 
@@ -660,9 +708,9 @@ class Transformer(eqx.Module):
     layers: TransformerStack
     output_layer: eqx.nn.Linear | None
     layer_norm: eqx.nn.LayerNorm
-    embed_dim: int = eqx.static_field()
-    causal: bool = eqx.static_field()
-    context_length: int | None = eqx.static_field()
+    embed_dim: int = eqx.field()
+    causal: bool = eqx.field()
+    context_length: int | None = eqx.field()
 
     def __init__(
         self,
@@ -713,8 +761,17 @@ class Transformer(eqx.Module):
         """Initialize cache for the input."""
         return self.layers.init_cache(dtype=dtype, x_tn=x_tn)
 
-    def init_mask(self, seq_len: int, with_cache: bool = True) -> Array:
-        return self.layers.init_mask(seq_len, with_cache=with_cache)
+    def init_mask(
+        self,
+        seq_len: int,
+        add_cache: bool = False,
+        batch_dim: bool = False,
+    ) -> Array:
+        return self.layers.init_mask(
+            seq_len,
+            add_cache=add_cache,
+            batch_dim=batch_dim,
+        )
 
     def encode(
         self,
@@ -727,7 +784,8 @@ class Transformer(eqx.Module):
 
         Args:
             x: Input token indices of shape (seq_len)
-            mask: Optional attention mask
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -737,11 +795,7 @@ class Transformer(eqx.Module):
         x_embedded = jax.vmap(self.token_embedding)(x)
 
         # Apply transformer stack
-        x_embedded, updated_cache = self.layers.forward(
-            x_embedded,
-            self_mask=mask,
-            cache=cache,
-        )
+        x_embedded, updated_cache = self.layers.forward(x_embedded, mask=mask, cache=cache)
 
         # Apply final layer norm
         output = jax.vmap(self.layer_norm)(x_embedded)
@@ -753,8 +807,7 @@ class Transformer(eqx.Module):
         x_t: Array,
         context_s: Array,
         *,
-        self_mask: Array | None = None,
-        cross_mask: Array | None = None,
+        mask: Array | None = None,
         cache: TransformerCache | None = None,
     ) -> tuple[Array, TransformerCache]:
         """Decode with self-attention and cross-attention.
@@ -763,8 +816,8 @@ class Transformer(eqx.Module):
             x_t: Input token indices, shape (seq_len)
             context_s: Context from encoder (token indices or embedded),
                 shape (context_len, embed_dim)
-            self_mask: Optional self-attention mask, shape (seq_len, seq_len)
-            cross_mask: Optional cross-attention mask, shape (seq_len, context_len)
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -780,8 +833,7 @@ class Transformer(eqx.Module):
         x_embedded, updated_cache = self.layers.forward(
             x_embedded,
             context_sn=context_embedded,
-            self_mask=self_mask,
-            cross_mask=cross_mask,
+            mask=mask,
             cache=cache,
         )
 
@@ -801,7 +853,8 @@ class Transformer(eqx.Module):
 
         Args:
             x: Input token indices of shape (seq_len)
-            mask: Optional attention mask
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -809,11 +862,7 @@ class Transformer(eqx.Module):
         """
         chex.assert_rank(x, 1)
 
-        output, updated_cache = self.encode(
-            x,
-            mask=mask,
-            cache=cache,
-        )
+        output, updated_cache = self.encode(x, mask=mask, cache=cache)
 
         # Apply output layer if it exists
         if self.output_layer is not None:
@@ -832,6 +881,7 @@ class Transformer(eqx.Module):
         temperature: float = 1.0,
         top_k: int | None = None,
         key: PRNGKeyArray | None = None,
+        jit_level: int | None = None,
     ) -> Array:
         """Generate a sequence autoregressively with KV caching.
 
@@ -841,6 +891,7 @@ class Transformer(eqx.Module):
             temperature: Sampling temperature
             top_k: Optional top-k sampling parameter
             key: PRNG key for sampling
+            jit_level: JIT level for the scan function
 
         Returns:
             Generated sequence of shape (prompt_len + max_len,)
@@ -856,7 +907,8 @@ class Transformer(eqx.Module):
 
         # Initialize cache with prompt
         cache = self.init_cache()
-        _, cache = self.encode(prompt_seq, cache=cache)
+        mask = self.init_mask(prompt_len, add_cache=True, batch_dim=False)
+        _, cache = self.encode(prompt_seq, cache=cache, mask=mask)
 
         # Define scan function for autoregressive generation
         def scan_fn(
@@ -884,5 +936,5 @@ class Transformer(eqx.Module):
             return (new_output_seq, pos + 1, new_cache, rng), next_token
 
         init_carry = (output_seq, prompt_len - 1, cache, key)
-        (final_seq, _, _, _), _ = jax.lax.scan(scan_fn, init_carry, length=max_len)
+        (final_seq, _, _, _), _ = xax_scan(scan_fn, init_carry, length=max_len, jit_level=jit_level)
         return final_seq
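
For reference, here is a hedged sketch of the prompt-priming step from the caller's side, written as a standalone helper. The model argument is assumed to be an already constructed causal xax.Transformer with context_length set, and only calls visible in this diff (init_cache, init_mask, encode) are used:

    from jaxtyping import Array


    def prime_cache(model, prompt: Array) -> tuple[Array, dict]:
        """Encode a 1-D prompt under the cache-aware banded mask, as the
        updated generate() now does before its autoregressive scan."""
        cache = model.init_cache()
        mask = model.init_mask(prompt.shape[0], add_cache=True, batch_dim=False)
        encoded, cache = model.encode(prompt, cache=cache, mask=mask)
        return encoded, cache

generate() performs this priming internally and then runs its decoding loop through xax.utils.jax.scan, forwarding the new jit_level argument.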