xax 0.3.5__tar.gz → 0.3.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. {xax-0.3.5/xax.egg-info → xax-0.3.6}/PKG-INFO +1 -1
  2. {xax-0.3.5 → xax-0.3.6}/xax/__init__.py +4 -1
  3. {xax-0.3.5 → xax-0.3.6}/xax/nn/attention.py +108 -17
  4. {xax-0.3.5 → xax-0.3.6}/xax/utils/pytree.py +13 -0
  5. {xax-0.3.5 → xax-0.3.6/xax.egg-info}/PKG-INFO +1 -1
  6. {xax-0.3.5 → xax-0.3.6}/LICENSE +0 -0
  7. {xax-0.3.5 → xax-0.3.6}/MANIFEST.in +0 -0
  8. {xax-0.3.5 → xax-0.3.6}/README.md +0 -0
  9. {xax-0.3.5 → xax-0.3.6}/pyproject.toml +0 -0
  10. {xax-0.3.5 → xax-0.3.6}/setup.cfg +0 -0
  11. {xax-0.3.5 → xax-0.3.6}/setup.py +0 -0
  12. {xax-0.3.5 → xax-0.3.6}/xax/cli/__init__.py +0 -0
  13. {xax-0.3.5 → xax-0.3.6}/xax/cli/edit_config.py +0 -0
  14. {xax-0.3.5 → xax-0.3.6}/xax/core/__init__.py +0 -0
  15. {xax-0.3.5 → xax-0.3.6}/xax/core/conf.py +0 -0
  16. {xax-0.3.5 → xax-0.3.6}/xax/core/state.py +0 -0
  17. {xax-0.3.5 → xax-0.3.6}/xax/nn/__init__.py +0 -0
  18. {xax-0.3.5 → xax-0.3.6}/xax/nn/embeddings.py +0 -0
  19. {xax-0.3.5 → xax-0.3.6}/xax/nn/functions.py +0 -0
  20. {xax-0.3.5 → xax-0.3.6}/xax/nn/geom.py +0 -0
  21. {xax-0.3.5 → xax-0.3.6}/xax/nn/losses.py +0 -0
  22. {xax-0.3.5 → xax-0.3.6}/xax/nn/metrics.py +0 -0
  23. {xax-0.3.5 → xax-0.3.6}/xax/nn/parallel.py +0 -0
  24. {xax-0.3.5 → xax-0.3.6}/xax/nn/ssm.py +0 -0
  25. {xax-0.3.5 → xax-0.3.6}/xax/py.typed +0 -0
  26. {xax-0.3.5 → xax-0.3.6}/xax/requirements-dev.txt +0 -0
  27. {xax-0.3.5 → xax-0.3.6}/xax/requirements.txt +0 -0
  28. {xax-0.3.5 → xax-0.3.6}/xax/task/__init__.py +0 -0
  29. {xax-0.3.5 → xax-0.3.6}/xax/task/base.py +0 -0
  30. {xax-0.3.5 → xax-0.3.6}/xax/task/launchers/__init__.py +0 -0
  31. {xax-0.3.5 → xax-0.3.6}/xax/task/launchers/base.py +0 -0
  32. {xax-0.3.5 → xax-0.3.6}/xax/task/launchers/cli.py +0 -0
  33. {xax-0.3.5 → xax-0.3.6}/xax/task/launchers/single_process.py +0 -0
  34. {xax-0.3.5 → xax-0.3.6}/xax/task/logger.py +0 -0
  35. {xax-0.3.5 → xax-0.3.6}/xax/task/loggers/__init__.py +0 -0
  36. {xax-0.3.5 → xax-0.3.6}/xax/task/loggers/callback.py +0 -0
  37. {xax-0.3.5 → xax-0.3.6}/xax/task/loggers/json.py +0 -0
  38. {xax-0.3.5 → xax-0.3.6}/xax/task/loggers/state.py +0 -0
  39. {xax-0.3.5 → xax-0.3.6}/xax/task/loggers/stdout.py +0 -0
  40. {xax-0.3.5 → xax-0.3.6}/xax/task/loggers/tensorboard.py +0 -0
  41. {xax-0.3.5 → xax-0.3.6}/xax/task/mixins/__init__.py +0 -0
  42. {xax-0.3.5 → xax-0.3.6}/xax/task/mixins/artifacts.py +0 -0
  43. {xax-0.3.5 → xax-0.3.6}/xax/task/mixins/checkpointing.py +0 -0
  44. {xax-0.3.5 → xax-0.3.6}/xax/task/mixins/compile.py +0 -0
  45. {xax-0.3.5 → xax-0.3.6}/xax/task/mixins/cpu_stats.py +0 -0
  46. {xax-0.3.5 → xax-0.3.6}/xax/task/mixins/data_loader.py +0 -0
  47. {xax-0.3.5 → xax-0.3.6}/xax/task/mixins/gpu_stats.py +0 -0
  48. {xax-0.3.5 → xax-0.3.6}/xax/task/mixins/logger.py +0 -0
  49. {xax-0.3.5 → xax-0.3.6}/xax/task/mixins/process.py +0 -0
  50. {xax-0.3.5 → xax-0.3.6}/xax/task/mixins/runnable.py +0 -0
  51. {xax-0.3.5 → xax-0.3.6}/xax/task/mixins/step_wrapper.py +0 -0
  52. {xax-0.3.5 → xax-0.3.6}/xax/task/mixins/train.py +0 -0
  53. {xax-0.3.5 → xax-0.3.6}/xax/task/script.py +0 -0
  54. {xax-0.3.5 → xax-0.3.6}/xax/task/task.py +0 -0
  55. {xax-0.3.5 → xax-0.3.6}/xax/utils/__init__.py +0 -0
  56. {xax-0.3.5 → xax-0.3.6}/xax/utils/data/__init__.py +0 -0
  57. {xax-0.3.5 → xax-0.3.6}/xax/utils/data/collate.py +0 -0
  58. {xax-0.3.5 → xax-0.3.6}/xax/utils/debugging.py +0 -0
  59. {xax-0.3.5 → xax-0.3.6}/xax/utils/experiments.py +0 -0
  60. {xax-0.3.5 → xax-0.3.6}/xax/utils/jax.py +0 -0
  61. {xax-0.3.5 → xax-0.3.6}/xax/utils/jaxpr.py +0 -0
  62. {xax-0.3.5 → xax-0.3.6}/xax/utils/logging.py +0 -0
  63. {xax-0.3.5 → xax-0.3.6}/xax/utils/numpy.py +0 -0
  64. {xax-0.3.5 → xax-0.3.6}/xax/utils/profile.py +0 -0
  65. {xax-0.3.5 → xax-0.3.6}/xax/utils/tensorboard.py +0 -0
  66. {xax-0.3.5 → xax-0.3.6}/xax/utils/text.py +0 -0
  67. {xax-0.3.5 → xax-0.3.6}/xax/utils/types/__init__.py +0 -0
  68. {xax-0.3.5 → xax-0.3.6}/xax/utils/types/frozen_dict.py +0 -0
  69. {xax-0.3.5 → xax-0.3.6}/xax/utils/types/hashable_array.py +0 -0
  70. {xax-0.3.5 → xax-0.3.6}/xax.egg-info/SOURCES.txt +0 -0
  71. {xax-0.3.5 → xax-0.3.6}/xax.egg-info/dependency_links.txt +0 -0
  72. {xax-0.3.5 → xax-0.3.6}/xax.egg-info/entry_points.txt +0 -0
  73. {xax-0.3.5 → xax-0.3.6}/xax.egg-info/requires.txt +0 -0
  74. {xax-0.3.5 → xax-0.3.6}/xax.egg-info/top_level.txt +0 -0
{xax-0.3.5/xax.egg-info → xax-0.3.6}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xax
-Version: 0.3.5
+Version: 0.3.6
 Summary: A library for fast Jax experimentation
 Home-page: https://github.com/kscalelabs/xax
 Author: Benjamin Bolte
{xax-0.3.5 → xax-0.3.6}/xax/__init__.py

@@ -12,7 +12,7 @@ and running the update script:
 python -m scripts.update_api --inplace
 """
 
-__version__ = "0.3.5"
+__version__ = "0.3.6"
 
 # This list shouldn't be modified by hand; instead, run the update script.
 __all__ = [
@@ -136,6 +136,7 @@ __all__ = [
     "compute_nan_ratio",
     "flatten_array",
     "flatten_pytree",
+    "get_pytree_mapping",
     "get_pytree_param_count",
     "pytree_has_nans",
     "reshuffle_pytree",
@@ -323,6 +324,7 @@ NAME_MAP: dict[str, str] = {
     "compute_nan_ratio": "utils.pytree",
     "flatten_array": "utils.pytree",
     "flatten_pytree": "utils.pytree",
+    "get_pytree_mapping": "utils.pytree",
     "get_pytree_param_count": "utils.pytree",
     "pytree_has_nans": "utils.pytree",
     "reshuffle_pytree": "utils.pytree",
@@ -509,6 +511,7 @@ if IMPORT_ALL or TYPE_CHECKING:
         compute_nan_ratio,
         flatten_array,
         flatten_pytree,
+        get_pytree_mapping,
         get_pytree_param_count,
         pytree_has_nans,
         reshuffle_pytree,
{xax-0.3.5 → xax-0.3.6}/xax/nn/attention.py

@@ -212,16 +212,49 @@ class SelfAttentionBlock(eqx.Module):
 
         return {"k": k_cache, "v": v_cache, "position": 0}
 
+    def init_mask(
+        self,
+        seq_len: int,
+        add_cache: bool = False,
+        batch_dim: bool = False,
+    ) -> Array:
+        """Initialize the attention matrix mask.
+
+        Args:
+            seq_len: The length of the sequence
+            add_cache: Whether to add the cache to the mask
+            batch_dim: Whether to add a batch dimension to the mask
+
+        Returns:
+            The attention matrix mask of shape (bsz, 1, seq_len, seq_len + cache_len)
+            if batch_dim is True, otherwise (seq_len, seq_len + cache_len).
+        """
+        t, s, o = seq_len, seq_len, 0
+        if add_cache:
+            if self.local_window_size is None:
+                raise ValueError("local_window_size must be set for caching")
+            s += self.local_window_size
+            o -= self.local_window_size
+        mask = jnp.tril(jnp.ones((t, s), dtype=jnp.bool_), k=-o)
+        if self.local_window_size is not None:
+            neg_mask = ~jnp.tril(jnp.ones((t, s), dtype=jnp.bool_), k=-(self.local_window_size + 1 + o))
+            mask = mask & neg_mask
+        mask = mask.reshape(1, 1, t, s) if batch_dim else mask.reshape(t, s)
+        return mask
+
     def forward(
         self,
         x_tn: Array,
         *,
+        mask: Array | None = None,
         cache: AttentionCache | None = None,
     ) -> tuple[Array, AttentionCache]:
         """Apply self-attention.
 
         Args:
             x_tn: Input tensor of shape (seq_len, embed_dim)
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: The cached key and value tensors (fixed-length)
 
         Returns:
@@ -256,26 +289,28 @@ class SelfAttentionBlock(eqx.Module):
             k = jnp.concatenate([k_cache, k], axis=0)
             v = jnp.concatenate([v_cache, v], axis=0)
 
-            # Pads query with `k_cache.shape[0]` zeros.
-            q = jnp.pad(q, ((k_cache.shape[0], 0), (0, 0), (0, 0)), mode="constant", constant_values=0)
-
             new_position = cache["position"] + seq_len
 
         else:
             new_position = seq_len
 
-        attn_output = jax.nn.dot_product_attention(
-            q,
-            k,
-            v,
-            scale=1.0 / math.sqrt(self.head_dim),
-            is_causal=self.causal,
-            local_window_size=(self.local_window_size, 0) if self.local_window_size is not None else None,
-        )
+        if seq_len == 1:
+            attn_output = jax.nn.dot_product_attention(q, k, v)
 
-        if cache is not None:
-            # Remove the padding.
-            attn_output = attn_output[cache["k"].shape[0] :]
+        elif mask is not None:
+            attn_output = jax.nn.dot_product_attention(q, k, v, mask=mask)
+
+        elif cache is not None:
+            raise NotImplementedError("For training with a cache, provide a mask instead.")
+
+        else:
+            attn_output = jax.nn.dot_product_attention(
+                q,
+                k,
+                v,
+                is_causal=self.causal,
+                local_window_size=(self.local_window_size, 0) if self.local_window_size is not None else None,
+            )
 
         attn_output = self._combine_heads(attn_output)
         output = jax.vmap(self.output_proj)(attn_output)
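
Note: the mask returned by the new SelfAttentionBlock.init_mask is a banded causal matrix whose extra leading columns (when add_cache=True) line up with the fixed-length KV cache. The snippet below is a minimal standalone sketch that mirrors that logic so the pattern can be inspected without constructing a SelfAttentionBlock; the function name banded_causal_mask is illustrative and not part of xax.

    import jax.numpy as jnp

    def banded_causal_mask(seq_len: int, local_window_size: int | None, add_cache: bool = False) -> jnp.ndarray:
        # Rows are query positions, columns are key positions; when add_cache is
        # True the first local_window_size columns correspond to cache slots.
        t, s, o = seq_len, seq_len, 0
        if add_cache:
            assert local_window_size is not None, "caching requires a local window"
            s += local_window_size
            o -= local_window_size
        mask = jnp.tril(jnp.ones((t, s), dtype=jnp.bool_), k=-o)
        if local_window_size is not None:
            # Drop keys older than the local window, matching the diff above.
            mask = mask & ~jnp.tril(jnp.ones((t, s), dtype=jnp.bool_), k=-(local_window_size + 1 + o))
        return mask

    print(banded_causal_mask(4, local_window_size=2).astype(jnp.int32))
    # [[1 0 0 0]
    #  [1 1 0 0]
    #  [1 1 1 0]
    #  [0 1 1 1]]
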
@@ -403,6 +438,7 @@ class CrossAttentionBlock(eqx.Module):
             q_rot,
             k_rot,
             v,
+            scale=1.0 / math.sqrt(self.head_dim),
             is_causal=False,
         )
 
@@ -498,11 +534,24 @@ class TransformerBlock(eqx.Module):
             cache["cross_attn"] = self.cross_attn.init_cache(kv_sn=context_sn)
         return cache
 
+    def init_mask(
+        self,
+        seq_len: int,
+        add_cache: bool = False,
+        batch_dim: bool = False,
+    ) -> Array:
+        return self.self_attn.init_mask(
+            seq_len,
+            add_cache=add_cache,
+            batch_dim=batch_dim,
+        )
+
     def forward(
         self,
         x_tn: Array,
         *,
         context_sn: Array | None = None,
+        mask: Array | None = None,
         cache: AttentionCacheDict | None = None,
     ) -> tuple[Array, AttentionCacheDict]:
         """Apply transformer block.
@@ -510,6 +559,8 @@ class TransformerBlock(eqx.Module):
         Args:
             x_tn: Input tensor of shape (seq_len, embed_dim)
             context_sn: Optional context for cross-attention
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -522,6 +573,7 @@ class TransformerBlock(eqx.Module):
 
         attn_output, self_attn_cache = self.self_attn.forward(
             x_tn=norm_x,
+            mask=mask,
             cache=None if cache is None else cache["self_attn"],
         )
         updated_cache: AttentionCacheDict = {"self_attn": self_attn_cache}
@@ -598,11 +650,24 @@ class TransformerStack(eqx.Module):
             cache[f"layer_{i}"] = layer.init_cache(dtype=dtype, context_sn=x_tn)
         return {"layers": cache}
 
+    def init_mask(
+        self,
+        seq_len: int,
+        add_cache: bool = False,
+        batch_dim: bool = False,
+    ) -> Array:
+        return self.layers[0].init_mask(
+            seq_len,
+            add_cache=add_cache,
+            batch_dim=batch_dim,
+        )
+
     def forward(
         self,
         x_tn: Array,
         *,
         context_sn: Array | None = None,
+        mask: Array | None = None,
         cache: TransformerCache | None = None,
     ) -> tuple[Array, TransformerCache]:
         """Apply transformer stack.
@@ -610,6 +675,8 @@ class TransformerStack(eqx.Module):
         Args:
             x_tn: Input tensor of shape (seq_len, embed_dim)
             context_sn: Optional context for cross-attention
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -629,6 +696,7 @@ class TransformerStack(eqx.Module):
             x_tn, updated_cache["layers"][f"layer_{i}"] = layer.forward(
                 x_tn,
                 context_sn=context_sn,
+                mask=mask,
                 cache=layer_cache,
             )
 
@@ -693,16 +761,31 @@ class Transformer(eqx.Module):
         """Initialize cache for the input."""
         return self.layers.init_cache(dtype=dtype, x_tn=x_tn)
 
+    def init_mask(
+        self,
+        seq_len: int,
+        add_cache: bool = False,
+        batch_dim: bool = False,
+    ) -> Array:
+        return self.layers.init_mask(
+            seq_len,
+            add_cache=add_cache,
+            batch_dim=batch_dim,
+        )
+
     def encode(
         self,
         x: Array,
         *,
+        mask: Array | None = None,
         cache: TransformerCache | None = None,
     ) -> tuple[Array, TransformerCache]:
         """Encode the input sequence.
 
         Args:
             x: Input token indices of shape (seq_len)
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -712,7 +795,7 @@ class Transformer(eqx.Module):
         x_embedded = jax.vmap(self.token_embedding)(x)
 
         # Apply transformer stack
-        x_embedded, updated_cache = self.layers.forward(x_embedded, cache=cache)
+        x_embedded, updated_cache = self.layers.forward(x_embedded, mask=mask, cache=cache)
 
         # Apply final layer norm
         output = jax.vmap(self.layer_norm)(x_embedded)
@@ -724,6 +807,7 @@ class Transformer(eqx.Module):
         x_t: Array,
         context_s: Array,
         *,
+        mask: Array | None = None,
         cache: TransformerCache | None = None,
     ) -> tuple[Array, TransformerCache]:
         """Decode with self-attention and cross-attention.
@@ -732,6 +816,8 @@ class Transformer(eqx.Module):
             x_t: Input token indices, shape (seq_len)
             context_s: Context from encoder (token indices or embedded),
                 shape (context_len, embed_dim)
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -747,6 +833,7 @@ class Transformer(eqx.Module):
         x_embedded, updated_cache = self.layers.forward(
             x_embedded,
             context_sn=context_embedded,
+            mask=mask,
            cache=cache,
         )
 
@@ -759,12 +846,15 @@ class Transformer(eqx.Module):
         self,
         x: Array,
         *,
+        mask: Array | None = None,
         cache: TransformerCache | None = None,
     ) -> tuple[Array, TransformerCache]:
         """Forward pass for encoder-only or decoder-only transformers.
 
         Args:
             x: Input token indices of shape (seq_len)
+            mask: Optional mask of shape (batch_size, num_heads, seq_len,
+                seq_len + cache_len)
             cache: Optional dictionary containing cached key and value tensors
 
         Returns:
@@ -772,7 +862,7 @@ class Transformer(eqx.Module):
         """
         chex.assert_rank(x, 1)
 
-        output, updated_cache = self.encode(x, cache=cache)
+        output, updated_cache = self.encode(x, mask=mask, cache=cache)
 
         # Apply output layer if it exists
         if self.output_layer is not None:
@@ -817,7 +907,8 @@ class Transformer(eqx.Module):
 
         # Initialize cache with prompt
         cache = self.init_cache()
-        _, cache = self.encode(prompt_seq, cache=cache)
+        mask = self.init_mask(prompt_len, add_cache=True, batch_dim=False)
+        _, cache = self.encode(prompt_seq, cache=cache, mask=mask)
 
         # Define scan function for autoregressive generation
         def scan_fn(
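
Note: a hedged sketch of the prefill step that the updated generate method performs, written against only names visible in this diff (init_cache, init_mask, encode, and the types annotated in xax/nn/attention.py); the model is assumed to be an already-constructed Transformer with local_window_size set, which caching requires.

    import jax.numpy as jnp

    from xax.nn.attention import Transformer, TransformerCache

    def prefill(model: Transformer, prompt: jnp.ndarray) -> TransformerCache:
        # Run the whole prompt through the model once, returning a primed KV cache.
        prompt_len = prompt.shape[0]
        cache = model.init_cache()
        # Mask shape (prompt_len, prompt_len + local_window_size): the extra leading
        # columns correspond to the fixed-length cache prepended to keys/values.
        mask = model.init_mask(prompt_len, add_cache=True, batch_dim=False)
        _, cache = model.encode(prompt, mask=mask, cache=cache)
        return cache

Subsequent single-token decode steps take the new seq_len == 1 fast path in SelfAttentionBlock.forward and need no mask; passing a cache with a longer sequence and no mask now raises NotImplementedError, so training-style runs over a cache must supply one.
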
{xax-0.3.5 → xax-0.3.6}/xax/utils/pytree.py

@@ -253,3 +253,16 @@ def tuple_insert(t: tuple[T, ...], index: int, value: T) -> tuple[T, ...]:
     mut = list(t)
     mut[index] = value
     return tuple(mut)
+
+
+def get_pytree_mapping(pytree: PyTree) -> dict[str, Array]:
+    leaves: dict[str, Array] = {}
+
+    def _get_leaf(path: tuple, x: PyTree) -> None:
+        if isinstance(x, jnp.ndarray):
+            # Convert path tuple to string, e.g. (1, 'a', 2) -> '1/a/2'
+            path_str = "/".join(str(p) for p in path)
+            leaves[path_str] = x
+
+    jax.tree.map_with_path(_get_leaf, pytree)
+    return leaves
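
Note: an illustrative use of the new get_pytree_mapping helper, which is exported at the top level per the __init__.py changes above (the parameter pytree here is made up). It flattens a nested pytree into a flat {path: array} dict keyed by the stringified tree path, which is handy for logging per-leaf statistics.

    import jax.numpy as jnp

    import xax

    params = {"encoder": {"w": jnp.ones((2, 3)), "b": jnp.zeros(3)}, "scale": jnp.array(1.0)}
    mapping = xax.get_pytree_mapping(params)
    for path, leaf in sorted(mapping.items()):
        # Each key is the str() of every entry along the tree path, joined by "/".
        print(path, leaf.shape, float(jnp.abs(leaf).mean()))
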
{xax-0.3.5 → xax-0.3.6/xax.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xax
-Version: 0.3.5
+Version: 0.3.6
 Summary: A library for fast Jax experimentation
 Home-page: https://github.com/kscalelabs/xax
 Author: Benjamin Bolte