xax 0.2.22__py3-none-any.whl → 0.2.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xax/__init__.py +10 -1
- xax/core/state.py +10 -37
- xax/nn/attention.py +738 -0
- xax/task/logger.py +1 -1
- xax/task/mixins/train.py +10 -16
- xax/utils/experiments.py +2 -2
- {xax-0.2.22.dist-info → xax-0.2.23.dist-info}/METADATA +1 -1
- {xax-0.2.22.dist-info → xax-0.2.23.dist-info}/RECORD +12 -11
- {xax-0.2.22.dist-info → xax-0.2.23.dist-info}/WHEEL +1 -1
- {xax-0.2.22.dist-info → xax-0.2.23.dist-info}/entry_points.txt +0 -0
- {xax-0.2.22.dist-info → xax-0.2.23.dist-info}/licenses/LICENSE +0 -0
- {xax-0.2.22.dist-info → xax-0.2.23.dist-info}/top_level.txt +0 -0
xax/__init__.py CHANGED
@@ -12,7 +12,7 @@ and running the update script:
     python -m scripts.update_api --inplace
 """
 
-__version__ = "0.2.22"
+__version__ = "0.2.23"
 
 # This list shouldn't be modified by hand; instead, run the update script.
 __all__ = [
@@ -23,6 +23,10 @@ __all__ = [
     "get_run_dir",
     "load_user_config",
     "State",
+    "CrossAttentionBlock",
+    "SelfAttentionBlock",
+    "Transformer",
+    "TransformerBlock",
     "FourierEmbeddings",
     "IdentityPositionalEmbeddings",
     "LearnedPositionalEmbeddings",
@@ -200,6 +204,10 @@ NAME_MAP: dict[str, str] = {
     "get_run_dir": "core.conf",
     "load_user_config": "core.conf",
     "State": "core.state",
+    "CrossAttentionBlock": "nn.attention",
+    "SelfAttentionBlock": "nn.attention",
+    "Transformer": "nn.attention",
+    "TransformerBlock": "nn.attention",
     "FourierEmbeddings": "nn.embeddings",
     "IdentityPositionalEmbeddings": "nn.embeddings",
     "LearnedPositionalEmbeddings": "nn.embeddings",
@@ -370,6 +378,7 @@ if IMPORT_ALL or TYPE_CHECKING:
         load_user_config,
     )
     from xax.core.state import Phase, State
+    from xax.nn.attention import CrossAttentionBlock, SelfAttentionBlock, Transformer, TransformerBlock
     from xax.nn.embeddings import (
         EmbeddingKind,
         FourierEmbeddings,
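
The four attention classes are re-exported at the top level alongside the existing symbols, so they can be pulled straight from xax rather than from xax.nn.attention. A minimal sketch, assuming the lazy-import machinery behind __all__ and NAME_MAP resolves top-level names as in previous releases; the dimensions are illustrative:

    import jax
    import jax.numpy as jnp
    from xax import SelfAttentionBlock  # resolved via NAME_MAP to "nn.attention"

    block = SelfAttentionBlock(embed_dim=64, num_heads=4, key=jax.random.PRNGKey(0), causal=True)
    x = jnp.ones((8, 64))   # (seq_len, embed_dim)
    y = block(x)            # (8, 64)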
xax/core/state.py CHANGED
@@ -27,11 +27,8 @@ def _int_to_phase(i: int) -> Phase:
 class StateDict(TypedDict, total=False):
     num_steps: NotRequired[int | Array]
     num_samples: NotRequired[int | Array]
-    num_valid_steps: NotRequired[int | Array]
-    num_valid_samples: NotRequired[int | Array]
     start_time_s: NotRequired[float | Array]
     elapsed_time_s: NotRequired[float | Array]
-    valid_elapsed_time_s: NotRequired[float | Array]
     phase: NotRequired[Phase]
     _phase: NotRequired[int | Array]
 
@@ -47,38 +44,26 @@ class State:
         return self._int32_arr[0]
 
     @property
-    def num_valid_steps(self) -> Array:
-        return self._int32_arr[1]
+    def phase(self) -> Phase:
+        return _int_to_phase(self._int32_arr[1].item())
 
     @property
     def num_samples(self) -> Array:
         return self._float32_arr[0]
 
-    @property
-    def num_valid_samples(self) -> Array:
-        return self._float32_arr[1]
-
     @property
     def start_time_s(self) -> Array:
-        return self._float32_arr[2]
+        return self._float32_arr[1]
 
     @property
     def elapsed_time_s(self) -> Array:
-        return self._float32_arr[3]
-
-    @property
-    def valid_elapsed_time_s(self) -> Array:
-        return self._float32_arr[4]
-
-    @property
-    def phase(self) -> Phase:
-        return _int_to_phase(self._int32_arr[2].item())
+        return self._float32_arr[2]
 
     @classmethod
     def init_state(cls) -> "State":
         return cls(
-            _int32_arr=jnp.array([0, 0, 0], dtype=jnp.int32),
-            _float32_arr=jnp.array([0.0, 0.0, time.time(), 0.0, 0.0], dtype=jnp.float32),
+            _int32_arr=jnp.array([0, 0], dtype=jnp.int32),
+            _float32_arr=jnp.array([0.0, time.time(), 0.0], dtype=jnp.float32),
         )
 
     @property
@@ -91,25 +76,19 @@ class State:
 
         if "num_steps" in kwargs:
             int32_arr = int32_arr.at[0].set(kwargs["num_steps"])
-        if "num_valid_steps" in kwargs:
-            int32_arr = int32_arr.at[1].set(kwargs["num_valid_steps"])
 
         if "phase" in kwargs:
-            int32_arr = int32_arr.at[2].set(_phase_to_int(kwargs["phase"]))
+            int32_arr = int32_arr.at[1].set(_phase_to_int(kwargs["phase"]))
         if "_phase" in kwargs:
-            int32_arr = int32_arr.at[2].set(kwargs["_phase"])
+            int32_arr = int32_arr.at[1].set(kwargs["_phase"])
 
         if "num_samples" in kwargs:
             float32_arr = float32_arr.at[0].set(kwargs["num_samples"])
-        if "num_valid_samples" in kwargs:
-            float32_arr = float32_arr.at[1].set(kwargs["num_valid_samples"])
 
         if "start_time_s" in kwargs:
-            float32_arr = float32_arr.at[2].set(kwargs["start_time_s"])
+            float32_arr = float32_arr.at[1].set(kwargs["start_time_s"])
         if "elapsed_time_s" in kwargs:
-            float32_arr = float32_arr.at[3].set(kwargs["elapsed_time_s"])
-        if "valid_elapsed_time_s" in kwargs:
-            float32_arr = float32_arr.at[4].set(kwargs["valid_elapsed_time_s"])
+            float32_arr = float32_arr.at[2].set(kwargs["elapsed_time_s"])
 
         return State(
             _int32_arr=int32_arr,
@@ -119,12 +98,9 @@ class State:
     def to_dict(self) -> dict[str, int | float | str]:
         return {
             "num_steps": int(self.num_steps.item()),
-            "num_valid_steps": int(self.num_valid_steps.item()),
             "num_samples": int(self.num_samples.item()),
-            "num_valid_samples": int(self.num_valid_samples.item()),
             "start_time_s": float(self.start_time_s.item()),
             "elapsed_time_s": float(self.elapsed_time_s.item()),
-            "valid_elapsed_time_s": float(self.valid_elapsed_time_s.item()),
             "phase": str(self.phase),
         }
 
@@ -136,7 +112,6 @@ class State:
         int32_arr = jnp.array(
             [
                 d.get("num_steps", 0),
-                d.get("num_valid_steps", 0),
                 d.get("_phase", 0),
             ],
             dtype=jnp.int32,
@@ -145,10 +120,8 @@ class State:
         float32_arr = jnp.array(
             [
                 d.get("num_samples", 0),
-                d.get("num_valid_samples", 0),
                 d.get("start_time_s", time.time()),
                 d.get("elapsed_time_s", 0.0),
-                d.get("valid_elapsed_time_s", 0.0),
             ],
             dtype=jnp.float32,
        )
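
After this change, State packs two int32 slots (num_steps and the encoded phase) and three float32 slots (num_samples, start_time_s, elapsed_time_s); the separate num_valid_* and valid_elapsed_time_s fields are gone. A minimal sketch of the resulting layout, assuming replace() and the "valid" phase literal behave as in the hunks above; values are illustrative:

    from xax.core.state import State

    state = State.init_state()   # _int32_arr=[0, 0], _float32_arr=[0.0, time.time(), 0.0]
    state = state.replace(num_steps=10, num_samples=320, phase="valid")

    state.num_steps        # _int32_arr[0]
    state.phase            # decoded from _int32_arr[1]
    state.num_samples      # _float32_arr[0]
    state.start_time_s     # _float32_arr[1]
    state.elapsed_time_s   # _float32_arr[2]
    state.to_dict()        # plain ints/floats/str, without the num_valid_* keys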
xax/nn/attention.py ADDED
@@ -0,0 +1,738 @@
+"""Attention mechanisms for transformer models."""
+
+from typing import Literal, cast, overload
+
+import chex
+import equinox as eqx
+import jax
+import jax.numpy as jnp
+from jaxtyping import Array, PRNGKeyArray
+
+
+class SelfAttentionBlock(eqx.Module):
+    """Self-attention block using jax.nn.dot_product_attention."""
+
+    q_proj: eqx.nn.Linear
+    k_proj: eqx.nn.Linear
+    v_proj: eqx.nn.Linear
+    output_proj: eqx.nn.Linear
+    num_heads: int = eqx.static_field()
+    head_dim: int = eqx.static_field()
+    causal: bool = eqx.static_field()
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        *,
+        key: PRNGKeyArray,
+        causal: bool = False,
+    ) -> None:
+        keys = jax.random.split(key, 4)
+
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
+
+        self.q_proj = eqx.nn.Linear(embed_dim, embed_dim, key=keys[0])
+        self.k_proj = eqx.nn.Linear(embed_dim, embed_dim, key=keys[1])
+        self.v_proj = eqx.nn.Linear(embed_dim, embed_dim, key=keys[2])
+        self.output_proj = eqx.nn.Linear(embed_dim, embed_dim, key=keys[3])
+
+        self.causal = causal
+
+    def _reshape_for_multihead(self, x: Array) -> Array:
+        """Reshape from (seq_len, embed_dim) to (seq_len, num_heads, head_dim)."""
+        seq_len, _ = x.shape
+        return x.reshape(seq_len, self.num_heads, self.head_dim)
+
+    def _combine_heads(self, x: Array) -> Array:
+        """Reshape from (seq_len, num_heads, head_dim) to (seq_len, embed_dim)."""
+        seq_len, _, _ = x.shape
+        return x.reshape(seq_len, -1)
+
+    def __call__(
+        self,
+        x: Array,
+        *,
+        key: PRNGKeyArray | None = None,
+        mask: Array | None = None,
+        cache: dict[str, Array] | None = None,
+        update_cache: bool = False,
+    ) -> Array | tuple[Array, dict[str, Array]]:
+        """Apply self-attention to the input.
+
+        Args:
+            x: Input tensor of shape (seq_len, embed_dim)
+            key: PRNGKey for dropout randomness
+            mask: Optional mask tensor of shape (seq_len, seq_len) or broadcastable
+            cache: Optional dictionary containing cached key and value tensors
+            update_cache: Whether to update the cache and return it
+
+        Returns:
+            If update_cache is False: Output tensor of shape (seq_len, embed_dim)
+            If update_cache is True: Tuple of (output tensor, updated cache)
+        """
+        chex.assert_rank(x, 2)
+
+        # Project inputs to queries, keys, and values
+        q = jax.vmap(self.q_proj)(x)
+
+        # Use cached key/value if provided and not updating cache
+        if cache is not None and not update_cache:
+            k = cache["k"]
+            v = cache["v"]
+        else:
+            k = jax.vmap(self.k_proj)(x)
+            v = jax.vmap(self.v_proj)(x)
+
+        # Update cache if needed
+        if update_cache:
+            if cache is None:
+                cache = {}
+            cache = {"k": k, "v": v}
+
+        # Reshape to multihead format
+        q = self._reshape_for_multihead(q)
+        k = self._reshape_for_multihead(k)
+        v = self._reshape_for_multihead(v)
+
+        # Apply dot product attention.
+        # Note that Apple Silicon struggles with this:
+        # https://github.com/jax-ml/jax/issues/20114
+        attn_output = jax.nn.dot_product_attention(
+            q,
+            k,
+            v,
+            mask=mask,
+            is_causal=self.causal and mask is None,
+        )
+
+        # Combine heads
+        attn_output = self._combine_heads(attn_output)
+
+        # Final projection
+        output = jax.vmap(self.output_proj)(attn_output)
+
+        if update_cache:
+            return output, cast(dict[str, Array], cache)
+        return output
+
+
+class CrossAttentionBlock(eqx.Module):
+    """Cross-attention block using jax.nn.dot_product_attention."""
+
+    q_proj: eqx.nn.Linear
+    k_proj: eqx.nn.Linear
+    v_proj: eqx.nn.Linear
+    output_proj: eqx.nn.Linear
+    num_heads: int = eqx.static_field()
+    head_dim: int = eqx.static_field()
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        *,
+        key: PRNGKeyArray,
+    ) -> None:
+        keys = jax.random.split(key, 4)
+
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
+
+        self.q_proj = eqx.nn.Linear(embed_dim, embed_dim, key=keys[0])
+        self.k_proj = eqx.nn.Linear(embed_dim, embed_dim, key=keys[1])
+        self.v_proj = eqx.nn.Linear(embed_dim, embed_dim, key=keys[2])
+        self.output_proj = eqx.nn.Linear(embed_dim, embed_dim, key=keys[3])
+
+    def _reshape_for_multihead(self, x: Array) -> Array:
+        """Reshape from (seq_len, embed_dim) to (seq_len, num_heads, head_dim)."""
+        seq_len, _ = x.shape
+        return x.reshape(seq_len, self.num_heads, self.head_dim)
+
+    def _combine_heads(self, x: Array) -> Array:
+        """Reshape from (seq_len, num_heads, head_dim) to (seq_len, embed_dim)."""
+        seq_len, _, _ = x.shape
+        return x.reshape(seq_len, -1)
+
+    def __call__(
+        self,
+        q_input: Array,
+        kv_input: Array,
+        *,
+        key: PRNGKeyArray | None = None,
+        mask: Array | None = None,
+        cache: dict[str, Array] | None = None,
+        update_cache: bool = False,
+    ) -> Array | tuple[Array, dict[str, Array]]:
+        """Apply cross-attention.
+
+        Args:
+            q_input: Query input tensor of shape (q_seq_len, embed_dim)
+            kv_input: Key/value input tensor of shape (kv_seq_len, embed_dim)
+            key: PRNGKey for dropout randomness
+            mask: Optional mask tensor
+            cache: Optional dictionary containing cached key and value tensors
+            update_cache: Whether to update the cache and return it
+
+        Returns:
+            If update_cache is False: Output tensor of shape (q_seq_len, embed_dim)
+            If update_cache is True: Tuple of (output tensor, updated cache)
+        """
+        chex.assert_rank(q_input, 2)
+        chex.assert_rank(kv_input, 2)
+
+        # Project inputs to queries, keys, and values
+        q = jax.vmap(self.q_proj)(q_input)
+
+        # Use cached key/value if provided and not updating cache
+        if cache is not None and not update_cache:
+            k = cache["k"]
+            v = cache["v"]
+        else:
+            k = jax.vmap(self.k_proj)(kv_input)
+            v = jax.vmap(self.v_proj)(kv_input)
+
+        # Update cache if needed
+        if update_cache:
+            if cache is None:
+                cache = {}
+            cache = {"k": k, "v": v}
+
+        # Reshape to multihead format
+        q = self._reshape_for_multihead(q)
+        k = self._reshape_for_multihead(k)
+        v = self._reshape_for_multihead(v)
+
+        # Apply dot product attention
+        attn_output = jax.nn.dot_product_attention(
+            q,
+            k,
+            v,
+            mask=mask,
+            is_causal=False,
+        )
+
+        # Combine heads
+        attn_output = self._combine_heads(attn_output)
+
+        # Final projection
+        output = jax.vmap(self.output_proj)(attn_output)
+
+        if update_cache:
+            return output, cast(dict[str, Array], cache)
+        return output
+
+
+class TransformerBlock(eqx.Module):
+    self_attn: SelfAttentionBlock
+    cross_attn: CrossAttentionBlock | None
+    feed_forward: eqx.nn.MLP
+    layer_norm1: eqx.nn.LayerNorm
+    layer_norm2: eqx.nn.LayerNorm
+    layer_norm3: eqx.nn.LayerNorm | None
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        ff_dim: int,
+        *,
+        key: PRNGKeyArray,
+        causal: bool = False,
+        cross_attention: bool = False,
+    ) -> None:
+        keys = jax.random.split(key, 4)
+
+        self.self_attn = SelfAttentionBlock(
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            key=keys[0],
+            causal=causal,
+        )
+
+        if cross_attention:
+            self.cross_attn = CrossAttentionBlock(
+                embed_dim=embed_dim,
+                num_heads=num_heads,
+                key=keys[1],
+            )
+            self.layer_norm3 = eqx.nn.LayerNorm(embed_dim)
+        else:
+            self.cross_attn = None
+            self.layer_norm3 = None
+
+        self.layer_norm1 = eqx.nn.LayerNorm(embed_dim)
+        self.layer_norm2 = eqx.nn.LayerNorm(embed_dim)
+
+        self.feed_forward = eqx.nn.MLP(
+            in_size=embed_dim,
+            out_size=embed_dim,
+            width_size=ff_dim,
+            depth=1,
+            activation=jax.nn.gelu,
+            key=keys[2],
+        )
+
+    @overload
+    def __call__(
+        self,
+        x: Array,
+        *,
+        context: Array | None = None,
+        self_mask: Array | None = None,
+        cross_mask: Array | None = None,
+        key: PRNGKeyArray | None = None,
+        cache: dict[str, dict[str, Array]] | None = None,
+        update_cache: Literal[True],
+    ) -> tuple[Array, dict[str, dict[str, Array]]]: ...
+
+    @overload
+    def __call__(
+        self,
+        x: Array,
+        *,
+        context: Array | None = None,
+        self_mask: Array | None = None,
+        cross_mask: Array | None = None,
+        key: PRNGKeyArray | None = None,
+        cache: dict[str, dict[str, Array]] | None = None,
+        update_cache: Literal[False] = False,
+    ) -> Array: ...
+
+    def __call__(
+        self,
+        x: Array,
+        *,
+        context: Array | None = None,
+        self_mask: Array | None = None,
+        cross_mask: Array | None = None,
+        key: PRNGKeyArray | None = None,
+        cache: dict[str, dict[str, Array]] | None = None,
+        update_cache: bool = False,
+    ) -> Array | tuple[Array, dict[str, dict[str, Array]]]:
+        """Apply transformer block.
+
+        Args:
+            x: Input tensor
+            context: Optional context for cross-attention
+            self_mask: Mask for self-attention
+            cross_mask: Mask for cross-attention
+            key: Optional PRNG key for dropout
+            cache: Optional dictionary containing cached key and value tensors
+            update_cache: Whether to update the cache and return it
+
+        Returns:
+            If update_cache is False: Output tensor
+            If update_cache is True: Tuple of (output tensor, updated cache)
+        """
+        chex.assert_rank(x, 2)
+        if key is not None:
+            key1, key2 = jax.random.split(key)
+        else:
+            key1 = key2 = None
+
+        # Initialize cache if needed
+        updated_cache = {}
+        if cache is None:
+            cache = {}
+
+        # Self-attention block with pre-norm
+        norm_x = jax.vmap(self.layer_norm1)(x)
+
+        self_attn_cache = cache.get("self_attn")
+        if update_cache:
+            attn_output, self_attn_cache = self.self_attn(
+                norm_x, key=key1, mask=self_mask, cache=self_attn_cache, update_cache=True
+            )
+            updated_cache["self_attn"] = self_attn_cache
+        else:
+            attn_output = self.self_attn(norm_x, key=key1, mask=self_mask, cache=self_attn_cache)
+
+        x = x + attn_output
+
+        # Cross-attention block (if enabled) with pre-norm
+        if self.cross_attn is not None and context is not None:
+            assert self.layer_norm3 is not None
+
+            norm_x = jax.vmap(self.layer_norm3)(x)
+            cross_attn_cache = cache.get("cross_attn")
+
+            if update_cache:
+                cross_attn_output, cross_attn_cache = self.cross_attn(
+                    norm_x, context, key=key2, mask=cross_mask, cache=cross_attn_cache, update_cache=True
+                )
+                updated_cache["cross_attn"] = cross_attn_cache
+            else:
+                cross_attn_output = self.cross_attn(norm_x, context, key=key2, mask=cross_mask, cache=cross_attn_cache)
+
+            x = x + cross_attn_output
+
+        # Feed-forward block with pre-norm
+        norm_x = jax.vmap(self.layer_norm2)(x)
+        ff_output = jax.vmap(self.feed_forward)(norm_x)
+        x = x + ff_output
+
+        if update_cache:
+            return x, updated_cache
+        return x
+
+
+class Transformer(eqx.Module):
+    token_embedding: eqx.nn.Embedding
+    position_embedding: eqx.nn.Embedding | None
+    layers: list[TransformerBlock]
+    output_layer: eqx.nn.Linear | None
+    layer_norm: eqx.nn.LayerNorm
+    max_seq_len: int = eqx.static_field()
+    embed_dim: int = eqx.static_field()
+
+    def __init__(
+        self,
+        vocab_size: int,
+        embed_dim: int,
+        num_heads: int,
+        ff_dim: int,
+        num_layers: int,
+        max_seq_len: int,
+        output_size: int | None = None,
+        *,
+        key: PRNGKeyArray,
+        causal: bool = False,
+        cross_attention: bool = False,
+        use_absolute_position: bool = True,
+    ) -> None:
+        keys = jax.random.split(key, num_layers + 3)
+
+        self.token_embedding = eqx.nn.Embedding(vocab_size, embed_dim, key=keys[0])
+
+        # Position embeddings can be disabled
+        if use_absolute_position:
+            self.position_embedding = eqx.nn.Embedding(max_seq_len, embed_dim, key=keys[1])
+        else:
+            self.position_embedding = None
+
+        self.layers = [
+            TransformerBlock(
+                embed_dim=embed_dim,
+                num_heads=num_heads,
+                ff_dim=ff_dim,
+                key=keys[i + 2],
+                causal=causal,
+                cross_attention=cross_attention,
+            )
+            for i in range(num_layers)
+        ]
+
+        self.layer_norm = eqx.nn.LayerNorm(embed_dim)
+
+        if output_size is not None:
+            self.output_layer = eqx.nn.Linear(embed_dim, output_size, key=keys[-1])
+        else:
+            self.output_layer = None
+
+        self.max_seq_len = max_seq_len
+        self.embed_dim = embed_dim
+
+    def _add_positional_embedding(self, x_embedded: Array, positions: Array | None = None) -> Array:
+        """Add positional embeddings to the token embeddings."""
+        if self.position_embedding is None:
+            return x_embedded
+
+        seq_len, _ = x_embedded.shape
+
+        if positions is None:
+            positions = jnp.arange(seq_len)
+        pos_embedded = jax.vmap(self.position_embedding)(positions)
+
+        return x_embedded + pos_embedded
+
+    def encode(
+        self,
+        x: Array,
+        mask: Array | None = None,
+        positions: Array | None = None,
+        key: PRNGKeyArray | None = None,
+        cache: dict[str, dict[str, dict[str, Array]]] | None = None,
+        update_cache: bool = False,
+    ) -> Array | tuple[Array, dict[str, dict[str, dict[str, Array]]]]:
+        """Encode the input sequence.
+
+        Args:
+            x: Input token indices of shape (seq_len)
+            mask: Optional attention mask
+            positions: Optional positions
+            key: Optional PRNG key for dropout
+            cache: Optional dictionary containing cached key and value tensors
+            update_cache: Whether to update the cache and return it
+
+        Returns:
+            If update_cache is False: Encoded representation
+            If update_cache is True: Tuple of (encoded representation, updated cache)
+        """
+        # Token embedding
+        x_embedded = jax.vmap(self.token_embedding)(x)
+
+        # Add positional embedding
+        x_embedded = self._add_positional_embedding(x_embedded, positions)
+
+        # Initialize layer caches
+        if cache is None and update_cache:
+            cache = {f"layer_{i}": {} for i in range(len(self.layers))}
+
+        # Updated cache will be built if needed
+        updated_cache = {}
+
+        # Apply transformer layers
+        keys: Array | list[None] = [None] * len(self.layers)
+        if key is not None:
+            keys = jax.random.split(key, len(self.layers))
+
+        for i, (layer, layer_key) in enumerate(zip(self.layers, keys, strict=False)):
+            layer_cache = None if cache is None else cache.get(f"layer_{i}")
+
+            if update_cache:
+                x_embedded, layer_updated_cache = layer.__call__(
+                    x_embedded,
+                    self_mask=mask,
+                    key=layer_key,
+                    cache=layer_cache,
+                    update_cache=True,
+                )
+                updated_cache[f"layer_{i}"] = layer_updated_cache
+            else:
+                x_embedded = layer.__call__(
+                    x_embedded,
+                    self_mask=mask,
+                    key=layer_key,
+                    cache=layer_cache,
+                )
+
+        # Apply final layer norm
+        output = jax.vmap(self.layer_norm)(x_embedded)
+
+        if update_cache:
+            return output, updated_cache
+        return output
+
+    def decode(
+        self,
+        x: Array,
+        context: Array,
+        self_mask: Array | None = None,
+        cross_mask: Array | None = None,
+        positions: Array | None = None,
+        key: PRNGKeyArray | None = None,
+        cache: dict[str, dict[str, dict[str, Array]]] | None = None,
+        update_cache: bool = False,
+    ) -> Array | tuple[Array, dict[str, dict[str, dict[str, Array]]]]:
+        """Decode with self-attention and cross-attention.
+
+        Args:
+            x: Input token indices
+            context: Context from encoder
+            self_mask: Optional self-attention mask
+            cross_mask: Optional cross-attention mask
+            positions: Optional positions
+            key: Optional PRNG key for dropout
+            cache: Optional dictionary containing cached key and value tensors
+            update_cache: Whether to update the cache and return it
+
+        Returns:
+            If update_cache is False: Decoded representation
+            If update_cache is True: Tuple of (decoded representation, updated cache)
+        """
+        # Token embedding
+        x_embedded = jax.vmap(lambda x_seq: jax.vmap(self.token_embedding)(x_seq))(x)
+
+        # Add positional embedding
+        x_embedded = self._add_positional_embedding(x_embedded, positions)
+
+        # Initialize layer caches
+        if cache is None and update_cache:
+            cache = {f"layer_{i}": {} for i in range(len(self.layers))}
+
+        # Updated cache will be built if needed
+        updated_cache = {}
+
+        # Apply transformer layers with cross-attention
+        keys: Array | list[None] = [None] * len(self.layers)
+        if key is not None:
+            keys = jax.random.split(key, len(self.layers))
+
+        for i, (layer, layer_key) in enumerate(zip(self.layers, keys, strict=False)):
+            layer_cache = None if cache is None else cache.get(f"layer_{i}")
+
+            if update_cache:
+                x_embedded, layer_updated_cache = layer.__call__(
+                    x_embedded,
+                    context=context,
+                    self_mask=self_mask,
+                    cross_mask=cross_mask,
+                    key=layer_key,
+                    cache=layer_cache,
+                    update_cache=True,
+                )
+                updated_cache[f"layer_{i}"] = layer_updated_cache
+            else:
+                x_embedded = layer(
+                    x_embedded,
+                    context=context,
+                    self_mask=self_mask,
+                    cross_mask=cross_mask,
+                    key=layer_key,
+                    cache=layer_cache,
+                )
+
+        # Apply final layer norm
+        output = jax.vmap(self.layer_norm)(x_embedded)
+
+        if update_cache:
+            return output, updated_cache
+        return output
+
+    @overload
+    def __call__(
+        self,
+        x: Array,
+        *,
+        mask: Array | None = None,
+        positions: Array | None = None,
+        key: PRNGKeyArray | None = None,
+        cache: dict[str, dict[str, dict[str, Array]]] | None = None,
+        update_cache: Literal[True],
+    ) -> tuple[Array, dict[str, dict[str, dict[str, Array]]]]: ...
+
+    @overload
+    def __call__(
+        self,
+        x: Array,
+        *,
+        mask: Array | None = None,
+        positions: Array | None = None,
+        key: PRNGKeyArray | None = None,
+        cache: dict[str, dict[str, dict[str, Array]]] | None = None,
+        update_cache: Literal[False] = False,
+    ) -> Array: ...
+
+    def __call__(
+        self,
+        x: Array,
+        *,
+        mask: Array | None = None,
+        positions: Array | None = None,
+        key: PRNGKeyArray | None = None,
+        cache: dict[str, dict[str, dict[str, Array]]] | None = None,
+        update_cache: bool = False,
+    ) -> Array | tuple[Array, dict[str, dict[str, dict[str, Array]]]]:
+        """Forward pass for encoder-only or decoder-only transformers.
+
+        Args:
+            x: Input token indices of shape (seq_len)
+            mask: Optional attention mask
+            positions: Optional positions
+            key: Optional PRNG key for dropout
+            cache: Optional dictionary containing cached key and value tensors
+            update_cache: Whether to update the cache and return it
+
+        Returns:
+            If update_cache is False: Output representation
+            If update_cache is True: Tuple of (output representation, updated cache)
+        """
+        chex.assert_rank(x, 1)
+
+        if update_cache:
+            output, updated_cache = self.encode(
+                x, mask=mask, positions=positions, key=key, cache=cache, update_cache=True
+            )
+        else:
+            output = self.encode(x, mask=mask, positions=positions, key=key, cache=cache)
+
+        # Apply output layer if it exists
+        if self.output_layer is not None:
+            output = jax.vmap(self.output_layer)(output)
+
+        if update_cache:
+            return output, updated_cache
+        return output
+
+    def predict_sequence(self, x_seq: Array) -> Array:
+        return self(x=x_seq)
+
+    def generate_sequence(
+        self,
+        prompt_seq: Array,
+        max_len: int,
+        temperature: float = 1.0,
+        top_k: int | None = None,
+        key: PRNGKeyArray | None = None,
+    ) -> Array:
+        """Generate a sequence autoregressively with KV caching.
+
+        Args:
+            prompt_seq: Input token indices of shape (prompt_len,)
+            max_len: Maximum length of generated sequence
+            temperature: Sampling temperature
+            top_k: Optional top-k sampling parameter
+            key: PRNG key for sampling
+
+        Returns:
+            Generated sequence of shape (prompt_len + max_len,)
+        """
+        if key is None:
+            key = jax.random.PRNGKey(0)
+
+        prompt_len = prompt_seq.shape[0]
+        sequence = prompt_seq
+
+        # Create causal mask for generation
+        causal_mask = jnp.tril(jnp.ones((self.max_seq_len, self.max_seq_len), dtype=jnp.bool_))
+
+        # Initialize cache with the prompt
+        _, cache = self(x=prompt_seq, mask=causal_mask[:prompt_len, :prompt_len], update_cache=True)
+
+        # Define decode step function (for clarity)
+        def decode_step(seq: Array, pos: int, cur_cache: dict, rng: PRNGKeyArray) -> tuple[Array, dict, PRNGKeyArray]:
+            # Get the next position and last token
+            pos_tensor = jnp.array([pos])
+            last_token = seq[-1:]
+
+            # Get logits for next token
+            rng, subrng = jax.random.split(rng)
+            logits, new_cache = self(
+                x=last_token,
+                positions=pos_tensor,
+                key=subrng,
+                cache=cur_cache,
+                update_cache=True,
+            )
+
+            # Extract final logits and apply temperature
+            logits = logits[-1] / temperature
+
+            # Apply top-k sampling if specified
+            if top_k is not None:
+                top_logits, top_indices = jax.lax.top_k(logits, top_k)
+                logits = jnp.full_like(logits, float("-inf"))
+                logits = logits.at[top_indices].set(top_logits)
+
+            # Sample next token
+            rng, subrng = jax.random.split(rng)
+            next_token = jax.random.categorical(subrng, logits[None, ...])[0]
+
+            # Add token to sequence
+            new_seq = jnp.concatenate([seq, next_token[None]], axis=0)
+            return new_seq, new_cache, rng
+
+        # Generate tokens one by one
+        for _ in range(max_len):
+            # Break if max sequence length reached
+            if sequence.shape[0] >= self.max_seq_len:
+                break
+
+            # Decode next token
+            sequence, cache, key = decode_step(seq=sequence, pos=sequence.shape[0] - 1, cur_cache=cache, rng=key)
+
+        return sequence
xax/task/logger.py CHANGED
@@ -526,7 +526,7 @@ class LoggerImpl(ABC):
         Returns:
             If the logger should log the current step.
         """
-        elapsed_time = state.elapsed_time_s.item()
+        elapsed_time = state.elapsed_time_s.item()
         return self.tickers[state.phase].tick(elapsed_time)
 
 
xax/task/mixins/train.py CHANGED
@@ -121,7 +121,7 @@ class ValidStepTimer:
             self.last_valid_step = state.num_steps.item()
 
     def __call__(self, state: State) -> bool:
-        if state.num_steps < self.valid_first_n_steps
+        if state.num_steps < self.valid_first_n_steps:
             return True
 
         if self.last_valid_time is None or self.last_valid_step is None:
@@ -130,18 +130,15 @@ class ValidStepTimer:
 
         # Step-based validation.
         valid_every_n_steps = self.valid_every_n_steps
-        if valid_every_n_steps is not None and (
-            state.num_steps >= valid_every_n_steps + self.last_valid_step
-            or state.num_valid_steps >= valid_every_n_steps + self.last_valid_step
-        ):
+        if valid_every_n_steps is not None and state.num_steps >= valid_every_n_steps + self.last_valid_step:
             self._reset(state)
             return True
 
         # Time-based validation.
         valid_every_n_seconds = self.valid_every_n_seconds
-        if
-
-
+        if (
+            valid_every_n_seconds is not None
+            and state.elapsed_time_s.item() - self.last_valid_time >= valid_every_n_seconds
         ):
             self._reset(state)
             return True
@@ -149,10 +146,7 @@ class ValidStepTimer:
         # Time-based validation for first validation step.
         if self.first_valid_step_flag:
             valid_first_n_seconds = self.valid_first_n_seconds
-            if valid_first_n_seconds is not None and (
-                state.elapsed_time_s.item() >= valid_first_n_seconds
-                or state.valid_elapsed_time_s.item() >= valid_first_n_seconds
-            ):
+            if valid_first_n_seconds is not None and state.elapsed_time_s.item() >= valid_first_n_seconds:
                 self._reset(state)
                 self.first_valid_step_flag = False
                 return True
@@ -777,12 +771,12 @@ class TrainMixin(
             self.log_step(eqx.combine(model_arr, model_static), valid_batch, output, metrics, state)
 
             state = state.replace(
-
-
+                num_steps=state.num_steps + 1,
+                num_samples=state.num_samples + (self.get_size_of_batch(valid_batch) or 0),
             )
 
         state = state.replace(
-
+            elapsed_time_s=state.elapsed_time_s + timer.elapsed_time,
        )
 
        with ContextTimer() as timer:
@@ -882,7 +876,7 @@ class TrainMixin(
        key, model_key = jax.random.split(key)
        models, optimizers, opt_states, state = self.load_initial_state(model_key, load_optimizer=True)
        logger.info("Model size: %s", f"{get_pytree_param_count(models):,}")
-        logger.info("Optimizer size: %s", f"{get_pytree_param_count(
+        logger.info("Optimizer size: %s", f"{get_pytree_param_count(opt_states):,}")
 
        state = self.on_training_start(state)
 
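
With the valid-specific counters gone from State, the validation trigger reduces to the shared step and time counters. A condensed paraphrase of the new ValidStepTimer.__call__ logic, using only the attribute names visible in the diff; the _reset bookkeeping and the first-validation-by-time branch are elided:

    def should_validate(timer: "ValidStepTimer", state: State) -> bool:
        # Always validate during the first few training steps.
        if state.num_steps < timer.valid_first_n_steps:
            return True

        # Step-based: enough training steps since the last validation pass.
        if (
            timer.valid_every_n_steps is not None
            and state.num_steps >= timer.valid_every_n_steps + timer.last_valid_step
        ):
            return True

        # Time-based: enough elapsed training time since the last validation pass.
        if (
            timer.valid_every_n_seconds is not None
            and state.elapsed_time_s.item() - timer.last_valid_time >= timer.valid_every_n_seconds
        ):
            return True

        return False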
xax/utils/experiments.py CHANGED
@@ -111,8 +111,8 @@ class StateTimer:
 
     def step(self, state: State) -> None:
         cur_time = time.time()
-        num_steps = int(
-        num_samples = int(
+        num_steps = int(state.num_steps.item())
+        num_samples = int(state.num_samples.item())
         self.step_timer.step(num_steps, cur_time)
         self.sample_timer.step(num_samples, cur_time)
         self.iter_timer.step(cur_time)
{xax-0.2.22.dist-info → xax-0.2.23.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
-xax/__init__.py,sha256=
+xax/__init__.py,sha256=Q0boKxPtEUiiJ9j7Cdx51bLLFtYx3fPfCTG-o8o2Chk,15653
 xax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/requirements-dev.txt,sha256=qkscNkFzWd1S5fump-AKH53rR65v2x5FmboFdy_kKvs,128
 xax/requirements.txt,sha256=6qY-84e-sTmlfJNrSjwONQKqzAn5h8G_oGIhnhmfSr4,302
@@ -6,8 +6,9 @@ xax/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/cli/edit_config.py,sha256=LQUIlOS6hvPZyVEaMme3FP-62M0BKQPYavCwVDWuBLw,2600
 xax/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/core/conf.py,sha256=d7Dp_GwKnaxtkztlSrJSM_LR0UYJX_FWTtceIWCBkxc,5138
-xax/core/state.py,sha256=
+xax/core/state.py,sha256=F9Tj3FfCw8zFKaDEoEGiThZE2ntYEtzNjnBX3pQ1g60,3826
 xax/nn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+xax/nn/attention.py,sha256=0essK90OO3x9FxnUqU0DhufwXKRMN41zMtRCki5iKzQ,24742
 xax/nn/embeddings.py,sha256=bQGxBFxkLwi2MQLkRfGaHPH5P_KKB21HdI7VNWTKIOQ,11847
 xax/nn/functions.py,sha256=bA5kJYzMtFM8eUqBC086i355zJMAO7k_vPFNSDBI9-s,2814
 xax/nn/geom.py,sha256=A7WPefMvgwUNReZC7_HX1GmvHPASyghbaXaKsuhwDrE,7382
@@ -17,7 +18,7 @@ xax/nn/parallel.py,sha256=fnTiT7MsG7eQrJvqwjIz2Ifo3P27TuxIJzmpGYSa_dQ,4608
 xax/nn/ssm.py,sha256=8dLAcQ1hBaMT-kkHvwGu_ecxJeTY32WeMYmd4T4KtxA,10745
 xax/task/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/task/base.py,sha256=TYANmjNcce4_V5ZSYLnE91PXRn7Nn0nT7hN8plW_Au0,8117
-xax/task/logger.py,sha256=
+xax/task/logger.py,sha256=Bmhl4mv08Aq49ZyX6BdjPIsPJK28e8s3mVFatM4IY2Q,41060
 xax/task/script.py,sha256=bMMIJoUtpSBvPp6-7bejTrajTXvSg0794sYLKdPIToE,972
 xax/task/task.py,sha256=UHMpnv__gqMcfbC_L-Hhk-DCnUYlFVsgbNf-v8o8B7U,1424
 xax/task/launchers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -41,10 +42,10 @@ xax/task/mixins/logger.py,sha256=6oXsJJyNUx6YT3q58FVXMZBUpMgjVkGre6BXFN20cVI,280
 xax/task/mixins/process.py,sha256=hqDEsMp_SL6ee97iq26-G0g49OcWZZaX82JD4F22eJU,1781
 xax/task/mixins/runnable.py,sha256=IYIsLd2k09g-_y6o44EhJqT7E6BpsyEMmsyLSuzqjtc,1979
 xax/task/mixins/step_wrapper.py,sha256=-Yu5Nft2CRw1JvZt6J_94SM1vqX8fk08IDK95Pmd2ew,1648
-xax/task/mixins/train.py,sha256=
+xax/task/mixins/train.py,sha256=TZatz5QwTfrNhQTiO2IqrmQY9P4Lay6FAD2VsQpWa54,33245
 xax/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/utils/debugging.py,sha256=OtUdu-3tQsQtik0Q9UM-SNV46IbPjwrAfZcywzoB5d4,1940
-xax/utils/experiments.py,sha256=
+xax/utils/experiments.py,sha256=5k5hPYSaVjzoR_nm2Q3DAHMMYi3Bcp3N3PAQbwZq7Gg,29830
 xax/utils/jax.py,sha256=6cP95-rcjkRt1fefkZWJQhJhH0uUYWJB3w4NP1-aDp0,10136
 xax/utils/jaxpr.py,sha256=H7pWl48ROXIB1-ZPWYfOn-ou3EBMxYWIwc_A0reJQoo,2333
 xax/utils/logging.py,sha256=GAhTne2rdB4Fa1lzk06DMO15U8MTejn6XTClShC-ZtU,6622
@@ -58,9 +59,9 @@ xax/utils/data/collate.py,sha256=Rd9vMomr_S_zCa_Hi4dO-8ntzAfVwndIUtuXFA3iNcc,706
 xax/utils/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/utils/types/frozen_dict.py,sha256=ebtHENhyUzSjyJTlbMaLtcckQIJ7EtgJiok_40TJZpo,4689
 xax/utils/types/hashable_array.py,sha256=l5iIcFmkYzfGeaZmcSoeFkthFASqM8xJYK3AXhZQYwc,992
-xax-0.2.
-xax-0.2.
-xax-0.2.
-xax-0.2.
-xax-0.2.
-xax-0.2.
+xax-0.2.23.dist-info/licenses/LICENSE,sha256=HCN2bImAzUOXldAZZI7JZ9PYq6OwMlDAP_PpX1HnuN0,1071
+xax-0.2.23.dist-info/METADATA,sha256=mA98vsIjdfb8XM2mN1vUb2VRVEPU4xf10IWLxxFJjmY,1247
+xax-0.2.23.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+xax-0.2.23.dist-info/entry_points.txt,sha256=uRC6rx5ce0bf-FblJaZSBMxxKFfMyoWTf8OWbBmLSe8,61
+xax-0.2.23.dist-info/top_level.txt,sha256=g4Au_r2XhvZ-lTybviH-Fh9g0zF4DAYHYxPue1-xbs8,4
+xax-0.2.23.dist-info/RECORD,,
{xax-0.2.22.dist-info → xax-0.2.23.dist-info}/entry_points.txt
File without changes
{xax-0.2.22.dist-info → xax-0.2.23.dist-info}/licenses/LICENSE
File without changes
{xax-0.2.22.dist-info → xax-0.2.23.dist-info}/top_level.txt
File without changes