xax 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xax/__init__.py +9 -10
- xax/nn/geom.py +57 -0
- xax/nn/ssm.py +194 -174
- xax/task/mixins/train.py +15 -7
- {xax-0.1.11.dist-info → xax-0.1.12.dist-info}/METADATA +1 -1
- {xax-0.1.11.dist-info → xax-0.1.12.dist-info}/RECORD +9 -9
- {xax-0.1.11.dist-info → xax-0.1.12.dist-info}/WHEEL +0 -0
- {xax-0.1.11.dist-info → xax-0.1.12.dist-info}/licenses/LICENSE +0 -0
- {xax-0.1.11.dist-info → xax-0.1.12.dist-info}/top_level.txt +0 -0
xax/__init__.py
CHANGED
@@ -12,7 +12,7 @@ and running the update script:
     python -m scripts.update_api --inplace
 """
 
-__version__ = "0.1.11"
+__version__ = "0.1.12"
 
 # This list shouldn't be modified by hand; instead, run the update script.
 __all__ = [
@@ -43,15 +43,14 @@ __all__ = [
     "euler_to_quat",
     "get_projected_gravity_vector_from_quat",
     "quat_to_euler",
+    "rotate_vector_by_quat",
     "cross_entropy",
     "cast_norm_type",
     "get_norm",
     "is_master",
+    "BaseSSMBlock",
     "DiagSSMBlock",
-    "
-    "S4",
-    "S4Layer",
-    "S6Layer",
+    "SSM",
     "SSMBlock",
     "BaseLauncher",
     "CliLauncher",
@@ -203,15 +202,14 @@ NAME_MAP: dict[str, str] = {
    "euler_to_quat": "nn.geom",
    "get_projected_gravity_vector_from_quat": "nn.geom",
    "quat_to_euler": "nn.geom",
+    "rotate_vector_by_quat": "nn.geom",
    "cross_entropy": "nn.losses",
    "cast_norm_type": "nn.norm",
    "get_norm": "nn.norm",
    "is_master": "nn.parallel",
+    "BaseSSMBlock": "nn.ssm",
    "DiagSSMBlock": "nn.ssm",
-    "
-    "S4": "nn.ssm",
-    "S4Layer": "nn.ssm",
-    "S6Layer": "nn.ssm",
+    "SSM": "nn.ssm",
    "SSMBlock": "nn.ssm",
    "BaseLauncher": "task.launchers.base",
    "CliLauncher": "task.launchers.cli",
@@ -364,11 +362,12 @@ if IMPORT_ALL or TYPE_CHECKING:
         euler_to_quat,
         get_projected_gravity_vector_from_quat,
         quat_to_euler,
+        rotate_vector_by_quat,
     )
     from xax.nn.losses import cross_entropy
     from xax.nn.norm import NormType, cast_norm_type, get_norm
     from xax.nn.parallel import is_master
-    from xax.nn.ssm import
+    from xax.nn.ssm import SSM, BaseSSMBlock, DiagSSMBlock, SSMBlock
     from xax.task.base import RawConfigType
     from xax.task.launchers.base import BaseLauncher
     from xax.task.launchers.cli import CliLauncher
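The renames above change the public import surface: S4, S4Layer, and S6Layer are dropped in favor of a single SSM export, while rotate_vector_by_quat and BaseSSMBlock are newly exported. A minimal migration sketch (illustrative, not taken from the package's own docs):

    # 0.1.11 (removed exports):
    #   from xax.nn.ssm import S4, S4Layer, S6Layer
    # 0.1.12 equivalents:
    from xax.nn.ssm import SSM, BaseSSMBlock, DiagSSMBlock, SSMBlock
    from xax.nn.geom import rotate_vector_by_quat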
xax/nn/geom.py
CHANGED
@@ -99,3 +99,60 @@ def get_projected_gravity_vector_from_quat(quat: jax.Array, eps: float = 1e-6) -> jax.Array:
 
     # Note: We're rotating [0,0,-1], so we negate gz to match the expected direction
     return jnp.concatenate([gx, gy, -gz], axis=-1)
+
+
+def rotate_vector_by_quat(vector: jax.Array, quat: jax.Array, eps: float = 1e-6) -> jax.Array:
+    """Rotates a vector by a quaternion.
+
+    Args:
+        vector: The vector to rotate, shape (*, 3).
+        quat: The quaternion to rotate by, shape (*, 4).
+        eps: A small epsilon value to avoid division by zero.
+
+    Returns:
+        The rotated vector, shape (*, 3).
+    """
+    # Normalize quaternion
+    quat = quat / (jnp.linalg.norm(quat, axis=-1, keepdims=True) + eps)
+    w, x, y, z = jnp.split(quat, 4, axis=-1)
+
+    # Extract vector components
+    vx, vy, vz = jnp.split(vector, 3, axis=-1)
+
+    # Terms for x component
+    xx = (
+        w * w * vx
+        + 2 * y * w * vz
+        - 2 * z * w * vy
+        + x * x * vx
+        + 2 * y * x * vy
+        + 2 * z * x * vz
+        - z * z * vx
+        - y * y * vx
+    )
+
+    # Terms for y component
+    yy = (
+        2 * x * y * vx
+        + y * y * vy
+        + 2 * z * y * vz
+        + 2 * w * z * vx
+        - z * z * vy
+        + w * w * vy
+        - 2 * w * x * vz
+        - x * x * vy
+    )
+
+    # Terms for z component
+    zz = (
+        2 * x * z * vx
+        + 2 * y * z * vy
+        + z * z * vz
+        - 2 * w * y * vx
+        + w * w * vz
+        + 2 * w * x * vy
+        - y * y * vz
+        - x * x * vz
+    )
+
+    return jnp.concatenate([xx, yy, zz], axis=-1)
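A quick sanity check of the new helper (a usage sketch assuming xax 0.1.12 is installed; the quaternion is in (w, x, y, z) order, matching the jnp.split above):

    import jax.numpy as jnp
    from xax.nn.geom import rotate_vector_by_quat

    # Rotate the x unit vector 90 degrees about the z-axis.
    quat = jnp.array([jnp.cos(jnp.pi / 4), 0.0, 0.0, jnp.sin(jnp.pi / 4)])
    vec = jnp.array([1.0, 0.0, 0.0])
    print(rotate_vector_by_quat(vec, quat))  # approximately [0., 1., 0.]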
xax/nn/ssm.py
CHANGED
@@ -13,140 +13,18 @@ def glorot(key: PRNGKeyArray, shape: tuple[int, ...]) -> Array:
     return jax.random.uniform(key, shape, minval=-1.0, maxval=1.0) * jnp.sqrt(2 / sum(shape))
 
 
-class
-    a: Array
-    B: Array
-    C: Array
-    proj_in: eqx.nn.Linear
-    proj_out: eqx.nn.Linear
-
-    def __init__(
-        self,
-        hidden_size: int,
-        projection_size: int,
-        input_size: int,
-        output_size: int,
-        *,
-        key: PRNGKeyArray,
-    ) -> None:
-        self.a = jax.nn.initializers.glorot_uniform()(key, (hidden_size,))
-        self.B = jax.nn.initializers.glorot_uniform()(key, (projection_size, hidden_size))
-        self.C = jax.nn.initializers.glorot_uniform()(key, (hidden_size, projection_size))
-        self.proj_in = eqx.nn.Linear(input_size, projection_size, key=key)
-        self.proj_out = eqx.nn.Linear(projection_size, output_size, key=key)
-
-    def __call__(self, h: Array, x: Array) -> tuple[Array, Array]:
-        h = self.a * h + self.B.T @ x
-        y = self.C.T @ h
-        return h, y
-
-    def predict_sequence(self, x_seq: Array) -> Array:
-        x_proj = jax.vmap(lambda x: jax.nn.relu(self.proj_in(x)))(x_seq)
-        h = jnp.zeros(self.a.shape[0])
-
-        def scan_fn(h: Array, x: Array) -> tuple[Array, Array]:
-            h = self.a * h + self.B.T @ x
-            y = self.C.T @ h
-            return h, y
-
-        _, y_seq = jax.lax.scan(scan_fn, h, x_proj)
-        y_out = jax.vmap(self.proj_out)(y_seq)
-        return y_out
-
-
-class S4Layer(eqx.Module):
-    a: Array
-    B: Array
-    C: Array
-    proj_in: eqx.nn.Linear
-    proj_out: eqx.nn.Linear
-    delta: Array
-
-    def __init__(
-        self,
-        hidden_size: int,
-        projection_size: int,
-        input_size: int,
-        output_size: int,
-        *,
-        key: PRNGKeyArray,
-    ) -> None:
-        self.a = jax.nn.initializers.glorot_uniform()(key, (hidden_size,))
-        self.B = jax.nn.initializers.glorot_uniform()(key, (projection_size, hidden_size))
-        self.C = jax.nn.initializers.glorot_uniform()(key, (hidden_size, projection_size))
-        self.proj_in = eqx.nn.Linear(input_size, projection_size, key=key)
-        self.proj_out = eqx.nn.Linear(projection_size, output_size, key=key)
-        self.delta = jax.random.uniform(key, (hidden_size,))
-
-    def __call__(self, h: Array, x: Array) -> tuple[Array, Array]:
-        delta_a = self.delta * self.a
-        a_bar = jnp.exp(delta_a)
-        b_bar = jnp.linalg.inv(delta_a) * (a_bar - 1) @ (self.delta * self.B)
-        h = a_bar * h + b_bar.T @ x
-        y = self.C.T @ h
-        return h, y
-
-    def predict_sequence(self, x_seq: Array) -> Array:
-        x_proj = jax.vmap(lambda x: jax.nn.gelu(self.proj_in(x)))(x_seq)
-        h = jnp.zeros(self.a.shape[0])
-
-        def scan_fn(h: Array, x: Array) -> tuple[Array, Array]:
-            h = self.a * h + self.B.T @ x
-            y = self.C.T @ h
-            return h, y
-
-        _, y_seq = jax.lax.scan(scan_fn, h, x_proj)
-        y_out = jax.vmap(self.proj_out)(y_seq)
-        return y_out
-
-
-class S6Layer(eqx.Module):
-    a: Array
-    B: Array
-    C: Array
-    proj_in: eqx.nn.Linear
-    proj_out: eqx.nn.Linear
-    delta: Array
-
-    def __init__(
-        self,
-        hidden_size: int,
-        projection_size: int,
-        input_size: int,
-        output_size: int,
-        *,
-        key: PRNGKeyArray,
-    ) -> None:
-        self.a = jax.nn.initializers.glorot_uniform()(key, (hidden_size,))
-        self.B = jax.nn.initializers.glorot_uniform()(key, (projection_size, hidden_size))
-        self.C = jax.nn.initializers.glorot_uniform()(key, (hidden_size, projection_size))
-        self.proj_in = eqx.nn.Linear(input_size, projection_size, key=key)
-        self.proj_out = eqx.nn.Linear(projection_size, output_size, key=key)
-        self.delta = jax.random.uniform(key, (hidden_size,))
-
-    def __call__(self, h: Array, x: Array) -> tuple[Array, Array]:
-        h = self.a * h + self.B.T @ x
-        y = self.C.T @ h
-        return h, y
-
-    def predict_sequence(self, x_seq: Array) -> Array:
-        x_proj = jax.vmap(lambda x: jax.nn.gelu(self.proj_in(x)))(x_seq)
-        h = jnp.zeros(self.a.shape[0])
-
-        def scan_fn(h: Array, x: Array) -> tuple[Array, Array]:
-            h = self.a * h + self.B.T @ x
-            y = self.C.T @ h
-            return h, y
+class BaseSSMBlock(eqx.Module, ABC):
+    @abstractmethod
+    def forward(self, h: Array, x: Array) -> Array: ...
 
-        _, y_seq = jax.lax.scan(scan_fn, h, x_proj)
-        y_out = jax.vmap(self.proj_out)(y_seq)
-        return y_out
+    @abstractmethod
+    def forward_sequence(self, x_seq: Array) -> Array: ...
 
+    @abstractmethod
+    def get_a_mat(self, x: Array) -> Array: ...
 
-class BaseSSMBlock(eqx.Module, ABC):
     @abstractmethod
-    def
-        pass
+    def get_b_mat(self, x: Array) -> Array: ...
 
 
 class SSMBlock(BaseSSMBlock):
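The restructured base class makes get_a_mat and get_b_mat the single point of variation between blocks: forward and forward_sequence are written against those accessors, and DiscreteDiagSSMBlock (introduced below) only overrides the accessors to apply discretization. A hypothetical third-party block, sketched against the four abstract methods above (the class name and initialization scheme are illustrative, not from the package):

    import jax
    import jax.numpy as jnp
    from jaxtyping import Array, PRNGKeyArray

    from xax.nn.ssm import BaseSSMBlock


    class IdentityDiagBlock(BaseSSMBlock):
        """Hypothetical block: fixed identity transition, learned input matrix."""

        b_mat: Array

        def __init__(self, hidden_size: int, *, key: PRNGKeyArray) -> None:
            self.b_mat = jax.random.normal(key, (hidden_size, hidden_size)) * 0.02

        def get_a_mat(self, x: Array) -> Array:
            return jnp.ones(self.b_mat.shape[0])  # diagonal of the identity

        def get_b_mat(self, x: Array) -> Array:
            return self.b_mat

        def forward(self, h: Array, x: Array) -> Array:
            return self.get_a_mat(x) * h + self.get_b_mat(x).T @ x

        def forward_sequence(self, x_seq: Array) -> Array:
            def step(h: Array, x: Array) -> tuple[Array, Array]:
                h = self.forward(h, x)
                return h, h

            h_0 = jnp.zeros(self.b_mat.shape[0])
            _, h_seq = jax.lax.scan(step, h_0, x_seq)
            return h_seq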
@@ -158,80 +36,194 @@ class SSMBlock(BaseSSMBlock):
         self.a_mat = glorot(key_a, (hidden_size, hidden_size))
         self.b_mat = glorot(key_b, (hidden_size, hidden_size))
 
+    def get_a_mat(self, x: Array) -> Array:
+        return self.a_mat
+
+    def get_b_mat(self, x: Array) -> Array:
+        return self.b_mat
+
     def forward(self, h: Array, x: Array) -> Array:
-        h = self.a_mat @ h + self.b_mat.T @ x
+        """Perform a forward pass.
+
+        Args:
+            h: Hidden state of shape (H,).
+            x: Input of shape (H,).
+
+        Returns:
+            Hidden state of shape (H,).
+        """
+        a_mat = self.get_a_mat(x)
+        b_mat = self.get_b_mat(x)
+        h = a_mat @ h + b_mat.T @ x
         return h
 
-    def
-
+    def forward_sequence(self, x_seq: Array) -> Array:
+        """Perform a forward pass across time.
+
+        Args:
+            x_seq: Input sequence of shape (T, H).
+
+        Returns:
+            Hidden state sequence of shape (T, H).
+        """
+
+        def step(h: Array, x: Array) -> tuple[Array, Array]:
+            h = self.forward(h, x)
+            return h, h
+
+        a_mat = self.get_a_mat(x_seq)
+        h_0 = jnp.zeros(a_mat.shape[0])
+        _, h_seq = jax.lax.scan(step, h_0, x_seq)
+        return h_seq
 
 
 class DiagSSMBlock(BaseSSMBlock):
-    a: Array
+    a_diag: Array
     b_mat: Array
 
     def __init__(self, hidden_size: int, *, key: PRNGKeyArray) -> None:
         keys = jax.random.split(key, 2)
-        self.a = glorot(keys[0], (hidden_size,))
+        self.a_diag = glorot(keys[0], (hidden_size,))
         self.b_mat = glorot(keys[1], (hidden_size, hidden_size))
 
+    def get_a_mat(self, x: Array) -> Array:
+        return self.a_diag
+
+    def get_b_mat(self, x: Array) -> Array:
+        return self.b_mat
+
     def forward(self, h: Array, x: Array) -> Array:
-
-
+        """Perform a forward pass.
+
+        Args:
+            h: Hidden state of shape (H,).
+            x: Input of shape (H,).
+
+        Returns:
+            Hidden state of shape (H,).
+        """
+        a_diag = self.get_a_mat(x)
+        b_mat = self.get_b_mat(x)
+        h = a_diag * h + b_mat.T @ x
         return h
 
-    def
+    def forward_sequence(self, x_seq: Array, *, use_conv: bool = True, recursive_kernel_calc: bool = False) -> Array:
+        """Perform a potentially parallelized forward pass across time.
+
+        Args:
+            x_seq: Input sequence of shape (T, H).
+            use_conv: Whether to use convolution to compute the sequence.
+            recursive_kernel_calc: Whether to use a recursive kernel calculation.
+
+        Returns:
+            Hidden state sequence of shape (T, H).
+        """
+        if use_conv:
+            return self._forward_sequence_conv(x_seq, recursive_kernel_calc=recursive_kernel_calc)
+        else:
+            return self._forward_sequence_scan(x_seq)
+
+    def _get_kernel(self, x_seq: Array, length: int) -> Array:
         """Returns the kernel with time as the final dimension."""
         exponents = jnp.arange(length)
-
-        kernel =
+        a_diag = self.get_a_mat(x_seq)
+        kernel = jnp.power(a_diag[:, None], exponents)  # (H, T)
+        kernel = kernel[:, None, :]  # (H, 1, T)
         return kernel
 
-    def
+    def _get_kernel_recursive(self, x_seq: Array, length: int) -> Array:
+        """Returns the kernel with time as the final dimension."""
+        assert length % 2 == 0, "Length must be even."
+        a_diag = self.get_a_mat(x_seq)
+
+        def helper(length: int) -> tuple[Array, Array]:
+            """Returns the kernel and the sqrt of the diagonal."""
+            if length == 1:
+                return jnp.ones_like(a_diag)[:, None], a_diag[:, None]
+
+            half_length = length // 2
+            kernel_half, a_half = helper(half_length)
+            kernel = jnp.concatenate([kernel_half, a_half * kernel_half], axis=-1)
+            return kernel, a_half * a_half
+
+        kernel, a_diag = helper(length)
+        return kernel[:, None, :]  # (H, 1, L)
+
+    def _forward_sequence_conv(self, x_seq: Array, *, recursive_kernel_calc: bool = False) -> Array:
         """Convolves x (T, H) across time using the kernel."""
-
+        seq_len, hidden_size = x_seq.shape
+        b_mat = self.get_b_mat(x_seq)
 
-
-
-        s = s.T  # (H, T)
+        s = b_mat.T @ x_seq.T  # (H, T)
+        s_padded = jnp.pad(s, ((0, 0), (seq_len - 1, 0)))[None, :, :]  # (1, H, 2T-1)
 
-
-
+        if recursive_kernel_calc:
+            kernel = self._get_kernel_recursive(x_seq, seq_len)
+        else:
+            kernel = self._get_kernel(x_seq, seq_len)
 
-
-        s_padded = jnp.pad(s, ((0, 0), (0, 0), (tsz - 1, 0)))
+        kernel_flipped = jnp.flip(kernel, axis=-1)  # (H, 1, L)
 
-        # Perform depthwise (grouped) 1D convolution.
-        # We use input shape (N, H, L) and kernel shape (H, 1, T) with feature_group_count=H.
-        # The dimension_numbers are chosen so that the channel dimension is second.
         conv_out = jax.lax.conv_general_dilated(
             s_padded,
             kernel_flipped,
             window_strides=(1,),
             padding="VALID",
-            dimension_numbers=("
-            feature_group_count=
+            dimension_numbers=("NCT", "OIT", "NCT"),  # convolving over time
+            feature_group_count=hidden_size,
         )
-
-        conv_out = jnp.transpose(conv_out, (0, 2, 1))
+        conv_out = conv_out[0].T  # (T, H)
         return conv_out
 
-    def
+    def _forward_sequence_scan(self, x_seq: Array) -> Array:
         """Naively forward across time."""
 
         def step(h: Array, x: Array) -> tuple[Array, Array]:
             h = self.forward(h, x)
             return h, h
 
-
-
+        a_diag = self.get_a_mat(x_seq)
+        h_0 = jnp.zeros(a_diag.shape[0])
+        _, h_seq = jax.lax.scan(step, h_0, x_seq)
         return h_seq
 
 
-class S4(eqx.Module):
+class DiscreteDiagSSMBlock(DiagSSMBlock):
+    delta: Array
+
+    def __init__(
+        self,
+        hidden_size: int,
+        *,
+        key: PRNGKeyArray,
+        init_delta: float = 1.0,
+        init_scale: float = 10.0,
+    ) -> None:
+        super().__init__(hidden_size, key=key)
+        self.delta = jnp.array(init_delta)
+
+        # A positive scale helps reduce the gradient at the start.
+        self.a_diag = jax.random.uniform(key, (hidden_size,), minval=-1.0, maxval=0.0) * init_scale
+
+    def get_a_mat(self, x: Array) -> Array:
+        """Discretize the diagonal matrix using zero-order hold."""
+        a_diag_discrete = jnp.exp(self.a_diag * self.delta)
+        return a_diag_discrete
+
+    def get_b_mat(self, x: Array) -> Array:
+        """Discretize the input matrix using zero-order hold."""
+        delta_a_diag = self.a_diag * self.delta
+        exp_a_diag = jnp.exp(delta_a_diag)
+        delta_a_inv = 1 / delta_a_diag
+        delta_b_mat = self.delta * self.b_mat
+
+        b_discrete = delta_a_inv * (exp_a_diag - 1) * delta_b_mat
+        return b_discrete
+
+
+class SSM(eqx.Module):
     vocab_embedding: eqx.nn.Embedding
-    proj_in: eqx.nn.Linear
-    proj_out: eqx.nn.Linear
+    output_layer: eqx.nn.Linear
     blocks: list[BaseSSMBlock]
     num_layers: int = eqx.static_field()
     hidden_size: int = eqx.static_field()
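DiscreteDiagSSMBlock is the standard zero-order-hold discretization of a diagonal continuous-time system h'(t) = a * h(t) + B.T @ x(t): the transition becomes a_bar = exp(a * delta) and the input matrix becomes b_bar = (a * delta)^-1 * (exp(a * delta) - 1) * (delta * B), which is exactly what the two accessors above compute elementwise. A small numeric restatement (the values are my own worked example, not from the diff):

    import jax.numpy as jnp

    a, delta = -0.5, 0.1                               # one diagonal entry and the step size
    a_bar = jnp.exp(a * delta)                         # get_a_mat: exp(a * delta), ~0.9512
    b_scale = (jnp.exp(a * delta) - 1) / (a * delta)   # per-entry scale on (delta * b_mat), ~0.9754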
@@ -243,24 +235,30 @@ class S4(eqx.Module):
         hidden_size: int,
         output_size: int,
         num_layers: int,
-        block_type: Literal["
+        block_type: Literal["diagonal", "full_rank"] = "full_rank",
         skip_connections: bool = False,
+        discretize: bool = False,
         *,
         key: PRNGKeyArray,
     ) -> None:
         vocab_key, s4_key = jax.random.split(key, 2)
         self.vocab_embedding = eqx.nn.Embedding(input_size, hidden_size, key=vocab_key)
-        self.
-        self.proj_out = eqx.nn.Linear(hidden_size, output_size, key=key)
+        self.output_layer = eqx.nn.Linear(hidden_size, output_size, key=key)
 
         block_keys = jax.random.split(s4_key, num_layers)
 
         def get_block(key: PRNGKeyArray) -> BaseSSMBlock:
             match block_type:
-                case "
+                case "diagonal":
+                    return (
+                        DiscreteDiagSSMBlock(hidden_size, key=key, init_delta=0.1)
+                        if discretize
+                        else DiagSSMBlock(hidden_size, key=key)
+                    )
+                case "full_rank":
+                    if discretize:
+                        raise ValueError("Full rank blocks do not support discretization due to instability.")
                     return SSMBlock(hidden_size, key=key)
-                case "diag":
-                    return DiagSSMBlock(hidden_size, key=key)
                 case _:
                     raise ValueError(f"Unknown block type: {block_type}")
 
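A construction sketch for the new block selection (the sizes are hypothetical; the argument names are read off the __init__ diff above, with input_size being the vocabulary size fed to eqx.nn.Embedding):

    import jax
    from xax.nn.ssm import SSM

    model = SSM(
        input_size=256,       # vocabulary size for the embedding
        hidden_size=128,
        output_size=256,
        num_layers=2,
        block_type="diagonal",
        discretize=True,      # selects DiscreteDiagSSMBlock(init_delta=0.1) per the match arm above
        key=jax.random.PRNGKey(0),
    )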
@@ -276,21 +274,43 @@ class S4(eqx.Module):
             new_hs.append(h)
             xh = jax.nn.gelu(h)
             x = xh + x if self.skip_connections else xh
-        y = self.proj_out(x)
+        y = self.output_layer(x)
         return new_hs, y
 
     def _embed_input(self, x: Array) -> Array:
         """U is the input to the S4 cell."""
-        embedded = self.vocab_embedding(x)
-        return jax.nn.gelu(self.proj_in(embedded))
+        return self.vocab_embedding(x)
 
     def predict_sequence(self, x_seq: Array) -> Array:
         x_emb = jax.vmap(self._embed_input)(x_seq)
+        for block in self.blocks:
+            h = block.forward_sequence(x_emb)
+            # h = block.naive_forward_sequence(x_emb)
+            h = jax.nn.gelu(h)
+            x_emb = h + x_emb if self.skip_connections else h
+        y = jax.vmap(self.output_layer)(x_emb)
+        return y
+
+    def generate_sequence(self, prompt_seq: Array, max_len: int) -> Array:
         hs = [jnp.zeros(self.hidden_size) for _ in range(self.num_layers)]
+        prompt_seq_embedded = jax.vmap(self._embed_input)(prompt_seq)
 
-        def
+        def encode_step(hs: list[Array], x: Array) -> tuple[list[Array], Array]:
             hs, y = self(hs, x)
             return hs, y
 
-
-
+        def decode_step(
+            carry: tuple[list[Array], Array, PRNGKeyArray],
+            _: None,
+        ) -> tuple[tuple[list[Array], Array, PRNGKeyArray], Array]:
+            hs, last_token, rng = carry
+            token_embedded = self._embed_input(last_token)
+            hs, y = self(hs, token_embedded)
+            token = jax.random.categorical(rng, y)
+            rng = jax.random.split(rng)[0]
+            return (hs, token, rng), token
+
+        hs, _ = jax.lax.scan(encode_step, hs, prompt_seq_embedded)
+        _, sequence = jax.lax.scan(decode_step, (hs, prompt_seq[-1], jax.random.PRNGKey(0)), None, length=max_len)
+
+        return sequence
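Continuing the construction sketch above, generation encodes the prompt with one scan and then samples autoregressively with a second scan. Note that, per the diff, decode_step seeds sampling from a fixed jax.random.PRNGKey(0), so repeated calls produce identical tokens:

    import jax.numpy as jnp

    prompt = jnp.array([1, 2, 3])                        # token ids
    tokens = model.generate_sequence(prompt, max_len=16)
    print(tokens.shape)                                  # (16,) sampled token ids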
xax/task/mixins/train.py
CHANGED
@@ -218,26 +218,32 @@ class TrainMixin(
         state = super().on_step_end(state)
         return state.replace(elapsed_time_s=time.time() - state.start_time_s)
 
-    def log_train_step(self, batch: Batch, output: Output, metrics: FrozenDict[str, Array], state: State) -> None:
+    def log_train_step(
+        self, model: PyTree, batch: Batch, output: Output, metrics: FrozenDict[str, Array], state: State
+    ) -> None:
         """Override this function to do logging during the training phase.
 
         This function is called after the model forward pass and before the
         backward pass. It is called in the training phase.
 
         Args:
+            model: The current model.
             batch: The batch from the dataloader.
             output: The model output.
             metrics: The metrics for the current batch.
             state: The current training state.
         """
 
-    def log_valid_step(self, batch: Batch, output: Output, metrics: FrozenDict[str, Array], state: State) -> None:
+    def log_valid_step(
+        self, model: PyTree, batch: Batch, output: Output, metrics: FrozenDict[str, Array], state: State
+    ) -> None:
         """Override this function to do logging during the validation phase.
 
         This function is called after the model forward pass. It is called in
         the validation phase.
 
         Args:
+            model: The current model.
             batch: The batch from the dataloader.
             output: The model output.
             metrics: The metrics for the current batch.
@@ -251,7 +257,9 @@ class TrainMixin(
         for k, v in d.items():
             self.logger.log_scalar(k, v, namespace=ns)
 
-    def log_step(self, batch: Batch, output: Output, metrics: FrozenDict[str, Array], state: State) -> None:
+    def log_step(
+        self, model: PyTree, batch: Batch, output: Output, metrics: FrozenDict[str, Array], state: State
+    ) -> None:
         phase = state.phase
 
         for k, v in metrics.items():
@@ -265,9 +273,9 @@ class TrainMixin(
         # Delegate to the appropriate logging function based on the phase.
         match phase:
             case "train":
-                self.log_train_step(batch, output, metrics, state)
+                self.log_train_step(model, batch, output, metrics, state)
             case "valid":
-                self.log_valid_step(batch, output, metrics, state)
+                self.log_valid_step(model, batch, output, metrics, state)
             case _:
                 raise KeyError(f"Unknown phase: {phase}")
 
@@ -579,7 +587,7 @@ class TrainMixin(
         )
 
         output, metrics = self.val_step(model_arr, model_static, valid_batch, state)
-        self.log_step(valid_batch, output, metrics, state)
+        self.log_step(eqx.combine(model_arr, model_static), valid_batch, output, metrics, state)
 
         state = self.on_step_start(state)
         train_batch = next(train_pf)
@@ -597,7 +605,7 @@ class TrainMixin(
             batch=train_batch,
             state=state,
         )
-        self.log_step(train_batch, output, metrics, state)
+        self.log_step(eqx.combine(model_arr, model_static), train_batch, output, metrics, state)
 
         state = self.on_step_end(state)
 
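The logging hooks now receive the recombined model as their first argument, so overrides can inspect parameters directly instead of stashing the model elsewhere. A hypothetical override sketch (the class is a stand-in; in practice this method would live on a task that mixes in TrainMixin, and self.logger.log_scalar is the same logger call used above):

    import equinox as eqx
    import jax
    from jaxtyping import PyTree


    class MyTask:  # stand-in for a task class built on TrainMixin
        def log_valid_step(self, model: PyTree, batch, output, metrics, state) -> None:
            # Count parameters in the freshly combined model passed by log_step.
            leaves = jax.tree_util.tree_leaves(eqx.filter(model, eqx.is_array))
            self.logger.log_scalar("num_params", sum(int(p.size) for p in leaves), namespace="model")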
{xax-0.1.11.dist-info → xax-0.1.12.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-xax/__init__.py,sha256=
+xax/__init__.py,sha256=7vdTYO7jAJdDxKZURlFxc3Y5kr5mVQcTQjeh_sYjD6I,13834
 xax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/requirements-dev.txt,sha256=qkscNkFzWd1S5fump-AKH53rR65v2x5FmboFdy_kKvs,128
 xax/requirements.txt,sha256=9LAEZ5c5gqRSARRVA6xJsVTa4MebPZuC4yOkkwkZJFw,297
@@ -10,11 +10,11 @@ xax/nn/embeddings.py,sha256=bQGxBFxkLwi2MQLkRfGaHPH5P_KKB21HdI7VNWTKIOQ,11847
 xax/nn/equinox.py,sha256=5fdOKRXqAVZPsV-aEez3i1wamr_oBYnG74GP1jEthjM,4843
 xax/nn/export.py,sha256=7Yemw3T33QGEP8RkmTkpu6tRVOhut2RUJmttNFfCgFw,5537
 xax/nn/functions.py,sha256=CI_OmspaQwN9nl4hwefIU3_I7m6gBZwJ9aGK1JGUgr0,2713
-xax/nn/geom.py,sha256=
+xax/nn/geom.py,sha256=Bj9Z4Y-uoNQuaA_eB_MyG7yImZLuOq8KCLUj1l3daoc,4545
 xax/nn/losses.py,sha256=Q_NVnm5n4UPBvp5nI_1aUptfXnqFYoUeFwySiyvopHg,272
 xax/nn/norm.py,sha256=WgZ3QCrUnf-YecwhEtVPcr99fKK3ECl_UeiAs2uv7oo,564
 xax/nn/parallel.py,sha256=fnTiT7MsG7eQrJvqwjIz2Ifo3P27TuxIJzmpGYSa_dQ,4608
-xax/nn/ssm.py,sha256=
+xax/nn/ssm.py,sha256=8dLAcQ1hBaMT-kkHvwGu_ecxJeTY32WeMYmd4T4KtxA,10745
 xax/task/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/task/base.py,sha256=E4l1yCrAkM2TVTbVYrmk6BoVHMkbD4IYsTT921XOyi0,7760
 xax/task/logger.py,sha256=1SZjVC6UCtZUoMPcpp3ckotL324QDeYDvHVhf5MHVqg,36271
@@ -41,7 +41,7 @@ xax/task/mixins/logger.py,sha256=6oXsJJyNUx6YT3q58FVXMZBUpMgjVkGre6BXFN20cVI,280
 xax/task/mixins/process.py,sha256=d1opVgvc6bOFXb7R58b07F4P5lbSZIzYaajtE0eBbpw,1477
 xax/task/mixins/runnable.py,sha256=IYIsLd2k09g-_y6o44EhJqT7E6BpsyEMmsyLSuzqjtc,1979
 xax/task/mixins/step_wrapper.py,sha256=-Yu5Nft2CRw1JvZt6J_94SM1vqX8fk08IDK95Pmd2ew,1648
-xax/task/mixins/train.py,sha256=
+xax/task/mixins/train.py,sha256=aIebtOIvERYofSyqzNGBpNYlNrXweqFUqM9dHiTx3Dc,26253
 xax/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/utils/debugging.py,sha256=9WlCrEqbq-SVXPEM4rhsLYERH97XNX7XSYLSI3sgKGk,1619
 xax/utils/experiments.py,sha256=5CUja1H_cx4dnVqTGQekOpIhqISwHtAgLxZ34GV7cwM,29229
@@ -58,8 +58,8 @@ xax/utils/data/collate.py,sha256=Rd9vMomr_S_zCa_Hi4dO-8ntzAfVwndIUtuXFA3iNcc,706
 xax/utils/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xax/utils/types/frozen_dict.py,sha256=ZCMGfSfr2_b2qZbq9ywPD0zej5tpVSId2JftXpwfB5k,4686
 xax/utils/types/hashable_array.py,sha256=l5iIcFmkYzfGeaZmcSoeFkthFASqM8xJYK3AXhZQYwc,992
-xax-0.1.11.dist-info/licenses/LICENSE,sha256=HCN2bImAzUOXldAZZI7JZ9PYq6OwMlDAP_PpX1HnuN0,1071
-xax-0.1.11.dist-info/METADATA,sha256=
-xax-0.1.11.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-xax-0.1.11.dist-info/top_level.txt,sha256=g4Au_r2XhvZ-lTybviH-Fh9g0zF4DAYHYxPue1-xbs8,4
-xax-0.1.11.dist-info/RECORD,,
+xax-0.1.12.dist-info/licenses/LICENSE,sha256=HCN2bImAzUOXldAZZI7JZ9PYq6OwMlDAP_PpX1HnuN0,1071
+xax-0.1.12.dist-info/METADATA,sha256=hLRAX5__7QjBgjzhxbRftGvEsNrt8IAdgd22dMtHu_Y,1878
+xax-0.1.12.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+xax-0.1.12.dist-info/top_level.txt,sha256=g4Au_r2XhvZ-lTybviH-Fh9g0zF4DAYHYxPue1-xbs8,4
+xax-0.1.12.dist-info/RECORD,,
{xax-0.1.11.dist-info → xax-0.1.12.dist-info}/WHEEL
File without changes
{xax-0.1.11.dist-info → xax-0.1.12.dist-info}/licenses/LICENSE
File without changes
{xax-0.1.11.dist-info → xax-0.1.12.dist-info}/top_level.txt
File without changes