tsagentkit-timesfm 1.0.0 (tsagentkit_timesfm-1.0.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- timesfm/__init__.py +29 -0
- timesfm/configs.py +105 -0
- timesfm/flax/__init__.py +13 -0
- timesfm/flax/dense.py +110 -0
- timesfm/flax/normalization.py +71 -0
- timesfm/flax/transformer.py +356 -0
- timesfm/flax/util.py +107 -0
- timesfm/timesfm_2p5/timesfm_2p5_base.py +422 -0
- timesfm/timesfm_2p5/timesfm_2p5_flax.py +602 -0
- timesfm/timesfm_2p5/timesfm_2p5_torch.py +472 -0
- timesfm/torch/__init__.py +13 -0
- timesfm/torch/dense.py +94 -0
- timesfm/torch/normalization.py +39 -0
- timesfm/torch/transformer.py +370 -0
- timesfm/torch/util.py +94 -0
- timesfm/utils/xreg_lib.py +520 -0
- tsagentkit_timesfm-1.0.0.dist-info/METADATA +152 -0
- tsagentkit_timesfm-1.0.0.dist-info/RECORD +21 -0
- tsagentkit_timesfm-1.0.0.dist-info/WHEEL +5 -0
- tsagentkit_timesfm-1.0.0.dist-info/licenses/LICENSE +202 -0
- tsagentkit_timesfm-1.0.0.dist-info/top_level.txt +1 -0
timesfm/__init__.py
ADDED
@@ -0,0 +1,29 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""TimesFM API."""

from .configs import ForecastConfig

try:
  from .timesfm_2p5 import timesfm_2p5_torch

  TimesFM_2p5_200M_torch = timesfm_2p5_torch.TimesFM_2p5_200M_torch
except ImportError:
  pass

try:
  from .timesfm_2p5 import timesfm_2p5_flax

  TimesFM_2p5_200M_flax = timesfm_2p5_flax.TimesFM_2p5_200M_flax
except ImportError:
  pass
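The __init__.py above only re-exports ForecastConfig and, when the optional backends import cleanly, the TimesFM_2p5_200M_torch / TimesFM_2p5_200M_flax model classes. A minimal usage sketch follows; the from_pretrained, compile, and forecast entry points live in timesfm/timesfm_2p5/timesfm_2p5_torch.py, whose body is not shown in this section, so treat their exact signatures and the checkpoint id as assumptions taken from the upstream TimesFM 2.5 documentation.

# Usage sketch only; not part of the wheel. Signatures below are assumptions.
import numpy as np
import timesfm

model = timesfm.TimesFM_2p5_200M_torch.from_pretrained(
    "google/timesfm-2.5-200m-pytorch"  # assumed checkpoint id
)
model.compile(
    timesfm.ForecastConfig(
        max_context=1024,
        max_horizon=256,
        normalize_inputs=True,
        fix_quantile_crossing=True,
    )
)
point_forecast, quantile_forecast = model.forecast(
    horizon=128,
    inputs=[np.sin(np.arange(512) / 10.0)],  # one toy series
)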
timesfm/configs.py
ADDED
@@ -0,0 +1,105 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Abstract configs for TimesFM layers."""

import dataclasses
from typing import Literal


@dataclasses.dataclass(frozen=True)
class ForecastConfig:
  """Options for forecasting.

  Attributes:
    max_context: The maximum context length. This is used by the compiled
      decode function at inference time during batched inference. Any input
      time series with length less than max_context will be padded with zeros,
      and with length greater than max_context will be truncated.
    max_horizon: The maximum horizon length. This is used by the compiled
      decode function at inference time during batched inference. The compiled
      cached decoding function will by default forecast up to max_horizon.
    normalize_inputs: Whether to normalize the inputs. This is useful when the
      raw inputs are of extremely large or small magnitudes which may result in
      numerical issues.
    window_size: The window size for decomposed forecasting.
      TODO(siriuz42): implement it.
    per_core_batch_size: The batch size per core. Used at inference time during
      batched inference when multiple GPU / TPU devices are used.
    use_continuous_quantile_head: Whether to use a separate continuous quantile
      head to avoid quantile collapsing.
    force_flip_invariance: Whether to force flip invariance. TimesFM guarantees
      that TimesFM(aX + b) = a * TimesFM(X) + b for a >= 0 by default. This
      flag extends it to a < 0 as well.
    infer_is_positive: Whether to guarantee nonnegativity of the output if the
      input is nonnegative.
    fix_quantile_crossing: Whether to fix quantile crossing.
    return_backcast: Whether to return the backcast.
  """

  max_context: int = 0
  max_horizon: int = 0
  normalize_inputs: bool = False
  window_size: int = 0
  per_core_batch_size: int = 1
  use_continuous_quantile_head: bool = False
  force_flip_invariance: bool = True
  infer_is_positive: bool = True
  fix_quantile_crossing: bool = False
  return_backcast: bool = False


@dataclasses.dataclass(frozen=True)
class ResidualBlockConfig:
  """Framework-agnostic config for a residual block."""

  input_dims: int
  hidden_dims: int
  output_dims: int
  use_bias: bool
  activation: Literal["relu", "swish", "none"]


@dataclasses.dataclass(frozen=True)
class RandomFourierFeaturesConfig:
  """Framework-agnostic config for random Fourier features."""

  input_dims: int
  output_dims: int
  projection_stddev: float
  use_bias: bool


@dataclasses.dataclass(frozen=True)
class TransformerConfig:
  """Framework-agnostic config for a transformer."""

  model_dims: int
  hidden_dims: int
  num_heads: int
  attention_norm: Literal["rms"]
  feedforward_norm: Literal["rms"]
  qk_norm: Literal["rms", "none"]
  use_bias: bool
  use_rotary_position_embeddings: bool
  ff_activation: Literal["relu", "swish", "none"]
  fuse_qkv: bool


@dataclasses.dataclass(frozen=True)
class StackedTransformersConfig:
  """Framework-agnostic config for stacked transformers."""

  num_layers: int
  transformer: TransformerConfig
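For illustration, here is a ForecastConfig built from the fields defined above; the values are arbitrary and only show how the options compose.

# Illustrative only: construct a frozen ForecastConfig.
from timesfm.configs import ForecastConfig

cfg = ForecastConfig(
    max_context=1024,       # longer inputs are truncated, shorter ones zero-padded
    max_horizon=256,        # the compiled decode forecasts up to this many steps
    normalize_inputs=True,  # rescale raw inputs to avoid numerical issues
    fix_quantile_crossing=True,
)
# The dataclass is frozen, so variants are derived with dataclasses.replace(cfg, ...).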
timesfm/flax/__init__.py
ADDED
@@ -0,0 +1,13 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
timesfm/flax/dense.py
ADDED
@@ -0,0 +1,110 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Dense layers for TimesFM."""

from flax import nnx
import jax
import jax.numpy as jnp
import jaxtyping

from .. import configs

Array = jaxtyping.Array
Bool = jaxtyping.Bool
Float = jaxtyping.Float
Integer = jaxtyping.Integer
Num = jaxtyping.Num

ResidualBlockConfig = configs.ResidualBlockConfig
RandomFourierFeaturesConfig = configs.RandomFourierFeaturesConfig


class ResidualBlock(nnx.Module):
  """Residual block with two linear layers and a linear residual connection."""

  def __init__(self, config: ResidualBlockConfig, *, rngs=nnx.Rngs(42)):
    self.config = config
    self.hidden_layer = nnx.Linear(
        in_features=config.input_dims,
        out_features=config.hidden_dims,
        use_bias=config.use_bias,
        rngs=rngs,
    )
    self.output_layer = nnx.Linear(
        in_features=config.hidden_dims,
        out_features=config.output_dims,
        use_bias=config.use_bias,
        rngs=rngs,
    )
    self.residual_layer = nnx.Linear(
        in_features=config.input_dims,
        out_features=config.output_dims,
        use_bias=config.use_bias,
        rngs=rngs,
    )
    if config.activation == "relu":
      self.activation = jax.nn.relu
    elif config.activation == "swish":
      self.activation = jax.nn.swish
    elif config.activation == "none":
      self.activation = lambda x: x
    else:
      raise ValueError(f"Activation: {config.activation} not supported.")

  def __call__(self, x: Float[Array, "b ... i"]) -> Float[Array, "b ... o"]:
    return self.output_layer(
        self.activation(self.hidden_layer(x))
    ) + self.residual_layer(x)


class RandomFourierFeatures(nnx.Module):
  """Random Fourier features layer."""

  __data__ = ("phase_shifts",)

  def __init__(self, config: RandomFourierFeaturesConfig, *, rngs=nnx.Rngs(42)):
    self.config = config

    if config.output_dims % 4 != 0:
      raise ValueError(
          f"Output dims must be a multiple of 4: {config.output_dims} % 4 != 0."
      )
    num_projected_features = config.output_dims // 4

    self.phase_shifts = nnx.Param(jnp.zeros(shape=(2, num_projected_features)))
    self.projection_layer = nnx.Linear(
        in_features=config.input_dims,
        out_features=num_projected_features,
        use_bias=config.use_bias,
        rngs=rngs,
    )
    self.residual_layer = nnx.Linear(
        in_features=config.input_dims,
        out_features=config.output_dims,
        use_bias=config.use_bias,
        rngs=rngs,
    )

  def __call__(self, x: Float[Array, "b ... i"]) -> Float[Array, "b ... o"]:
    projected = self.projection_layer(x)
    cos_features = jnp.cos(projected)
    sin_features = jnp.sin(projected)
    sq_wave_1 = jnp.sign(jnp.sin(projected + self.phase_shifts[0, :]))
    sq_wave_2 = jnp.sign(jnp.sin(projected + self.phase_shifts[1, :]))
    fourier_features = jnp.concatenate(
        [cos_features, sin_features, sq_wave_1, sq_wave_2], axis=-1
    )
    residual = self.residual_layer(x)
    return fourier_features + residual
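A minimal sketch of driving ResidualBlock with a ResidualBlockConfig, assuming a Flax release that ships the nnx API plus jaxtyping; the dimensions are arbitrary.

# Sketch only: a ResidualBlock mapping 32 -> 16 features with a 64-wide hidden layer.
import jax.numpy as jnp

from timesfm.configs import ResidualBlockConfig
from timesfm.flax import dense

block = dense.ResidualBlock(
    ResidualBlockConfig(
        input_dims=32,
        hidden_dims=64,
        output_dims=16,
        use_bias=True,
        activation="swish",
    )
)
out = block(jnp.ones((2, 8, 32)))  # -> shape (2, 8, 16)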
timesfm/flax/normalization.py
ADDED
@@ -0,0 +1,71 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Normalization layers for TimesFM."""

from flax import nnx
import jax
import jax.numpy as jnp
import jaxtyping

Array = jaxtyping.Array
Bool = jaxtyping.Bool
Float = jaxtyping.Float
Integer = jaxtyping.Integer
Num = jaxtyping.Num


class RMSNorm(nnx.Module):
  """RMS normalization."""

  __data__ = ("scale",)

  def __init__(
      self,
      num_features: int,
      *,
      epsilon: float = 1e-6,
      rngs=nnx.Rngs(42),
  ):
    del rngs
    self.scale = nnx.Param(jnp.zeros(shape=(num_features,)))
    self.num_features = num_features
    self.epsilon = epsilon

  def __call__(self, inputs: Float[Array, "b ... d"]) -> Float[Array, "b ... d"]:
    var = jnp.mean(jnp.square(inputs), axis=-1, keepdims=True)
    normed_inputs = inputs * jax.lax.rsqrt(var + self.epsilon)
    normed_inputs *= self.scale
    return normed_inputs


class LayerNorm(nnx.Module):
  """Layer normalization replica of LayerNorm."""

  __data__ = ("scale", "bias")

  def __init__(self, num_features: int, *, epsilon: float = 1e-6, rngs=nnx.Rngs(42)):
    del rngs
    self.scale = nnx.Param(jnp.ones(shape=(num_features,)))
    self.bias = nnx.Param(jnp.zeros(shape=(num_features,)))
    self.num_features = num_features
    self.epsilon = epsilon

  def __call__(self, inputs: Float[Array, "b ... d"]) -> Float[Array, "b ... d"]:
    mean = jnp.mean(inputs, axis=-1, keepdims=True)
    var = jnp.mean(jnp.square(inputs - mean), axis=-1, keepdims=True)
    normed_inputs = (inputs - mean) * jax.lax.rsqrt(var + self.epsilon)
    normed_inputs *= self.scale
    normed_inputs += self.bias
    return normed_inputs
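A minimal sketch exercising the two norms above, under the same Flax nnx assumptions. Note that RMSNorm initializes its scale parameter to zeros, so a freshly constructed instance returns zeros until checkpoint weights overwrite the scale.

# Sketch only: apply both norms over the last feature axis.
import jax.numpy as jnp

from timesfm.flax import normalization

x = jnp.arange(12.0).reshape(2, 2, 3)
layer_norm = normalization.LayerNorm(num_features=3)
rms_norm = normalization.RMSNorm(num_features=3)
print(layer_norm(x).shape, rms_norm(x).shape)  # (2, 2, 3) (2, 2, 3)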
timesfm/flax/transformer.py
ADDED
@@ -0,0 +1,356 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Transformer layers for TimesFM."""

import functools
from typing import Callable

from flax import nnx
from flax.nnx.nn import linear
import jax
from jax import lax
import jax.numpy as jnp
import jaxtyping

from .. import configs
from . import normalization, util

Array = jaxtyping.Array
Bool = jaxtyping.Bool
Float = jaxtyping.Float
Integer = jaxtyping.Integer
Num = jaxtyping.Num
LayerNorm = normalization.LayerNorm
RMSNorm = normalization.RMSNorm
LinearGeneral = linear.LinearGeneral
TransformerConfig = configs.TransformerConfig
DecodeCache = util.DecodeCache


@functools.partial(
    jax.jit,
    static_argnames=("query_length", "kv_length"),
)
def make_attn_mask(
    query_length: int,
    num_all_masked_kv: Integer[Array, "b"],
    query_index_offset: Integer[Array, "b"] | None = None,
    kv_length: int = 0,
) -> Bool[Array, "b 1 q n"]:
  """Makes attention mask."""

  if kv_length == 0:
    kv_length = query_length

  q_index = jnp.arange(query_length)[None, None, :, None]
  if query_index_offset is not None:
    q_index += query_index_offset[:, None, None, None]
  kv_index = jnp.arange(kv_length)[None, None, None, :]
  return jnp.logical_and(
      q_index >= kv_index,
      kv_index >= num_all_masked_kv[:, None, None, None],
  )


class RotaryPositionalEmbedding(nnx.Module):
  """Rotary positional embedding."""

  def __init__(
      self,
      embedding_dims: int,
      min_timescale: int = 1,
      max_timescale: int = 10000,
  ):
    self.embedding_dims = embedding_dims
    self.min_timescale = min_timescale
    self.max_timescale = max_timescale

  def __call__(
      self,
      inputs: Float[Array, "b ... d"],
      position: Array | None = None,
  ):
    """Generates a JTensor of sinusoids with different frequencies."""
    if self.embedding_dims != inputs.shape[-1]:
      raise ValueError(
          "The embedding dims of the rotary position embedding "
          "must match the hidden dimension of the inputs."
      )
    half_embedding_dim = self.embedding_dims // 2
    fraction = 2 * jnp.arange(0, half_embedding_dim) / self.embedding_dims
    timescale = (
        self.min_timescale * (self.max_timescale / self.min_timescale) ** fraction
    )
    if position is None:
      seq_length = inputs.shape[1]
      position = jnp.arange(seq_length, dtype=jnp.float32)[None, :]
    if len(inputs.shape) == 4:
      position = position[..., None, None]
      timescale = timescale[None, None, None, :]
    elif len(inputs.shape) == 3:
      position = position[..., None]
      timescale = timescale[None, None, :]
    else:
      raise ValueError("Inputs must be of rank 3 or 4.")
    sinusoid_inp = position / timescale
    sin = jnp.sin(sinusoid_inp)
    cos = jnp.cos(sinusoid_inp)
    first_half, second_half = jnp.split(inputs, 2, axis=-1)
    first_part = first_half * cos - second_half * sin
    second_part = second_half * cos + first_half * sin
    first_part = first_part.astype(None)
    second_part = second_part.astype(None)
    return jnp.concatenate([first_part, second_part], axis=-1)


class PerDimScale(nnx.Module):
  """Per-dimension scaling."""

  __data__ = ("per_dim_scale",)

  def __init__(self, num_dims: int, *, rngs=nnx.Rngs(42)):
    del rngs
    self.num_dims = num_dims
    self.per_dim_scale = nnx.Param(jnp.zeros(shape=(num_dims,)))

  def __call__(self, x: Float[Array, "b ... d"]) -> Float[Array, "b ... d"]:
    return x * (
        1.442695041 / jnp.sqrt(self.num_dims) * jax.nn.softplus(self.per_dim_scale)
    )


class MultiHeadAttention(nnx.Module):
  """Multi-head attention."""

  def __init__(
      self,
      num_heads: int,
      in_features: int,
      *,
      use_per_dim_scale: bool = True,
      use_rotary_position_embeddings: bool = True,
      use_bias: bool = False,
      deterministic: bool | None = None,
      attention_fn: Callable[..., Array] = nnx.dot_product_attention,
      qk_norm: str = "rms",
      rngs=nnx.Rngs(42),
  ):
    self.num_heads = num_heads
    self.in_features = in_features
    self.qkv_features = in_features
    self.out_features = in_features
    self.in_kv_features = in_features
    self.deterministic = deterministic
    self.use_bias = use_bias
    self.attention_fn = attention_fn
    self.qk_norm = qk_norm

    if self.qkv_features % self.num_heads != 0:
      raise ValueError(
          f"Memory dimension ({self.qkv_features}) must be divisible by "
          f"'num_heads' heads ({self.num_heads})."
      )
    self.head_dim = self.qkv_features // self.num_heads

    linear_general = functools.partial(
        LinearGeneral,
        out_features=(self.num_heads, self.head_dim),
        use_bias=self.use_bias,
    )
    # project inputs_q to multi-headed q/k/v
    # dimensions are then [batch..., length, n_heads, n_features_per_head]
    self.query = linear_general(self.in_features, rngs=rngs)
    self.key = linear_general(self.in_kv_features, rngs=rngs)
    self.value = linear_general(self.in_kv_features, rngs=rngs)

    if self.qk_norm == "rms":
      self.query_ln = RMSNorm(self.head_dim)
      self.key_ln = RMSNorm(self.head_dim)
    else:
      self.query_ln = None
      self.key_ln = None

    self.out = LinearGeneral(
        in_features=(self.num_heads, self.head_dim),
        out_features=self.out_features,
        axis=(-2, -1),
        use_bias=self.use_bias,
        rngs=rngs,
    )

    self.use_per_dim_scale = use_per_dim_scale
    self.use_rotary_position_embeddings = use_rotary_position_embeddings
    if self.use_rotary_position_embeddings:
      self.rotary_position_embedding = RotaryPositionalEmbedding(
          embedding_dims=self.head_dim,
      )
    else:
      self.rotary_position_embedding = None

    if use_per_dim_scale:
      self.per_dim_scale = PerDimScale(num_dims=self.head_dim, rngs=rngs)
    else:
      self.per_dim_scale = None

  def __call__(
      self,
      inputs_q: Array,
      *,
      decode_cache: DecodeCache | None = None,
      patch_mask: Array | None = None,
      deterministic: bool | None = None,
      sow_weights: bool = False,
  ) -> tuple[Float[Array, "b ... o"], DecodeCache | None]:
    """Applies multi-head dot product attention on the input data."""
    _, n_patches, input_in_features = inputs_q.shape
    if input_in_features != self.in_features:
      raise ValueError(
          f"Incompatible input dimension, got {input_in_features} "
          f"but module expects {self.in_features}."
      )
    if patch_mask is None:
      patch_mask = jnp.zeros(inputs_q.shape[:-1], dtype=jnp.bool)

    # For query: rope -> ln -> per_dim_scale
    query = self.query(inputs_q)
    key = self.key(inputs_q)
    value = self.value(inputs_q)

    if decode_cache is None:
      num_masked = jnp.sum(patch_mask.astype(jnp.int32), axis=-1, keepdims=False)
      next_index = jnp.zeros_like(num_masked, dtype=jnp.int32)
    else:
      num_masked = (
          jnp.sum(patch_mask.astype(jnp.int32), axis=-1, keepdims=False)
          + decode_cache.num_masked
      )
      next_index = decode_cache.next_index

    if self.use_rotary_position_embeddings:
      position = (
          jnp.arange(n_patches, dtype=jnp.int32)[None, :]
          + next_index[:, None]
          - num_masked[:, None]
      )
      query = self.rotary_position_embedding(query, position)
      key = self.rotary_position_embedding(key, position)
    if self.query_ln is not None:
      query = self.query_ln(query)
    if self.key_ln is not None:
      key = self.key_ln(key)
    if self.use_per_dim_scale:
      query = self.per_dim_scale(query)

    if decode_cache is not None:
      # Cached decoding.
      _, decode_cache_size, _, _ = decode_cache.value.shape
      zero = jnp.array(0, dtype=lax.dtype(next_index.dtype))
      start_indices = (zero, next_index[0], zero, zero)
      key = lax.dynamic_update_slice(decode_cache.key, key, start_indices)
      value = lax.dynamic_update_slice(decode_cache.value, value, start_indices)
      decode_cache.key = key
      decode_cache.value = value
      decode_cache.next_index = next_index + n_patches
      decode_cache.num_masked = num_masked
      attn_mask = make_attn_mask(
          query_length=n_patches,
          num_all_masked_kv=num_masked,
          query_index_offset=next_index,
          kv_length=decode_cache_size,
      )
    else:
      # Training
      attn_mask = make_attn_mask(query_length=n_patches, num_all_masked_kv=num_masked)

    # apply attention
    x = self.attention_fn(
        query * jnp.sqrt(self.head_dim),
        key,
        value,
        mask=attn_mask,
        deterministic=deterministic,
        module=self if sow_weights else None,
    )
    # back to the original inputs dimensions
    out = self.out(x)
    return out, decode_cache


class Transformer(nnx.Module):
  """Classic Transformer used in TimesFM."""

  def __init__(self, config: TransformerConfig, *, rngs=nnx.Rngs(42)):
    self.config = config

    if config.attention_norm == "rms":
      self.pre_attn_ln = RMSNorm(num_features=config.model_dims, rngs=rngs)
      self.post_attn_ln = RMSNorm(num_features=config.model_dims, rngs=rngs)
    else:
      raise ValueError(f"Layer norm: {config.attention_norm} not supported.")

    self.attn = MultiHeadAttention(
        num_heads=config.num_heads,
        in_features=config.model_dims,
        use_per_dim_scale=True,
        use_rotary_position_embeddings=config.use_rotary_position_embeddings,
        qk_norm=config.qk_norm,
        rngs=rngs,
    )

    if config.feedforward_norm == "rms":
      self.pre_ff_ln = RMSNorm(num_features=config.model_dims, rngs=rngs)
      self.post_ff_ln = RMSNorm(num_features=config.model_dims, rngs=rngs)
    else:
      raise ValueError(f"Layer norm: {config.feedforward_norm} not supported.")
    self.ff0 = nnx.Linear(
        in_features=config.model_dims,
        out_features=config.hidden_dims,
        use_bias=config.use_bias,
        rngs=rngs,
    )
    self.ff1 = nnx.Linear(
        in_features=config.hidden_dims,
        out_features=config.model_dims,
        use_bias=config.use_bias,
        rngs=rngs,
    )
    if config.ff_activation == "relu":
      self.activation = jax.nn.relu
    elif config.ff_activation == "swish":
      self.activation = jax.nn.swish
    elif config.ff_activation == "none":
      self.activation = lambda x: x
    else:
      raise ValueError(f"Activation: {config.ff_activation} not supported.")

  def __call__(
      self,
      input_embeddings: Float[Array, "b n d"],
      patch_mask: Bool[Array, "b n"],
      decode_cache: DecodeCache | None = None,
  ) -> tuple[Float[Array, "b n d"], DecodeCache | None]:
    attn_output, decode_cache = self.attn(
        inputs_q=self.pre_attn_ln(input_embeddings),
        decode_cache=decode_cache,
        patch_mask=patch_mask,
        sow_weights=False,
        deterministic=True,
    )
    attn_output = self.post_attn_ln(attn_output) + input_embeddings
    output_embeddings = (
        self.post_ff_ln(self.ff1(self.activation(self.ff0(self.pre_ff_ln(attn_output)))))
        + attn_output
    )
    return output_embeddings, decode_cache
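To tie the pieces together, a minimal sketch of one Transformer block driven by the TransformerConfig from timesfm/configs.py, again assuming a Flax release with the nnx API. The dimensions are arbitrary and far smaller than the 200M checkpoint; patch_mask marks padded patches per batch element, which make_attn_mask combines with the causal constraint.

# Sketch only: a single pre/post-RMSNorm Transformer block with rotary positions.
import jax.numpy as jnp

from timesfm import configs
from timesfm.flax import transformer

cfg = configs.TransformerConfig(
    model_dims=64,
    hidden_dims=128,
    num_heads=4,
    attention_norm="rms",
    feedforward_norm="rms",
    qk_norm="rms",
    use_bias=False,
    use_rotary_position_embeddings=True,
    ff_activation="swish",
    fuse_qkv=False,
)
layer = transformer.Transformer(cfg)
embeddings = jnp.ones((2, 10, 64))                # [batch, patches, model_dims]
patch_mask = jnp.zeros((2, 10), dtype=jnp.bool_)  # True marks masked (padded) patches
out, _ = layer(embeddings, patch_mask)            # out: (2, 10, 64), no decode cache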