lalamo 0.3.4__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lalamo/__init__.py +20 -5
- lalamo/data/__init__.py +8 -0
- lalamo/data/huggingface_message.py +38 -0
- lalamo/data/lalamo_completions.py +43 -0
- lalamo/data/utils.py +8 -0
- lalamo/language_model.py +152 -69
- lalamo/main.py +273 -45
- lalamo/message_processor.py +11 -1
- lalamo/model_import/common.py +10 -6
- lalamo/model_import/decoder_configs/__init__.py +3 -0
- lalamo/model_import/decoder_configs/executorch.py +12 -6
- lalamo/model_import/decoder_configs/huggingface/__init__.py +2 -0
- lalamo/model_import/decoder_configs/huggingface/common.py +1 -3
- lalamo/model_import/decoder_configs/huggingface/gemma2.py +11 -5
- lalamo/model_import/decoder_configs/huggingface/gemma3.py +14 -5
- lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +195 -0
- lalamo/model_import/decoder_configs/huggingface/llama.py +38 -8
- lalamo/model_import/decoder_configs/huggingface/mistral.py +12 -6
- lalamo/model_import/decoder_configs/huggingface/qwen2.py +12 -6
- lalamo/model_import/decoder_configs/huggingface/qwen3.py +12 -6
- lalamo/model_import/huggingface_tokenizer_config.py +1 -3
- lalamo/model_import/loaders/executorch.py +10 -9
- lalamo/model_import/loaders/huggingface.py +104 -9
- lalamo/model_import/loaders/utils.py +92 -0
- lalamo/model_import/model_specs/__init__.py +4 -1
- lalamo/model_import/model_specs/common.py +15 -12
- lalamo/model_import/model_specs/gpt_oss.py +21 -0
- lalamo/modules/__init__.py +35 -7
- lalamo/modules/activations.py +24 -14
- lalamo/modules/attention.py +73 -20
- lalamo/modules/common.py +8 -57
- lalamo/modules/decoder.py +48 -34
- lalamo/modules/decoder_layer.py +57 -43
- lalamo/modules/embedding.py +13 -19
- lalamo/modules/kv_cache.py +53 -16
- lalamo/modules/linear.py +260 -79
- lalamo/modules/mlp.py +395 -23
- lalamo/modules/normalization.py +2 -3
- lalamo/modules/rope.py +32 -21
- lalamo/modules/utils.py +10 -0
- lalamo/speculator/__init__.py +11 -0
- lalamo/speculator/common.py +22 -0
- lalamo/speculator/inference.py +75 -0
- lalamo/speculator/ngram.py +154 -0
- lalamo/speculator/utils.py +52 -0
- lalamo/utils.py +27 -0
- {lalamo-0.3.4.dist-info → lalamo-0.4.1.dist-info}/METADATA +11 -4
- lalamo-0.4.1.dist-info/RECORD +71 -0
- lalamo-0.3.4.dist-info/RECORD +0 -59
- {lalamo-0.3.4.dist-info → lalamo-0.4.1.dist-info}/WHEEL +0 -0
- {lalamo-0.3.4.dist-info → lalamo-0.4.1.dist-info}/entry_points.txt +0 -0
- {lalamo-0.3.4.dist-info → lalamo-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {lalamo-0.3.4.dist-info → lalamo-0.4.1.dist-info}/top_level.txt +0 -0
lalamo/modules/mlp.py
CHANGED
@@ -1,60 +1,170 @@
+import math
+from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from dataclasses import dataclass, replace
+from functools import partial
 from typing import Self

 import equinox as eqx
 import jax
-
+import jax.numpy as jnp
+from einops import rearrange
+from jax import vmap
+from jaxtyping import Array, Bool, DTypeLike, Float, Int, PRNGKeyArray

 from lalamo.common import ParameterTree
+from lalamo.modules.utils import vmap_twice

 from .activations import Activation
-from .common import LalamoModule,
+from .common import DummyUnionMember, ForwardPassMode, LalamoModule, register_config_union
 from .linear import LinearBase, LinearConfig

-__all__ = [
+__all__ = [
+    "DenseMLP",
+    "DenseMLPConfig",
+    "MLPBase",
+    "MLPConfig",
+    "MLPForwardPassConfig",
+    "MixtureOfExperts",
+    "MixtureOfExpertsConfig",
+    "RoutingFunction",
+    "SoftmaxRouting",
+]
+
+
+_SENTINEL = 2**31 - 1
+
+
+@dataclass(frozen=True)
+class MLPForwardPassConfig:
+    moe_chunk_size_ratio: float = 0.2
+
+
+class MLPBase[ConfigT: MLPConfig](LalamoModule[ConfigT]):
+    @property
+    @abstractmethod
+    def activation_precision(self) -> DTypeLike: ...
+
+    @property
+    @abstractmethod
+    def model_dim(self) -> int: ...
+
+    @property
+    @abstractmethod
+    def hidden_dim(self) -> int: ...
+
+    @abstractmethod
+    def __call__(
+        self,
+        inputs: Float[Array, "batch suffix_tokens channels"],
+        lengths_without_padding: Int[Array, " batch"] | None = None,
+        forward_pass_mode: ForwardPassMode = ForwardPassMode.MULTI_TOKEN,
+        forward_pass_config: MLPForwardPassConfig | None = None,
+    ) -> Float[Array, "batch suffix_tokens channels"]: ...
+
+
+@dataclass(frozen=True)
+class MLPConfigBase(ABC):
+    @abstractmethod
+    def random_init(self, model_dim: int, hidden_dim: int, *, key: PRNGKeyArray) -> MLPBase: ...
+
+    @abstractmethod
+    def empty(self, model_dim: int, hidden_dim: int) -> MLPBase: ...


 @dataclass(frozen=True)
-class
+class DenseMLPConfig(MLPConfigBase):
     linear_config: LinearConfig
     activation: Activation
+    has_up_biases: bool
+    has_down_biases: bool
+    gate_clipping: tuple[float | None, float | None] | None
+    up_clipping: tuple[float | None, float | None] | None

-    def random_init(self, model_dim: int, hidden_dim: int, *, key: PRNGKeyArray) -> "
+    def random_init(self, model_dim: int, hidden_dim: int, *, key: PRNGKeyArray) -> "DenseMLP":
         up_key, down_key = jax.random.split(key)
-        return
+        return DenseMLP(
             self,
             up_projection=self.linear_config.random_init(
                 model_dim,
                 (hidden_dim, hidden_dim),
-                has_biases=
+                has_biases=self.has_up_biases,
                 key=up_key,
             ),
             down_projection=self.linear_config.random_init(
                 hidden_dim,
                 (model_dim,),
-                has_biases=
+                has_biases=self.has_down_biases,
                 key=down_key,
             ),
         )

-    def empty(self, model_dim: int, hidden_dim: int) -> "
-        return
+    def empty(self, model_dim: int, hidden_dim: int) -> "DenseMLP":
+        return DenseMLP(
             self,
             up_projection=self.linear_config.empty(
                 model_dim,
                 (hidden_dim, hidden_dim),
-                has_biases=
+                has_biases=self.has_up_biases,
             ),
             down_projection=self.linear_config.empty(
                 hidden_dim,
                 (model_dim,),
-                has_biases=
+                has_biases=self.has_down_biases,
             ),
         )

+    def random_init_mixture(
+        self,
+        mixture_size: int,
+        model_dim: int,
+        hidden_dim: int,
+        *,
+        key: PRNGKeyArray,
+    ) -> "DenseMLP":
+        up_key, down_key = jax.random.split(key)
+        return DenseMLP(
+            self,
+            up_projection=self.linear_config.random_init_mixture(
+                mixture_size,
+                model_dim,
+                (hidden_dim, hidden_dim),
+                has_biases=self.has_up_biases,
+                key=up_key,
+            ),
+            down_projection=self.linear_config.random_init_mixture(
+                mixture_size,
+                hidden_dim,
+                (model_dim,),
+                has_biases=self.has_down_biases,
+                key=down_key,
+            ),
+        )

-
+    def empty_mixture(
+        self,
+        mixture_size: int,
+        model_dim: int,
+        hidden_dim: int,
+    ) -> "DenseMLP":
+        return DenseMLP(
+            self,
+            up_projection=self.linear_config.empty_mixture(
+                mixture_size,
+                model_dim,
+                (hidden_dim, hidden_dim),
+                has_biases=self.has_up_biases,
+            ),
+            down_projection=self.linear_config.empty_mixture(
+                mixture_size,
+                hidden_dim,
+                (model_dim,),
+                has_biases=self.has_down_biases,
+            ),
+        )
+
+
+class DenseMLP(MLPBase[DenseMLPConfig]):
     up_projection: LinearBase
     down_projection: LinearBase

@@ -70,6 +180,10 @@ class MLP(LalamoModule[MLPConfig]):
     def hidden_dim(self) -> int:
         return self.down_projection.input_dim

+    @property
+    def mixture_size(self) -> int | None:
+        return self.up_projection.mixture_size
+
     def __post_init__(self) -> None:
         up_output_dim, gate_output_dim = self.up_projection.output_dims
         if up_output_dim != gate_output_dim:
@@ -78,35 +192,293 @@ class MLP(LalamoModule[MLPConfig]):
                 f" the gate output dimension {gate_output_dim}",
             )
         (down_output_dim,) = self.down_projection.output_dims
-        if self.up_projection.input_dim != down_output_dim:
+        if (self.up_projection.input_dim, up_output_dim) != (down_output_dim, self.down_projection.input_dim):
             raise ValueError(
-                f"Down projection
-                f" the up projection output
+                f"Down projection dimensions {self.down_projection.input_dim, down_output_dim} do not match"
+                f" the up projection output dimensions {self.up_projection.input_dim, up_output_dim}",
             )

     @eqx.filter_jit
-    def __call__(
+    def __call__(
+        self,
+        inputs: Float[Array, "batch suffix_tokens channels"],
+        lengths_without_padding: Int[Array, " batch"] | None = None,  # noqa: ARG002
+        forward_pass_mode: ForwardPassMode = ForwardPassMode.MULTI_TOKEN,  # noqa: ARG002
+        forward_pass_config: MLPForwardPassConfig | None = None,  # noqa: ARG002
+    ) -> Float[Array, "batch suffix_tokens channels"]:
+        return vmap_twice(self.call_unbatched)(inputs)
+
+    @eqx.filter_jit
+    def call_unbatched(self, inputs: Float[Array, " channels"]) -> Float[Array, " channels"]:
+        if self.mixture_size is not None:
+            raise ValueError(
+                "Mixtures of linear layers cannot be called directly."
+                "They are intended to be used with methods eqx.filter_vmap or lax.scan instead.",
+            )
         up_proj, gate = self.up_projection(inputs)
+        if self.config.gate_clipping:
+            gate = jnp.clip(gate, *self.config.gate_clipping)
+        if self.config.up_clipping:
+            up_proj = jnp.clip(up_proj, *self.config.up_clipping)
         gate = self.config.activation(gate)
         (result,) = self.down_projection(up_proj * gate)
         return result

-    def export_weights(self
+    def export_weights(self) -> ParameterTree:
         return {
-            "up_projection": self.up_projection.export_weights(
-            "down_projection": self.down_projection.export_weights(
+            "up_projection": self.up_projection.export_weights(),
+            "down_projection": self.down_projection.export_weights(),
         }

     def import_weights(
         self,
         weights: ParameterTree[Array],
-        weight_layout: WeightLayout = WeightLayout.AUTO,
     ) -> Self:
         assert isinstance(weights, Mapping)
         assert isinstance(weights["up_projection"], Mapping)
         assert isinstance(weights["down_projection"], Mapping)
         return replace(
             self,
-            up_projection=self.up_projection.import_weights(weights["up_projection"]
-            down_projection=self.down_projection.import_weights(weights["down_projection"]
+            up_projection=self.up_projection.import_weights(weights["up_projection"]),
+            down_projection=self.down_projection.import_weights(weights["down_projection"]),
+        )
+
+
+class RoutingMap(eqx.Module):
+    expert_mask: Bool[Array, "*batch_tokens experts"]
+    expert_weights: Float[Array, "*batch_tokens experts"]
+
+
+@dataclass(frozen=True)
+class RoutingFunctionBase(ABC):
+    def __call__(self, logits: Float[Array, "batch_tokens experts"], num_active: int) -> RoutingMap:
+        return vmap(partial(self.call_unbatched, num_active=num_active))(logits)
+
+    @abstractmethod
+    def call_unbatched(self, logits: Float[Array, " experts"], num_active: int) -> RoutingMap: ...
+
+
+@dataclass(frozen=True)
+class SoftmaxRouting(RoutingFunctionBase):
+    def call_unbatched(self, logits: Float[Array, " experts"], num_active: int) -> RoutingMap:
+        active_logits, active_indices = jax.lax.top_k(logits, num_active)
+        active_weights = jax.nn.softmax(active_logits)
+        mask = jnp.zeros_like(logits, dtype=bool)
+        mask = mask.at[active_indices].set(True)
+        expert_weights = jnp.zeros_like(logits)
+        expert_weights = expert_weights.at[active_indices].set(active_weights)
+        return RoutingMap(expert_mask=mask, expert_weights=expert_weights)
+
+
+RoutingFunction = SoftmaxRouting | DummyUnionMember
+
+
+register_config_union(RoutingFunction)
+
+
+@dataclass(frozen=True)
+class MixtureOfExpertsConfig(ABC):
+    mixture_size: int
+    num_experts_per_token: int
+    routing_function: RoutingFunction
+
+    router_config: LinearConfig
+    router_has_biases: bool
+
+    expert_config: DenseMLPConfig
+
+    def random_init(self, model_dim: int, hidden_dim: int, *, key: PRNGKeyArray) -> "MixtureOfExperts":
+        experts_key, router_key = jax.random.split(key)
+        router = self.router_config.random_init(
+            model_dim,
+            (self.mixture_size,),
+            has_biases=self.router_has_biases,
+            key=router_key,
+        )
+        experts = self.expert_config.random_init_mixture(self.mixture_size, model_dim, hidden_dim, key=experts_key)
+        return MixtureOfExperts(self, router, experts)
+
+    def empty(self, model_dim: int, hidden_dim: int) -> "MixtureOfExperts":
+        router = self.router_config.empty(model_dim, (self.mixture_size,), has_biases=self.router_has_biases)
+        experts = self.expert_config.empty_mixture(self.mixture_size, model_dim, hidden_dim)
+        return MixtureOfExperts(self, router, experts)
+
+
+class MixtureOfExperts(MLPBase[MixtureOfExpertsConfig]):
+    router: LinearBase
+    experts: DenseMLP
+
+    @property
+    def mixture_size(self) -> int:
+        return self.config.mixture_size
+
+    @property
+    def num_experts_per_token(self) -> int:
+        return self.config.num_experts_per_token
+
+    @property
+    def activation_precision(self) -> DTypeLike:
+        return self.experts.activation_precision
+
+    @property
+    def model_dim(self) -> int:
+        return self.experts.model_dim
+
+    @property
+    def hidden_dim(self) -> int:
+        return self.experts.hidden_dim
+
+    def __post_init__(self) -> None:
+        if self.router.input_dim != self.experts.model_dim:
+            raise ValueError(
+                f"Router input dimension ({self.router.input_dim}) must match experts model_dim"
+                f" ({self.experts.model_dim}).",
+            )
+
+        (router_output_dim,) = self.router.output_dims
+        if router_output_dim != self.mixture_size:
+            raise ValueError(
+                f"Router output dimension ({router_output_dim}) must equal mixture_size ({self.mixture_size}).",
+            )
+
+        if self.experts.mixture_size != self.mixture_size:
+            raise ValueError(
+                f"Experts mixture_size ({self.experts.mixture_size}) does not match specified mixture_size"
+                f" ({self.mixture_size}).",
+            )
+
+    def __call__(
+        self,
+        inputs: Float[Array, "batch suffix_tokens channels"],
+        lengths_without_padding: Int[Array, " batch"] | None = None,
+        forward_pass_mode: ForwardPassMode = ForwardPassMode.MULTI_TOKEN,
+        forward_pass_config: MLPForwardPassConfig | None = None,
+    ) -> Float[Array, "batch suffix_tokens channels"]:
+        match forward_pass_mode:
+            case ForwardPassMode.MULTI_TOKEN:
+                return self.call_prefill_mode(inputs, lengths_without_padding, forward_pass_config)
+            case ForwardPassMode.SINGLE_TOKEN:
+                return self.call_decode_mode(inputs)
+
+    @eqx.filter_jit
+    def call_decode_mode(
+        self,
+        inputs: Float[Array, "batch suffix_tokens channels"],
+    ) -> Float[Array, "batch suffix_tokens channels"]:
+        def per_token(x: Float[Array, " channels"]) -> Float[Array, " channels"]:
+            (router_logits,) = self.router(x)
+            routing = self.config.routing_function.call_unbatched(
+                router_logits,
+                num_active=self.num_experts_per_token,
+            )
+            active_indices = jnp.flatnonzero(routing.expert_mask, size=self.num_experts_per_token)
+            active_weights = routing.expert_weights[active_indices]
+
+            def apply_one(idx: Int[Array, ""], w: Float[Array, ""]) -> Float[Array, " channels"]:
+                selected_expert = jax.tree_util.tree_map(
+                    lambda leaf: jax.lax.dynamic_index_in_dim(leaf, idx, axis=0, keepdims=False),
+                    self.experts,
+                )
+                return selected_expert.call_unbatched(x) * w
+
+            contributions = vmap(apply_one)(active_indices, active_weights)
+            return jnp.sum(contributions, axis=0)
+
+        return vmap_twice(per_token)(inputs)
+
+    @eqx.filter_jit
+    def call_prefill_mode(
+        self,
+        inputs: Float[Array, "batch suffix_tokens channels"],
+        lengths_without_padding: Int[Array, " batch"] | None = None,
+        forward_pass_config: MLPForwardPassConfig | None = None,
+    ) -> Float[Array, "batch suffix_tokens channels"]:
+        forward_pass_config = forward_pass_config or MLPForwardPassConfig()
+        batch_size, sequence_length, _ = inputs.shape
+        num_tokens = batch_size * sequence_length
+        if lengths_without_padding is None:
+            lengths_without_padding = jnp.ones(batch_size, dtype=jnp.int32) * sequence_length
+        padding_mask = jnp.arange(sequence_length)[None, :] < lengths_without_padding[:, None]
+
+        flattened_inputs = rearrange(inputs, "batch suffix_tokens channels -> (batch suffix_tokens) channels")
+        flattened_padding_mask = rearrange(padding_mask, "batch suffix_tokens -> (batch suffix_tokens)")
+
+        (router_logits,) = vmap(self.router)(flattened_inputs)
+        routing_map = self.config.routing_function(router_logits, self.num_experts_per_token)
+        token_mask = rearrange(
+            routing_map.expert_mask & flattened_padding_mask[:, None],
+            "tokens experts -> experts tokens",
+        )
+        expert_weights = rearrange(
+            routing_map.expert_weights,
+            "tokens experts -> experts tokens",
+        )
+        expert_weights = jnp.where(token_mask, expert_weights, 0.0)
+
+        chunk_size = math.ceil(num_tokens * forward_pass_config.moe_chunk_size_ratio)
+        num_padded_tokens = math.ceil(num_tokens / chunk_size) * chunk_size
+        token_indices = vmap(lambda m: jnp.flatnonzero(m, size=num_padded_tokens, fill_value=_SENTINEL))(token_mask)
+        chunked_token_indices = rearrange(
+            token_indices,
+            "experts (chunks chunk_tokens) -> chunks experts chunk_tokens",
+            chunk_tokens=chunk_size,
+        )
+
+        def loop_iteration(
+            accumulator: Float[Array, "tokens channels"],
+            token_indices_for_chunk: Int[Array, "experts chunk_tokens"],
+        ) -> tuple[Float[Array, "tokens channels"], None]:
+            def inner() -> Float[Array, "tokens channels"]:
+                weights_for_chunk = jnp.take_along_axis(
+                    expert_weights,
+                    token_indices_for_chunk,
+                    axis=1,
+                    mode="fill",
+                    fill_value=0.0,
+                )
+
+                def run_expert(
+                    expert: DenseMLP,
+                    indices: Int[Array, " tokens_per_chunk"],
+                    weights: Float[Array, " tokens_per_chunk"],
+                ) -> Float[Array, "tokens_per_chunk channels"]:
+                    inputs = flattened_inputs.at[indices].get(mode="fill", fill_value=0.0)
+                    return vmap(expert.call_unbatched)(inputs) * weights[:, None]
+
+                expert_outputs = vmap(run_expert)(self.experts, token_indices_for_chunk, weights_for_chunk)
+                return accumulator.at[token_indices_for_chunk].add(
+                    expert_outputs,
+                    mode="drop",
+                )
+
+            return jax.lax.cond(jnp.any(token_indices_for_chunk != _SENTINEL), inner, lambda: accumulator), None
+
+        result, _ = jax.lax.scan(loop_iteration, jnp.zeros_like(flattened_inputs), chunked_token_indices)
+        return rearrange(result, "(batch suffix_tokens) channels -> batch suffix_tokens channels", batch=batch_size)
+
+    def export_weights(
+        self,
+    ) -> ParameterTree[Array]:
+        return {
+            "router": self.router.export_weights(),
+            "experts": self.experts.export_weights(),
+        }
+
+    def import_weights(
+        self,
+        weights: ParameterTree[Array],
+    ) -> Self:
+        assert isinstance(weights, Mapping)
+        assert isinstance(weights["router"], Mapping)
+        assert isinstance(weights["experts"], Mapping)
+        return replace(
+            self,
+            router=self.router.import_weights(weights["router"]),
+            experts=self.experts.import_weights(weights["experts"]),
         )
+
+
+MLPConfig = DenseMLPConfig | MixtureOfExpertsConfig
+
+
+register_config_union(MLPConfig)
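The new SoftmaxRouting class implements standard top-k routing: it keeps the num_active highest router logits, softmax-normalizes only those, and scatters the results back into dense per-expert mask and weight vectors. Below is a minimal standalone sketch of that scheme; the expert count, logits, and helper name are illustrative only and not part of the package API.

# Hedged sketch of the top-k softmax routing shown in the diff above.
import jax
import jax.numpy as jnp

def topk_softmax_routing(logits: jnp.ndarray, num_active: int) -> tuple[jnp.ndarray, jnp.ndarray]:
    # Keep the num_active highest-scoring experts for this token.
    active_logits, active_indices = jax.lax.top_k(logits, num_active)
    # Normalize weights over the selected experts only.
    active_weights = jax.nn.softmax(active_logits)
    # Scatter back into dense per-expert mask and weight vectors.
    mask = jnp.zeros_like(logits, dtype=bool).at[active_indices].set(True)
    weights = jnp.zeros_like(logits).at[active_indices].set(active_weights)
    return mask, weights

logits = jnp.array([0.1, 2.0, -1.0, 0.5])  # one token, four experts (toy values)
mask, weights = topk_softmax_routing(logits, num_active=2)
# mask selects experts 1 and 3; weights are nonzero only there and sum to 1.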
lalamo/modules/normalization.py
CHANGED
@@ -10,7 +10,7 @@ from jaxtyping import Array, DTypeLike, Float

 from lalamo.common import ParameterTree, dummy_array

-from .common import LalamoModule
+from .common import LalamoModule

 __all__ = [
     "RMSNorm",
@@ -83,13 +83,12 @@ class RMSNorm(LalamoModule[RMSNormConfig]):
         result = normalized_x * adjusted_scales
         return result.astype(inputs.dtype)

-    def export_weights(self
+    def export_weights(self) -> ParameterTree:
         return {"scales": self.scales}

     def import_weights(
         self,
         weights: ParameterTree[Array],
-        weight_layout: WeightLayout = WeightLayout.AUTO,  # noqa: ARG002
     ) -> Self:
         assert isinstance(weights, Mapping)
         return replace(self, scales=weights["scales"])
lalamo/modules/rope.py
CHANGED
@@ -25,7 +25,7 @@ from jaxtyping import Array, DTypeLike, Float, Int

 from lalamo.common import ParameterTree

-from .common import LalamoModule,
+from .common import LalamoModule, register_config_union

 __all__ = [
     "LinearScalingRoPEConfig",
@@ -39,22 +39,25 @@ __all__ = [


 class PositionalEmbeddings(eqx.Module):
-    cosines: Float[Array, "tokens head_channels"]
-    sines: Float[Array, "tokens head_channels"]
+    cosines: Float[Array, "*batch tokens head_channels"]
+    sines: Float[Array, "*batch tokens head_channels"]

     @property
     def head_dim(self) -> int:
         return self.cosines.shape[-1]

-    def rotate_half(
+    def rotate_half(
+        self,
+        heads: Float[Array, "*batch tokens head_channels"],
+    ) -> Float[Array, "*batch tokens head_channels"]:
         x1 = heads[..., : self.head_dim // 2]
         x2 = heads[..., self.head_dim // 2 :]
         return jnp.concatenate((-x2, x1), axis=-1)

-    def apply(self, heads: Float[Array, "tokens head_channels"]) -> Float[Array, "tokens head_channels"]:
+    def apply(self, heads: Float[Array, "*batch tokens head_channels"]) -> Float[Array, "*batch tokens head_channels"]:
         return heads * self.cosines + self.rotate_half(heads) * self.sines

-    def export(self
+    def export(self) -> ParameterTree:
         return dict(
             cosines=self.cosines,
             sines=self.sines,
@@ -105,9 +108,9 @@ class RoPE(LalamoModule[RoPEConfigBase]):

     def __post_init__(self) -> None:
         num_tokens, _ = self.sines.shape
-        if num_tokens
+        if num_tokens > self.config.max_sequence_length:
             raise ValueError(
-                f"{num_tokens}
+                f"{num_tokens} exceeds the specified max sequence length {self.config.max_sequence_length}",
             )
         if self.cosines.dtype != self.config.precision:
             raise ValueError(
@@ -140,7 +143,7 @@ class RoPE(LalamoModule[RoPEConfigBase]):
             sines=self.sines[timesteps],
         )

-    def export_weights(self
+    def export_weights(self) -> ParameterTree[Array]:
         return {
             "cosines": self.cosines,
             "sines": self.sines,
@@ -149,7 +152,6 @@ class RoPE(LalamoModule[RoPEConfigBase]):
     def import_weights(
         self,
         weights: ParameterTree[Array],
-        weight_layout: WeightLayout = WeightLayout.AUTO,  # noqa: ARG002
     ) -> "RoPE":
         assert isinstance(weights, Mapping)
         return replace(self, cosines=weights["cosines"], sines=weights["sines"])
@@ -199,13 +201,15 @@ class LlamaRoPEConfig(RoPEConfigBase):
 @dataclass(frozen=True)
 class YARNRoPEConfig(RoPEConfigBase):
     scaling_factor: float
+    original_context_length: int
     beta_fast: float
     beta_slow: float
+    truncate: bool

     @classmethod
-    def _find_correction_dim(cls, num_rotations: float, dim: int, base: float,
+    def _find_correction_dim(cls, num_rotations: float, dim: int, base: float, original_context_length: int) -> float:
         """Inverse dimension formula to find the dimension based on the number of rotations"""
-        return (dim * math.log(
+        return (dim * math.log(original_context_length / (num_rotations * 2 * math.pi))) / (2 * math.log(base))

     @classmethod
     def _find_correction_range(
@@ -214,19 +218,25 @@ class YARNRoPEConfig(RoPEConfigBase):
         high_rot: float,
         dim: int,
         base: float,
-
-
+        original_context_length: int,
+        truncate: bool,
+    ) -> tuple[float, float]:
         """Find dimension range bounds based on rotations"""
-        low =
-        high =
-
+        low = cls._find_correction_dim(low_rot, dim, base, original_context_length)
+        high = cls._find_correction_dim(high_rot, dim, base, original_context_length)
+        if truncate:
+            low = math.floor(low)
+            high = math.ceil(high)
+        return max(low, 0.0), min(high, float(dim - 1))

     @classmethod
     def _linear_ramp_factor(cls, min_value: float, max_value: float, dim: int) -> Float[Array, " head_dim"]:
         if min_value == max_value:
             max_value += 0.001  # Prevent singularity

-
+        min_v = jnp.float32(min_value)
+        max_v = jnp.float32(max_value)
+        linear_func = (jnp.arange(dim, dtype=jnp.float32) - min_v) / (max_v - min_v)
         ramp_func = jnp.clip(linear_func, 0, 1)
         return ramp_func

@@ -234,7 +244,7 @@ class YARNRoPEConfig(RoPEConfigBase):
         self,
         inverse_frequencies: Float[Array, " tokens"],
         head_dim: int,
-        max_sequence_length: int,
+        max_sequence_length: int,  # noqa: ARG002
     ) -> Float[Array, " tokens"]:
         scaled_frequencies = inverse_frequencies / self.scaling_factor

@@ -243,7 +253,8 @@ class YARNRoPEConfig(RoPEConfigBase):
             self.beta_slow,
             head_dim,
             self.base,
-
+            self.original_context_length,
+            self.truncate,
         )

         # Get n-dimensional rotational scaling corrected for extrapolation
@@ -251,7 +262,7 @@ class YARNRoPEConfig(RoPEConfigBase):
         return scaled_frequencies * (1 - smoothing_factor) + inverse_frequencies * smoothing_factor

     @property
-    def
+    def _attention_scaling_factor(self) -> float:
         return 0.1 * math.log(self.scaling_factor) + 1.0


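The YARNRoPEConfig changes thread two new fields, original_context_length and truncate, through the correction-dimension math: _find_correction_dim inverts the RoPE wavelength formula to find which head dimension completes a given number of rotations over the original context, and _find_correction_range optionally floors/ceils the resulting bounds, which then delimit where scaled and unscaled inverse frequencies are blended. A self-contained sketch of that computation follows; the standalone helper names and the example head_dim, base, context length, and rotation counts are assumptions for illustration, not values taken from this package.

# Hedged sketch of the YaRN correction-range math shown in the diff above.
import math

def find_correction_dim(num_rotations: float, dim: int, base: float, original_context_length: int) -> float:
    # Inverse of the RoPE wavelength formula: the dimension whose frequency
    # completes `num_rotations` full turns over the original context length.
    return (dim * math.log(original_context_length / (num_rotations * 2 * math.pi))) / (2 * math.log(base))

def find_correction_range(
    low_rot: float, high_rot: float, dim: int, base: float, original_context_length: int, truncate: bool
) -> tuple[float, float]:
    low = find_correction_dim(low_rot, dim, base, original_context_length)
    high = find_correction_dim(high_rot, dim, base, original_context_length)
    if truncate:
        low, high = math.floor(low), math.ceil(high)
    return max(low, 0.0), min(high, float(dim - 1))

# Illustrative call: head_dim=128, base=10000, 4096-token original context.
print(find_correction_range(32.0, 1.0, 128, 10000.0, 4096, truncate=True))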
lalamo/modules/utils.py
CHANGED
@@ -1,11 +1,21 @@
+from collections.abc import Callable
+
 import jax
+from jax import vmap
 from jaxtyping import Array, Float

 __all__ = [
     "apply_soft_capping",
+    "vmap_twice",
 ]


+def vmap_twice[F: Callable](
+    func: F,
+) -> F:
+    return vmap(vmap(func, in_axes=0), in_axes=0)
+
+
 def apply_soft_capping(
     values: Float[Array, "*"],
     soft_cap: float,