lalamo 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. lalamo/__init__.py +1 -1
  2. lalamo/language_model.py +22 -23
  3. lalamo/main.py +2 -16
  4. lalamo/model_import/common.py +24 -6
  5. lalamo/model_import/decoder_configs/__init__.py +2 -0
  6. lalamo/model_import/decoder_configs/common.py +4 -4
  7. lalamo/model_import/decoder_configs/executorch.py +17 -10
  8. lalamo/model_import/decoder_configs/huggingface/__init__.py +2 -0
  9. lalamo/model_import/decoder_configs/huggingface/common.py +37 -2
  10. lalamo/model_import/decoder_configs/huggingface/gemma2.py +33 -28
  11. lalamo/model_import/decoder_configs/huggingface/gemma3.py +34 -26
  12. lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +36 -29
  13. lalamo/model_import/decoder_configs/huggingface/llama.py +14 -12
  14. lalamo/model_import/decoder_configs/huggingface/llamba.py +170 -0
  15. lalamo/model_import/decoder_configs/huggingface/mistral.py +31 -30
  16. lalamo/model_import/decoder_configs/huggingface/qwen2.py +33 -25
  17. lalamo/model_import/decoder_configs/huggingface/qwen3.py +55 -28
  18. lalamo/model_import/loaders/executorch.py +5 -4
  19. lalamo/model_import/loaders/huggingface.py +321 -69
  20. lalamo/model_import/model_specs/__init__.py +2 -0
  21. lalamo/model_import/model_specs/common.py +16 -5
  22. lalamo/model_import/model_specs/llamba.py +40 -0
  23. lalamo/model_import/model_specs/qwen.py +29 -1
  24. lalamo/modules/__init__.py +33 -6
  25. lalamo/modules/activations.py +9 -2
  26. lalamo/modules/common.py +10 -5
  27. lalamo/modules/decoder.py +93 -97
  28. lalamo/modules/decoder_layer.py +85 -103
  29. lalamo/modules/embedding.py +279 -5
  30. lalamo/modules/linear.py +335 -30
  31. lalamo/modules/mlp.py +6 -7
  32. lalamo/modules/mlx_interop.py +19 -0
  33. lalamo/modules/rope.py +1 -1
  34. lalamo/modules/token_mixers/__init__.py +30 -0
  35. lalamo/modules/{attention.py → token_mixers/attention.py} +72 -70
  36. lalamo/modules/token_mixers/common.py +78 -0
  37. lalamo/modules/token_mixers/mamba.py +553 -0
  38. lalamo/modules/token_mixers/state/__init__.py +12 -0
  39. lalamo/modules/token_mixers/state/common.py +26 -0
  40. lalamo/modules/{kv_cache.py → token_mixers/state/kv_cache.py} +5 -16
  41. lalamo/modules/token_mixers/state/mamba_state.py +51 -0
  42. lalamo/utils.py +24 -2
  43. {lalamo-0.4.1.dist-info → lalamo-0.5.0.dist-info}/METADATA +3 -2
  44. lalamo-0.5.0.dist-info/RECORD +80 -0
  45. lalamo-0.4.1.dist-info/RECORD +0 -71
  46. {lalamo-0.4.1.dist-info → lalamo-0.5.0.dist-info}/WHEEL +0 -0
  47. {lalamo-0.4.1.dist-info → lalamo-0.5.0.dist-info}/entry_points.txt +0 -0
  48. {lalamo-0.4.1.dist-info → lalamo-0.5.0.dist-info}/licenses/LICENSE +0 -0
  49. {lalamo-0.4.1.dist-info → lalamo-0.5.0.dist-info}/top_level.txt +0 -0
lalamo/modules/linear.py CHANGED
@@ -12,6 +12,7 @@ from jaxtyping import Array, DTypeLike, Float, Int, PRNGKeyArray
 
 from lalamo.common import ParameterTree, dummy_array
 from lalamo.quantization import QuantizationMode, dynamically_quantize_activations, quantize_weights
+from lalamo.utils import jax_uint4_to_packed_uint8, jax_uint8_to_unpacked_uint4
 
 from .common import (
     LalamoModule,
@@ -59,7 +60,7 @@ class LinearBase[ConfigT: LinearConfigBase](LalamoModule[ConfigT]):
         assert isinstance(self.output_dims, tuple)
 
     @staticmethod
-    def _get_split_points(output_dims: Sequence[int]) -> tuple[int, ...]:
+    def get_split_points(output_dims: Sequence[int]) -> tuple[int, ...]:
         result = []
         last_split_point = 0
         for dim in output_dims[:-1]:
@@ -258,7 +259,7 @@ class FullPrecisionLinear(LinearBase[FullPrecisionLinearConfig]):
         result = self.weights @ inputs
         if self.biases is not None:
             result = result + self.biases
-        return tuple(jnp.split(result, self._get_split_points(self.output_dims)))
+        return tuple(jnp.split(result, self.get_split_points(self.output_dims)))
 
     def export_weights(self) -> ParameterTree:
         result = dict(weights=self.weights)
@@ -279,12 +280,39 @@ class FullPrecisionLinear(LinearBase[FullPrecisionLinearConfig]):
 
 
 @dataclass(frozen=True)
-class GroupQuantizedLinearConfig(LinearConfigBase):
+class QuantizedLinearConfigBase(LinearConfigBase):
     group_size: int
     weight_quantization_mode: QuantizationMode
     activation_quantization_mode: QuantizationMode | None
     activation_precision: DTypeLike
 
+
+class QuantizedLinearBase[ConfigT: QuantizedLinearConfigBase](LinearBase[ConfigT]):
+    biases: Float[Array, "*components total_out_channels"] | None
+
+    @abstractmethod
+    def _prepare_scaled_weights(self) -> Float[Array, "*components in_channels total_out_channels"]: ...
+
+    def _apply_weights(self, inputs: Float[Array, " in_channels"]) -> Float[Array, " total_out_channels"]:
+        if self.config.activation_quantization_mode is not None:
+            inputs = dynamically_quantize_activations(inputs, self.config.activation_quantization_mode)
+        return self._prepare_scaled_weights() @ inputs
+
+    @eqx.filter_jit
+    def __call__(self, inputs: Float[Array, " in_channels"]) -> tuple[Float[Array, " out_channels"], ...]:
+        if self.mixture_size is not None:
+            raise ValueError(
+                "Mixtures of linear layers cannot be called directly."
+                "They are intended to be used with methods eqx.filter_vmap or lax.scan instead.",
+            )
+        result = self._apply_weights(inputs)
+        if self.biases is not None:
+            result = result + self.biases
+        return tuple(jnp.split(result, self.get_split_points(self.output_dims)))
+
+
+@dataclass(frozen=True)
+class GroupQuantizedLinearConfig(QuantizedLinearConfigBase):
     def random_init(
         self,
         input_dim: int,
@@ -381,7 +409,7 @@ class GroupQuantizedLinearConfig(LinearConfigBase):
         return self._empty_general((mixture_size,), input_dim, output_dims, has_biases)
 
 
-class GroupQuantizedLinearBase[ConfigT: GroupQuantizedLinearConfig](LinearBase[ConfigT]):
+class GroupQuantizedLinearBase[ConfigT: GroupQuantizedLinearConfig](QuantizedLinearBase[ConfigT]):
     weights: Float[Array, "*components total_out_channels in_channels"]
     scales: Float[Array, "*components total_out_channels groups"]
     zero_points: Float[Array, "*components total_out_channels groups"]
@@ -414,13 +442,27 @@ class GroupQuantizedLinearBase[ConfigT: GroupQuantizedLinearConfig](LinearBase[C
 
     @property
     def int_weights(self) -> Int[Array, "*components in_channels out_channels"]:
-        result = quantize_weights(self.weights, self.config.weight_quantization_mode)
-        return result.astype(self.config.weight_quantization_mode.dtype)
+        quantized = quantize_weights(self.weights, self.config.weight_quantization_mode)
+        casted = quantized.astype(self.config.weight_quantization_mode.dtype)
+
+        if self.config.weight_quantization_mode == QuantizationMode.UINT4:
+            packed = jax_uint4_to_packed_uint8(casted)
+        else:
+            packed = casted
+
+        return packed
 
     @property
     def int_zero_points(self) -> Int[Array, "*components groups out_channels"]:
-        result = quantize_weights(self.zero_points, self.config.weight_quantization_mode)
-        return result.astype(self.config.weight_quantization_mode.dtype)
+        quantized = quantize_weights(self.zero_points, self.config.weight_quantization_mode)
+        casted = quantized.astype(self.config.weight_quantization_mode.dtype)
+
+        if self.config.weight_quantization_mode == QuantizationMode.UINT4:
+            packed = jax_uint4_to_packed_uint8(casted)
+        else:
+            packed = casted
+
+        return packed
 
     def __post_init__(self) -> None:  # noqa: PLR0912
         if self.weights.dtype != self.config.activation_precision:
@@ -520,28 +562,286 @@ class GroupQuantizedLinearBase[ConfigT: GroupQuantizedLinearConfig](LinearBase[C
         )
         return result
 
-    def _apply_weights(self, inputs: Float[Array, " in_channels"]) -> Float[Array, " total_out_channels"]:
-        if self.config.activation_quantization_mode is not None:
-            inputs = dynamically_quantize_activations(inputs, self.config.activation_quantization_mode)
-        return self._prepare_scaled_weights() @ inputs
+    def export_weights(self) -> ParameterTree:
+        result = dict(
+            weights=self.int_weights,
+            zero_points=self.int_zero_points,
+            scales=self.scales,
+        )
+        if self.biases is not None:
+            result["biases"] = self.biases
+        return result
 
-    @eqx.filter_jit
-    def __call__(self, inputs: Float[Array, " in_channels"]) -> tuple[Float[Array, " out_channels"], ...]:
-        if self.mixture_size is not None:
+    def import_weights(
+        self,
+        weights: ParameterTree[Array],
+    ) -> Self:
+        assert isinstance(weights, Mapping)
+        assert isinstance(weights["weights"], Array)
+        assert isinstance(weights["zero_points"], Array)
+        unpacked_weights = weights["weights"]
+        unpacked_zero_points = weights["zero_points"]
+
+        if self.config.weight_quantization_mode == QuantizationMode.UINT4:
+            unpacked_weights = jax_uint8_to_unpacked_uint4(weights["weights"])
+            unpacked_zero_points = jax_uint8_to_unpacked_uint4(weights["zero_points"])
+
+        return replace(
+            self,
+            weights=unpacked_weights.astype(self.weights.dtype),
+            scales=weights["scales"],
+            zero_points=unpacked_zero_points.astype(self.zero_points.dtype),
+            biases=weights["biases"] if self.has_biases else None,
+        )
+
+
+class GroupQuantizedLinear(GroupQuantizedLinearBase[GroupQuantizedLinearConfig]):
+    pass
+
+
+@dataclass(frozen=True)
+class MLXQuantizedLinearConfig(QuantizedLinearConfigBase):
+    def random_init(
+        self,
+        input_dim: int,
+        output_dims: tuple[int, ...],
+        has_biases: bool,
+        *,
+        key: PRNGKeyArray,
+    ) -> LinearBase:
+        min_val, max_val = self.weight_quantization_mode.range
+        weights = jax.random.uniform(
+            key,
+            (sum(output_dims), input_dim),
+            minval=min_val - 1,
+            maxval=max_val + 1,
+            dtype=self.activation_precision,
+        )
+        num_groups = input_dim // self.group_size
+        scale = 1 / ((max_val - min_val) / 2 * math.sqrt(input_dim))
+        scales = scale * jnp.ones((sum(output_dims), num_groups), dtype=self.activation_precision)
+
+        if has_biases:
+            biases = jnp.zeros((sum(output_dims),), dtype=self.activation_precision)
+        else:
+            biases = None
+
+        deq_bias = min_val + 2 ** (self.weight_quantization_mode.bits - 1)
+        deq_biases = deq_bias * jnp.ones((sum(output_dims), num_groups), dtype=self.activation_precision)
+
+        return MLXQuantizedLinear(
+            config=self,
+            output_dims=output_dims,
+            weights=weights,
+            scales=scales,
+            deq_biases=deq_biases,
+            biases=biases,
+        )
+
+    def random_init_mixture(
+        self,
+        mixture_size: int,
+        input_dim: int,
+        output_dims: tuple[int, ...],
+        has_biases: bool,
+        *,
+        key: PRNGKeyArray,
+    ) -> LinearBase:
+        subkeys = jax.random.split(key, mixture_size)
+        return eqx.filter_vmap(lambda key: self.random_init(input_dim, output_dims, has_biases, key=key))(subkeys)
+
+    def _empty_general(
+        self,
+        leading_dims: tuple[int, ...],
+        input_dim: int,
+        output_dims: tuple[int, ...],
+        has_biases: bool,
+    ) -> LinearBase:
+        weights = dummy_array(
+            (*leading_dims, sum(output_dims), input_dim),
+            dtype=self.activation_precision,
+        )
+        num_groups = input_dim // self.group_size
+        scales = dummy_array((*leading_dims, sum(output_dims), num_groups), dtype=self.activation_precision)
+
+        if has_biases:
+            biases = dummy_array((*leading_dims, sum(output_dims)), dtype=self.activation_precision)
+        else:
+            biases = None
+        deq_biases = dummy_array((*leading_dims, sum(output_dims), num_groups), dtype=self.activation_precision)
+
+        return MLXQuantizedLinear(
+            config=self,
+            output_dims=output_dims,
+            weights=weights,
+            scales=scales,
+            deq_biases=deq_biases,
+            biases=biases,
+        )
+
+    def empty(
+        self,
+        input_dim: int,
+        output_dims: tuple[int, ...],
+        has_biases: bool,
+    ) -> LinearBase:
+        return self._empty_general((), input_dim, output_dims, has_biases)
+
+    def empty_mixture(
+        self,
+        mixture_size: int,
+        input_dim: int,
+        output_dims: tuple[int, ...],
+        has_biases: bool,
+    ) -> LinearBase:
+        return self._empty_general((mixture_size,), input_dim, output_dims, has_biases)
+
+
+class MLXQuantizedLinearBase[ConfigT: MLXQuantizedLinearConfig](QuantizedLinearBase[ConfigT]):
+    weights: Float[Array, "*components total_out_channels in_channels"]
+    scales: Float[Array, "*components total_out_channels groups"]
+    deq_biases: Float[Array, "*components total_out_channels groups"]
+    biases: Float[Array, "*components total_out_channels"] | None
+
+    @property
+    def mixture_size(self) -> int | None:
+        match self.weights.shape:
+            case [num_components, _, _]:
+                return num_components
+            case _:
+                return None
+
+    @property
+    def activation_precision(self) -> DTypeLike:
+        return self.config.activation_precision
+
+    @property
+    def input_dim(self) -> int:
+        *_, _, input_dim = self.weights.shape
+        return input_dim
+
+    @property
+    def has_biases(self) -> bool:
+        return self.biases is not None
+
+    @property
+    def num_groups(self) -> int:
+        return self.input_dim // self.config.group_size
+
+    @property
+    def int_weights(self) -> Int[Array, "*components in_channels out_channels"]:
+        quantized = quantize_weights(self.weights, self.config.weight_quantization_mode)
+        casted = quantized.astype(self.config.weight_quantization_mode.dtype)
+
+        if self.config.weight_quantization_mode == QuantizationMode.UINT4:
+            packed = jax_uint4_to_packed_uint8(casted)
+        else:
+            packed = casted
+
+        return packed
+
+    def __post_init__(self) -> None:  # noqa: PLR0912
+        if self.weights.dtype != self.config.activation_precision:
             raise ValueError(
-                "Mixtures of linear layers cannot be called directly."
-                "They are intended to be used with methods eqx.filter_vmap or lax.scan instead.",
+                f"Weight dtype ({self.weights.dtype}) is not equal to specified activation precision"
+                f" ({self.config.activation_precision}).",
+                " Quantized layers require parameter dtypes to be equal to the activation precision.",
             )
-        result = self._apply_weights(inputs)
+        *w_num_components, w_output_dim, _ = self.weights.shape
+        if w_output_dim != sum(self.output_dims):
+            raise ValueError(
+                f"Number of output channels in weights ({w_output_dim}) is not"
+                f" equal to sum of output dims ({sum(self.output_dims)}).",
+            )
+
+        if self.scales.dtype != self.config.activation_precision:
+            raise ValueError(
+                f"Scale dtype ({self.scales.dtype}) is not equal to specified activation precision"
+                f" ({self.config.activation_precision}).",
+                " Quantized layers require parameter dtypes to be equal to the activation precision.",
+            )
+        *s_num_components, s_output_dim, s_num_groups = self.scales.shape
+        if w_output_dim != s_output_dim:
+            raise ValueError(
+                f"Number of output channels in weights ({w_output_dim}) is not"
+                f" equal to number of output channels in scales ({s_output_dim}).",
+            )
+        if tuple(s_num_components) != tuple(w_num_components):
+            raise ValueError(
+                f"Number of mixture components in weights ({w_num_components}) is not"
+                f" equal to number of mixture components in scales ({s_num_components}).",
+            )
+        if s_num_groups != self.num_groups:
+            raise ValueError(
+                f"Number of groups in scales ({s_num_groups}) is incompatible with"
+                f" the specified group size ({self.config.group_size}).",
+            )
+
+        if self.deq_biases.dtype != self.config.activation_precision:
+            raise ValueError(
+                f"Dequantization bias dtype ({self.deq_biases.dtype}) is not equal to specified activation precision"
+                f" ({self.config.activation_precision}).",
+                " Quantized layers require parameter dtypes to be equal to the activation precision.",
+            )
+        *zp_num_components, zp_output_dim, zp_num_groups = self.deq_biases.shape
+        if w_output_dim != zp_output_dim:
+            raise ValueError(
+                f"Number of output channels in weights ({w_output_dim}) is not"
+                f" equal to number of output channels in zero points ({zp_output_dim}).",
+            )
+        if tuple(zp_num_components) != tuple(w_num_components):
+            raise ValueError(
+                f"Number of mixture components in weights ({w_num_components}) is not"
+                f" equal to number of mixture components in zero points ({zp_num_components}).",
+            )
+        if self.num_groups != zp_num_groups:
+            raise ValueError(
+                f"Number of groups in zero points ({zp_num_groups}) is incompatible with"
+                f" the specified group size ({self.config.group_size}).",
+            )
+
         if self.biases is not None:
-            result = result + self.biases
-        return tuple(jnp.split(result, self._get_split_points(self.output_dims)))
+            if self.biases.dtype != self.config.activation_precision:
+                raise ValueError(
+                    f"Bias dtype ({self.biases.dtype}) is not equal to specified activation precision"
+                    f" ({self.config.activation_precision}).",
+                    " Quantized layers require parameter dtypes to be equal to the activation precision.",
+                )
+            *b_num_components, b_output_dim = self.biases.shape
+            if w_output_dim != b_output_dim:
+                raise ValueError(
+                    f"Number of output channels in weights ({w_output_dim}) is not"
+                    f" equal to number of output channels in biases ({b_output_dim}).",
+                )
+            if tuple(b_num_components) != tuple(w_num_components):
+                raise ValueError(
                    f"Number of mixture components in weights ({w_num_components}) is not"
+                    f" equal to number of mixture components in biases ({b_num_components}).",
+                )
+
+    def _prepare_scaled_weights(self) -> Float[Array, "*components in_channels total_out_channels"]:
+        quantized_weights = quantize_weights(self.weights, self.config.weight_quantization_mode)
+        grouped_weights = rearrange(
+            quantized_weights,
+            "... total_out_channels (groups group_channels) -> ... total_out_channels groups group_channels",
+            groups=self.num_groups,
+        )
+
+        scales = rearrange(self.scales, "... total_out_channels groups -> ... total_out_channels groups 1")
+        deq_biases = rearrange(self.deq_biases, "... total_out_channels groups -> ... total_out_channels groups 1")
+
+        scaled_grouped_weights = grouped_weights * scales + deq_biases
+        result = rearrange(
+            scaled_grouped_weights,
+            "... total_out_channels groups group_channels -> ... total_out_channels (groups group_channels)",
+        )
+        return result
 
     def export_weights(self) -> ParameterTree:
         result = dict(
             weights=self.int_weights,
-            zero_points=self.int_zero_points,
             scales=self.scales,
+            deq_biases=self.deq_biases,
         )
         if self.biases is not None:
             result["biases"] = self.biases
@@ -553,17 +853,22 @@ class GroupQuantizedLinearBase[ConfigT: GroupQuantizedLinearConfig](LinearBase[C
     ) -> Self:
         assert isinstance(weights, Mapping)
         assert isinstance(weights["weights"], Array)
-        assert isinstance(weights["zero_points"], Array)
+
+        unpacked_weights = weights["weights"]
+
+        if self.config.weight_quantization_mode == QuantizationMode.UINT4:
+            unpacked_weights = jax_uint8_to_unpacked_uint4(weights["weights"])
+
        return replace(
             self,
-            weights=weights["weights"].astype(self.weights.dtype),
+            weights=unpacked_weights.astype(self.weights.dtype),
             scales=weights["scales"],
-            zero_points=weights["zero_points"].astype(self.zero_points.dtype),
+            deq_biases=weights["deq_biases"],
             biases=weights["biases"] if self.has_biases else None,
         )
 
 
-class GroupQuantizedLinear(GroupQuantizedLinearBase[GroupQuantizedLinearConfig]):
+class MLXQuantizedLinear(MLXQuantizedLinearBase[MLXQuantizedLinearConfig]):
     pass
 
 
@@ -714,7 +1019,7 @@ class QLoRALinear(GroupQuantizedLinearBase[QLoRALinearConfig]):
 
     def _split_biases(self) -> tuple[Float[Array, "*components out_channels"] | None, ...]:
         if self.biases is not None:
-            return tuple(jnp.split(self.biases, self._get_split_points(self.output_dims)))
+            return tuple(jnp.split(self.biases, self.get_split_points(self.output_dims)))
         return (None,) * len(self.output_dims)
 
     def __post_init__(self) -> None:
@@ -778,10 +1083,10 @@ class QLoRALinear(GroupQuantizedLinearBase[QLoRALinearConfig]):
                 "They are intended to be used with methods eqx.filter_vmap or lax.scan instead.",
             )
         joint_q_out = self._apply_weights(inputs)
-        q_outs = jnp.split(joint_q_out, self._get_split_points(self.output_dims))
+        q_outs = jnp.split(joint_q_out, self.get_split_points(self.output_dims))
 
         joint_lora_hidden = inputs @ self.lora_down_weights
-        lora_hiddens = jnp.split(joint_lora_hidden, self._get_split_points([self.config.lora_rank] * self.num_outputs))
+        lora_hiddens = jnp.split(joint_lora_hidden, self.get_split_points([self.config.lora_rank] * self.num_outputs))
         lora_outs = [
             lora_hidden @ lora_up_weight
             for lora_up_weight, lora_hidden in zip(self.lora_up_weights, lora_hiddens, strict=True)
@@ -818,7 +1123,7 @@ class QLoRALinear(GroupQuantizedLinearBase[QLoRALinearConfig]):
         )
 
 
-LinearConfig = FullPrecisionLinearConfig | GroupQuantizedLinearConfig | QLoRALinearConfig
+LinearConfig = FullPrecisionLinearConfig | GroupQuantizedLinearConfig | MLXQuantizedLinearConfig | QLoRALinearConfig
 
 
-register_config_union(LinearConfig)
+register_config_union(LinearConfig)  # type: ignore (pyright bug)
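Note on the new quantization paths: the UINT4 branches in int_weights, int_zero_points, and import_weights round-trip 4-bit codes through packed uint8 buffers, and MLXQuantizedLinearBase._prepare_scaled_weights dequantizes each group as weight = code * scale + deq_bias. The sketch below illustrates that scheme; the nibble order and packing axis are assumptions made for illustration only, since the actual jax_uint4_to_packed_uint8 / jax_uint8_to_unpacked_uint4 helpers in lalamo.utils are not shown in this diff.

# Hypothetical sketch of uint4 <-> uint8 packing and MLX-style group dequantization.
# The pairing order (low nibble first, along the last axis) is an assumption.
import jax.numpy as jnp


def pack_uint4_to_uint8(codes):
    # Pair consecutive 4-bit codes along the last axis into one byte each.
    lo, hi = codes[..., 0::2], codes[..., 1::2]
    return lo.astype(jnp.uint8) | (hi.astype(jnp.uint8) << 4)


def unpack_uint8_to_uint4(packed):
    lo = packed & 0x0F
    hi = (packed >> 4) & 0x0F
    # Interleave the nibbles back into their original positions.
    return jnp.stack([lo, hi], axis=-1).reshape(*packed.shape[:-1], -1)


def dequantize_group(codes, scale, deq_bias):
    # Affine dequantization as in MLXQuantizedLinear._prepare_scaled_weights:
    # full-precision weight = quantized code * scale + dequantization bias.
    return codes * scale + deq_bias


codes = jnp.array([1, 15, 7, 0], dtype=jnp.uint8)      # 4-bit codes stored as uint8
packed = pack_uint4_to_uint8(codes)                    # two bytes
assert bool((unpack_uint8_to_uint4(packed) == codes).all())  # lossless round trip
print(dequantize_group(codes.astype(jnp.float32), 0.1, -0.8))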
lalamo/modules/mlp.py CHANGED
@@ -273,20 +273,19 @@ class SoftmaxRouting(RoutingFunctionBase):
 RoutingFunction = SoftmaxRouting | DummyUnionMember
 
 
-register_config_union(RoutingFunction)
+register_config_union(RoutingFunction)  # type: ignore (pyright bug)
 
 
 @dataclass(frozen=True)
 class MixtureOfExpertsConfig(ABC):
-    mixture_size: int
-    num_experts_per_token: int
+    expert_config: DenseMLPConfig
+    router_config: LinearConfig
     routing_function: RoutingFunction
 
-    router_config: LinearConfig
+    mixture_size: int
+    num_experts_per_token: int
     router_has_biases: bool
 
-    expert_config: DenseMLPConfig
-
     def random_init(self, model_dim: int, hidden_dim: int, *, key: PRNGKeyArray) -> "MixtureOfExperts":
         experts_key, router_key = jax.random.split(key)
         router = self.router_config.random_init(
@@ -481,4 +480,4 @@ class MixtureOfExperts(MLPBase[MixtureOfExpertsConfig]):
 MLPConfig = DenseMLPConfig | MixtureOfExpertsConfig
 
 
-register_config_union(MLPConfig)
+register_config_union(MLPConfig)  # type: ignore (pyright bug)
lalamo/modules/mlx_interop.py ADDED
@@ -0,0 +1,19 @@
+import jax.numpy as jnp
+import mlx.core as mx
+from jaxtyping import Array
+
+__all__ = ["jax_to_mlx", "mlx_to_jax"]
+
+
+def mlx_to_jax(a: mx.array) -> Array:
+    if a.dtype == mx.bfloat16:
+        return jnp.asarray(a.view(mx.uint16)).view(jnp.bfloat16)
+
+    return jnp.asarray(a)
+
+
+def jax_to_mlx(a: Array) -> mx.array:
+    if a.dtype == jnp.bfloat16:
+        return mx.array(a.view(jnp.uint16)).view(mx.bfloat16)  # type: ignore
+
+    return mx.array(a)  # type: ignore
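The new interop module moves arrays between JAX and MLX, reinterpreting bfloat16 buffers as uint16 for the transfer (likely because the usual interchange path lacks a bfloat16 dtype) and viewing them back on the other side. A minimal usage sketch, assuming an environment with mlx installed:

import jax.numpy as jnp

from lalamo.modules.mlx_interop import jax_to_mlx, mlx_to_jax

x = jnp.arange(4, dtype=jnp.bfloat16)   # bfloat16 exercises the uint16 view path
mx_x = jax_to_mlx(x)                    # JAX -> MLX
x_back = mlx_to_jax(mx_x)               # MLX -> JAX
assert bool((x == x_back).all())        # values survive the bit reinterpretation unchanged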
lalamo/modules/rope.py CHANGED
@@ -281,4 +281,4 @@ class LinearScalingRoPEConfig(RoPEConfigBase):
 
 RoPEConfig = UnscaledRoPEConfig | LlamaRoPEConfig | YARNRoPEConfig | LinearScalingRoPEConfig
 
-register_config_union(RoPEConfig)
+register_config_union(RoPEConfig)  # type: ignore (pyright bug)
lalamo/modules/token_mixers/__init__.py ADDED
@@ -0,0 +1,30 @@
+from lalamo.modules.common import register_config_union
+
+from .attention import Attention, AttentionConfig, AttentionResult
+from .common import TokenMixerBase, TokenMixerResult
+from .mamba import Mamba2, Mamba2Config, Mamba2Result, SeparableCausalConv, SeparableCausalConvConfig
+from .state import DynamicKVCacheLayer, KVCacheLayer, Mamba2StateLayer, State, StateLayerBase, StaticKVCacheLayer
+
+TokenMixerConfig = AttentionConfig | Mamba2Config
+
+register_config_union(TokenMixerConfig)  # type: ignore (pyright bug)
+
+__all__ = [
+    "Attention",
+    "AttentionConfig",
+    "AttentionResult",
+    "DynamicKVCacheLayer",
+    "KVCacheLayer",
+    "Mamba2",
+    "Mamba2Config",
+    "Mamba2Result",
+    "Mamba2StateLayer",
+    "SeparableCausalConv",
+    "SeparableCausalConvConfig",
+    "State",
+    "StateLayerBase",
+    "StaticKVCacheLayer",
+    "TokenMixerBase",
+    "TokenMixerConfig",
+    "TokenMixerResult",
+]