lalamo 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. lalamo/__init__.py +1 -1
  2. lalamo/language_model.py +22 -23
  3. lalamo/main.py +4 -18
  4. lalamo/model_import/common.py +24 -6
  5. lalamo/model_import/decoder_configs/__init__.py +2 -0
  6. lalamo/model_import/decoder_configs/common.py +4 -4
  7. lalamo/model_import/decoder_configs/executorch.py +17 -10
  8. lalamo/model_import/decoder_configs/huggingface/__init__.py +2 -0
  9. lalamo/model_import/decoder_configs/huggingface/common.py +37 -2
  10. lalamo/model_import/decoder_configs/huggingface/gemma2.py +33 -28
  11. lalamo/model_import/decoder_configs/huggingface/gemma3.py +34 -26
  12. lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +36 -29
  13. lalamo/model_import/decoder_configs/huggingface/llama.py +14 -12
  14. lalamo/model_import/decoder_configs/huggingface/llamba.py +170 -0
  15. lalamo/model_import/decoder_configs/huggingface/mistral.py +31 -30
  16. lalamo/model_import/decoder_configs/huggingface/qwen2.py +33 -25
  17. lalamo/model_import/decoder_configs/huggingface/qwen3.py +55 -28
  18. lalamo/model_import/loaders/executorch.py +5 -4
  19. lalamo/model_import/loaders/huggingface.py +321 -69
  20. lalamo/model_import/model_specs/__init__.py +2 -0
  21. lalamo/model_import/model_specs/common.py +16 -5
  22. lalamo/model_import/model_specs/llamba.py +40 -0
  23. lalamo/model_import/model_specs/qwen.py +29 -1
  24. lalamo/modules/__init__.py +33 -6
  25. lalamo/modules/activations.py +9 -2
  26. lalamo/modules/common.py +10 -5
  27. lalamo/modules/decoder.py +93 -97
  28. lalamo/modules/decoder_layer.py +85 -103
  29. lalamo/modules/embedding.py +279 -5
  30. lalamo/modules/linear.py +335 -30
  31. lalamo/modules/mlp.py +6 -7
  32. lalamo/modules/mlx_interop.py +19 -0
  33. lalamo/modules/rope.py +1 -1
  34. lalamo/modules/token_mixers/__init__.py +30 -0
  35. lalamo/modules/{attention.py → token_mixers/attention.py} +72 -70
  36. lalamo/modules/token_mixers/common.py +78 -0
  37. lalamo/modules/token_mixers/mamba.py +553 -0
  38. lalamo/modules/token_mixers/state/__init__.py +12 -0
  39. lalamo/modules/token_mixers/state/common.py +26 -0
  40. lalamo/modules/{kv_cache.py → token_mixers/state/kv_cache.py} +5 -16
  41. lalamo/modules/token_mixers/state/mamba_state.py +51 -0
  42. lalamo/utils.py +24 -2
  43. {lalamo-0.4.0.dist-info → lalamo-0.5.0.dist-info}/METADATA +3 -2
  44. lalamo-0.5.0.dist-info/RECORD +80 -0
  45. lalamo-0.4.0.dist-info/RECORD +0 -71
  46. {lalamo-0.4.0.dist-info → lalamo-0.5.0.dist-info}/WHEEL +0 -0
  47. {lalamo-0.4.0.dist-info → lalamo-0.5.0.dist-info}/entry_points.txt +0 -0
  48. {lalamo-0.4.0.dist-info → lalamo-0.5.0.dist-info}/licenses/LICENSE +0 -0
  49. {lalamo-0.4.0.dist-info → lalamo-0.5.0.dist-info}/top_level.txt +0 -0
lalamo/modules/decoder_layer.py

@@ -11,12 +11,11 @@ from jaxtyping import Array, DTypeLike, Float, Int, PRNGKeyArray
 
 from lalamo.common import ParameterTree
 
-from .attention import Attention, AttentionConfig
-from .common import AttentionType, ForwardPassMode, LalamoModule
-from .kv_cache import KVCacheLayer, StaticKVCacheLayer
+from .common import ForwardPassMode, LalamoModule, PositionalEmbeddingSelector
 from .mlp import MLPBase, MLPConfig, MLPForwardPassConfig
 from .normalization import RMSNorm, RMSNormConfig
 from .rope import PositionalEmbeddings
+from .token_mixers import KVCacheLayer, StateLayerBase, StaticKVCacheLayer, TokenMixerBase, TokenMixerConfig
 from .utils import vmap_twice
 
 __all__ = [
@@ -33,31 +32,32 @@ type DecoderLayerForwardPassConfig = MLPForwardPassConfig
 
 class DecoderLayerActivationTrace(eqx.Module):
     inputs: Float[Array, "batch suffix_tokens channels"]
-    positional_embeddings: PositionalEmbeddings
-    kv_cache: KVCacheLayer | None
+    positional_embeddings: PositionalEmbeddings | None
+    state: StateLayerBase | None
 
     mlp_inputs: Float[Array, "batch suffix_tokens channels"]
-    pre_attention_norm: Float[Array, "batch suffix_tokens channels"]
-    attention: Float[Array, "batch suffix_tokens channels"]
-    post_attention_norm: Float[Array, "batch suffix_tokens channels"] | None
+    pre_mixer_norm: Float[Array, "batch suffix_tokens channels"]
+    mixer: Float[Array, "batch suffix_tokens channels"]
+    post_mixer_norm: Float[Array, "batch suffix_tokens channels"] | None
     pre_mlp_norm: Float[Array, "batch suffix_tokens channels"]
     mlp: Float[Array, "batch suffix_tokens channels"]
     post_mlp_norm: Float[Array, "batch suffix_tokens channels"] | None
 
     def export(self) -> ParameterTree:
-        result = dict(
+        result: dict[str, ParameterTree | Array] = dict(
             inputs=self.inputs,
-            positional_embeddings=self.positional_embeddings.export(),
             mlp_inputs=self.mlp_inputs,
-            pre_attention_norm=self.pre_attention_norm,
-            attention=self.attention,
+            pre_mixer_norm=self.pre_mixer_norm,
+            mixer=self.mixer,
             pre_mlp_norm=self.pre_mlp_norm,
             mlp=self.mlp,
         )
-        if self.kv_cache is not None:
-            result["kv_cache"] = self.kv_cache.export()
-        if self.post_attention_norm is not None:
-            result["post_attention_norm"] = self.post_attention_norm
+        if self.positional_embeddings is not None:
+            result["positional_embeddings"] = self.positional_embeddings.export()
+        if self.state is not None:
+            result["state"] = self.state.export()
+        if self.post_mixer_norm is not None:
+            result["post_mixer_norm"] = self.post_mixer_norm
         if self.post_mlp_norm is not None:
            result["post_mlp_norm"] = self.post_mlp_norm
         return result
@@ -65,15 +65,15 @@ class DecoderLayerActivationTrace(eqx.Module):
 
 class DecoderLayerResult(eqx.Module):
     outputs: Float[Array, "suffix_tokens channels"]
-    updated_kv_cache: KVCacheLayer | None
+    updated_state: KVCacheLayer | None
     activation_trace: DecoderLayerActivationTrace | None
 
     def export(self) -> ParameterTree:
         result: dict[str, ParameterTree | Array] = dict(
             outputs=self.outputs,
         )
-        if self.updated_kv_cache is not None:
-            result["updated_kv_cache"] = self.updated_kv_cache.export()
+        if self.updated_state is not None:
+            result["updated_state"] = self.updated_state.export()
         if self.activation_trace is not None:
             result["activation_trace"] = self.activation_trace.export()
         return result
@@ -81,39 +81,32 @@ class DecoderLayerResult(eqx.Module):
 
 @dataclass(frozen=True)
 class DecoderLayerConfig:
-    pre_attention_norm_config: RMSNormConfig
-    attention_config: AttentionConfig
-    post_attention_norm_config: RMSNormConfig | None
+    pre_mixer_norm_config: RMSNormConfig
+    mixer_config: TokenMixerConfig
+    post_mixer_norm_config: RMSNormConfig | None
     pre_mlp_norm_config: RMSNormConfig
     mlp_config: MLPConfig
     post_mlp_norm_config: RMSNormConfig | None
 
+    @property
+    def rope_dim(self) -> int:
+        return self.mixer_config.rope_dim
+
     def random_init(
         self,
         model_dim: int,
         hidden_dim: int,
-        num_heads: int,
-        num_groups: int,
-        head_dim: int,
-        attention_scale: float | None,
-        sliding_window_size: int | None,
         *,
         key: PRNGKeyArray,
     ) -> "DecoderLayer":
         attention_key, mlp_key = jax.random.split(key)
-        pre_attention_norm = self.pre_attention_norm_config.init(model_dim)
-        attention = self.attention_config.random_init(
+        pre_attention_norm = self.pre_mixer_norm_config.init(model_dim)
+        mixer = self.mixer_config.random_init(
             model_dim=model_dim,
-            num_heads=num_heads,
-            num_groups=num_groups,
-            head_dim=head_dim,
-            is_causal=True,
-            scale=attention_scale,
-            sliding_window_size=sliding_window_size,
             key=attention_key,
         )
-        if self.post_attention_norm_config is not None:
-            post_attention_norm = self.post_attention_norm_config.init(model_dim)
+        if self.post_mixer_norm_config is not None:
+            post_attention_norm = self.post_mixer_norm_config.init(model_dim)
         else:
            post_attention_norm = None
         pre_mlp_norm = self.pre_mlp_norm_config.init(model_dim)
@@ -124,9 +117,9 @@ class DecoderLayerConfig:
             post_mlp_norm = None
         return DecoderLayer(
             config=self,
-            pre_attention_norm=pre_attention_norm,
-            attention=attention,
-            post_attention_norm=post_attention_norm,
+            pre_mixer_norm=pre_attention_norm,
+            mixer=mixer,
+            post_mixer_norm=post_attention_norm,
             pre_mlp_norm=pre_mlp_norm,
             mlp=mlp,
             post_mlp_norm=post_mlp_norm,
@@ -136,24 +129,13 @@ class DecoderLayerConfig:
         self,
         model_dim: int,
         hidden_dim: int,
-        num_heads: int,
-        num_groups: int,
-        head_dim: int,
-        attention_scale: float | None,
-        sliding_window_size: int | None,
     ) -> "DecoderLayer":
-        pre_attention_norm = self.pre_attention_norm_config.empty(model_dim)
-        attention = self.attention_config.empty(
+        pre_attention_norm = self.pre_mixer_norm_config.empty(model_dim)
+        attention = self.mixer_config.empty(
             model_dim=model_dim,
-            num_heads=num_heads,
-            num_groups=num_groups,
-            head_dim=head_dim,
-            is_causal=True,
-            scale=attention_scale,
-            sliding_window_size=sliding_window_size,
         )
-        if self.post_attention_norm_config is not None:
-            post_attention_norm = self.post_attention_norm_config.empty(model_dim)
+        if self.post_mixer_norm_config is not None:
+            post_attention_norm = self.post_mixer_norm_config.empty(model_dim)
         else:
            post_attention_norm = None
         pre_mlp_norm = self.pre_mlp_norm_config.empty(model_dim)
@@ -164,9 +146,9 @@ class DecoderLayerConfig:
             post_mlp_norm = None
         return DecoderLayer(
             config=self,
-            pre_attention_norm=pre_attention_norm,
-            attention=attention,
-            post_attention_norm=post_attention_norm,
+            pre_mixer_norm=pre_attention_norm,
+            mixer=attention,
+            post_mixer_norm=post_attention_norm,
             pre_mlp_norm=pre_mlp_norm,
             mlp=mlp,
             post_mlp_norm=post_mlp_norm,
@@ -174,31 +156,31 @@ class DecoderLayerConfig:
 
 
 class DecoderLayer(LalamoModule[DecoderLayerConfig]):
-    pre_attention_norm: RMSNorm
-    attention: Attention
-    post_attention_norm: RMSNorm | None
+    pre_mixer_norm: RMSNorm
+    mixer: TokenMixerBase
+    post_mixer_norm: RMSNorm | None
     pre_mlp_norm: RMSNorm
     mlp: MLPBase
     post_mlp_norm: RMSNorm | None
 
     @property
     def activation_precision(self) -> DTypeLike:
-        return self.attention.activation_precision
+        return self.mixer.activation_precision
 
     @property
-    def attention_type(self) -> AttentionType:
-        return self.attention.attention_type
+    def positional_embedding_selector(self) -> PositionalEmbeddingSelector:
+        return self.mixer.positional_embedding_selector
 
     def __post_init__(self) -> None:
-        model_dim = self.pre_attention_norm.input_dim
-        if self.attention.model_dim != model_dim:
+        model_dim = self.pre_mixer_norm.input_dim
+        if self.mixer.model_dim != model_dim:
             raise ValueError(
-                f"Attention model dim {self.attention.model_dim} does not match"
+                f"Attention model dim {self.mixer.model_dim} does not match"
                 f" the first normalization layer dim {model_dim}",
             )
-        if self.post_attention_norm is not None and self.post_attention_norm.input_dim != model_dim:
+        if self.post_mixer_norm is not None and self.post_mixer_norm.input_dim != model_dim:
             raise ValueError(
-                f"Post attention normalization dim {self.post_attention_norm.input_dim} does not match"
+                f"Post mixer normalization dim {self.post_mixer_norm.input_dim} does not match"
                 f" the first normalization layer dim {model_dim}",
             )
         if self.pre_mlp_norm.input_dim != model_dim:
@@ -216,9 +198,9 @@ class DecoderLayer(LalamoModule[DecoderLayerConfig]):
     def __call__(
         self,
         inputs: Float[Array, "batch suffix_tokens channels"],
-        positional_embeddings: PositionalEmbeddings,
-        kv_cache: KVCacheLayer | None = None,
-        return_updated_kv_cache: bool = False,
+        positional_embeddings: PositionalEmbeddings | None,
+        state: StateLayerBase | None = None,
+        return_updated_state: bool = False,
         return_activation_trace: bool = False,
         lengths_without_padding: Int[Array, " batch"] | None = None,
         forward_pass_mode: ForwardPassMode = ForwardPassMode.MULTI_TOKEN,
@@ -229,20 +211,20 @@ class DecoderLayer(LalamoModule[DecoderLayerConfig]):
                 f"Inputs to decoder layers must be a 3D arrays of size (batch_size, sequence_length, hidden_dim),"
                 f" got {inputs.shape}",
             )
-        normalized_attention_inputs = vmap_twice(self.pre_attention_norm)(inputs)
-        batched_attention_fn = vmap(partial(self.attention, return_updated_kv_cache=return_updated_kv_cache))
-        attention_outputs, updated_kv_cache = batched_attention_fn(
-            normalized_attention_inputs,
+        normalized_mixer_inputs = vmap_twice(self.pre_mixer_norm)(inputs)
+        batched_mixer_fn = vmap(partial(self.mixer, return_updated_state=return_updated_state))
+        mixer_outputs, updated_state = batched_mixer_fn(
+            normalized_mixer_inputs,
             positional_embeddings,
-            kv_cache=kv_cache,
+            state=state,
             length_without_padding=lengths_without_padding,
         )
-        if self.post_attention_norm is not None:
-            normalized_attention_outputs = vmap_twice(self.post_attention_norm)(attention_outputs)
-            mlp_inputs = inputs + normalized_attention_outputs
+        if self.post_mixer_norm is not None:
+            normalized_mixer_outputs = vmap_twice(self.post_mixer_norm)(mixer_outputs)
+            mlp_inputs = inputs + normalized_mixer_outputs
         else:
-            normalized_attention_outputs = None
-            mlp_inputs = inputs + attention_outputs
+            normalized_mixer_outputs = None
+            mlp_inputs = inputs + mixer_outputs
 
         normalized_mlp_inputs = vmap_twice(self.pre_mlp_norm)(mlp_inputs)
         mlp_outputs = self.mlp(
@@ -261,10 +243,10 @@ class DecoderLayer(LalamoModule[DecoderLayerConfig]):
             activation_trace = DecoderLayerActivationTrace(
                 inputs=inputs,
                 positional_embeddings=positional_embeddings,
-                kv_cache=kv_cache,
-                pre_attention_norm=normalized_attention_inputs,
-                attention=attention_outputs,
-                post_attention_norm=normalized_attention_outputs,
+                state=state,
+                pre_mixer_norm=normalized_mixer_inputs,
+                mixer=mixer_outputs,
+                post_mixer_norm=normalized_mixer_outputs,
                 mlp_inputs=mlp_inputs,
                 pre_mlp_norm=normalized_mlp_inputs,
                 mlp=mlp_outputs,
@@ -275,25 +257,25 @@ class DecoderLayer(LalamoModule[DecoderLayerConfig]):
 
         return DecoderLayerResult(
             outputs=outputs,
-            updated_kv_cache=updated_kv_cache,
+            updated_state=updated_state,
             activation_trace=activation_trace,
         )
 
-    def init_static_kv_cache(self, batch_size: int, capacity: int) -> StaticKVCacheLayer:
+    def init_static_state(self, batch_size: int, capacity: int) -> StaticKVCacheLayer:
         return jax.tree.map(
             lambda array: jnp.repeat(array[None, ...], batch_size, axis=0),
-            self.attention.init_static_kv_cache(capacity),
+            self.mixer.init_static_state(capacity),
         )
 
     def export_weights(self) -> ParameterTree:
         result = dict(
-            pre_attention_norm=self.pre_attention_norm.export_weights(),
-            attention=self.attention.export_weights(),
+            pre_mixer_norm=self.pre_mixer_norm.export_weights(),
+            mixer=self.mixer.export_weights(),
             pre_mlp_norm=self.pre_mlp_norm.export_weights(),
             mlp=self.mlp.export_weights(),
         )
-        if self.post_attention_norm is not None:
-            result["post_attention_norm"] = self.post_attention_norm.export_weights()
+        if self.post_mixer_norm is not None:
+            result["post_mixer_norm"] = self.post_mixer_norm.export_weights()
         if self.post_mlp_norm is not None:
             result["post_mlp_norm"] = self.post_mlp_norm.export_weights()
         return result
@@ -303,18 +285,18 @@ class DecoderLayer(LalamoModule[DecoderLayerConfig]):
         weights: ParameterTree[Array],
     ) -> Self:
         assert isinstance(weights, Mapping)
-        assert isinstance(weights["pre_attention_norm"], Mapping)
-        assert isinstance(weights["attention"], Mapping)
+        assert isinstance(weights["pre_mixer_norm"], Mapping)
+        assert isinstance(weights["mixer"], Mapping)
         assert isinstance(weights["mlp"], Mapping)
         assert isinstance(weights["pre_mlp_norm"], Mapping)
 
-        if self.post_attention_norm is not None:
-            assert isinstance(weights["post_attention_norm"], Mapping)
-            post_attention_norm = self.post_attention_norm.import_weights(
-                weights["post_attention_norm"],
+        if self.post_mixer_norm is not None:
+            assert isinstance(weights["post_mixer_norm"], Mapping)
+            post_mixer_norm = self.post_mixer_norm.import_weights(
+                weights["post_mixer_norm"],
             )
         else:
-            post_attention_norm = None
+            post_mixer_norm = None
         if self.post_mlp_norm is not None:
             assert isinstance(weights["post_mlp_norm"], Mapping)
             post_mlp_norm = self.post_mlp_norm.import_weights(weights["post_mlp_norm"])
@@ -322,9 +304,9 @@ class DecoderLayer(LalamoModule[DecoderLayerConfig]):
             post_mlp_norm = None
         return replace(
             self,
-            pre_attention_norm=self.pre_attention_norm.import_weights(weights["pre_attention_norm"]),
-            attention=self.attention.import_weights(weights["attention"]),
-            post_attention_norm=post_attention_norm,
+            pre_mixer_norm=self.pre_mixer_norm.import_weights(weights["pre_mixer_norm"]),
+            mixer=self.mixer.import_weights(weights["mixer"]),
+            post_mixer_norm=post_mixer_norm,
             pre_mlp_norm=self.pre_mlp_norm.import_weights(weights["pre_mlp_norm"]),
             mlp=self.mlp.import_weights(weights["mlp"]),
             post_mlp_norm=post_mlp_norm,
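
Note: the hunks above rename the attention-specific DecoderLayer interface to the generic token-mixer one (kv_cache becomes state, return_updated_kv_cache becomes return_updated_state, init_static_kv_cache becomes init_static_state). A minimal usage sketch of the 0.5.0 call signature follows; `layer`, `inputs`, and `rope` are hypothetical placeholders, only the attribute and keyword names are taken from the diff:

    # Sketch only: assumes an already constructed DecoderLayer and batched inputs.
    state = layer.init_static_state(batch_size=1, capacity=512)  # was init_static_kv_cache
    result = layer(
        inputs,                      # (batch, suffix_tokens, channels)
        rope,                        # PositionalEmbeddings | None in 0.5.0
        state=state,                 # was kv_cache=...
        return_updated_state=True,   # was return_updated_kv_cache=...
    )
    outputs, new_state = result.outputs, result.updated_state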
lalamo/modules/embedding.py

@@ -6,10 +6,12 @@ from typing import Self
 import equinox as eqx
 import jax
 import jax.numpy as jnp
+from einops import rearrange
 from jaxtyping import Array, DTypeLike, Float, Int, PRNGKeyArray
 
 from lalamo.common import ParameterTree, dummy_array
 from lalamo.quantization import QuantizationMode, dynamically_quantize_activations, quantize_weights
+from lalamo.utils import jax_uint4_to_packed_uint8, jax_uint8_to_unpacked_uint4
 
 from .common import (
     LalamoModule,
@@ -20,6 +22,10 @@ from .utils import apply_soft_capping
 __all__ = [
     "EmbeddingBase",
     "EmbeddingConfig",
+    "MLXQuantizedTiedEmbedding",
+    "MLXQuantizedTiedEmbeddingConfig",
+    "MLXSemiQuantizedUntiedEmbedding",
+    "MLXSemiQuantizedUntiedEmbeddingConfig",
     "QuantizedTiedEmbedding",
     "QuantizedTiedEmbeddingConfig",
     "TiedEmbedding",
@@ -314,8 +320,15 @@ class QuantizedTiedEmbedding(EmbeddingBase[QuantizedTiedEmbeddingConfig]):
 
     @property
     def int_weights(self) -> Int[Array, "vocabulary channels"]:
-        result = quantize_weights(self.weights, self.config.embedding_quantization_mode)
-        return result.astype(self.config.embedding_quantization_mode.dtype)
+        quantized = quantize_weights(self.weights, self.config.embedding_quantization_mode)
+        casted = quantized.astype(self.config.embedding_quantization_mode.dtype)
+
+        if self.config.embedding_quantization_mode == QuantizationMode.UINT4:
+            packed = jax_uint4_to_packed_uint8(casted)
+        else:
+            packed = casted
+
+        return packed
 
     def _prepare_weights(self) -> Float[Array, "vocabulary channels"]:
         quantized_weights = quantize_weights(self.weights, self.config.embedding_quantization_mode)
@@ -346,14 +359,275 @@ class QuantizedTiedEmbedding(EmbeddingBase[QuantizedTiedEmbeddingConfig]):
     ) -> Self:
         assert isinstance(weights, Mapping)
         assert isinstance(weights["weights"], Array)
+        stored_weights = weights["weights"]
+
+        if self.config.embedding_quantization_mode == QuantizationMode.UINT4:
+            stored_weights = jax_uint8_to_unpacked_uint4(stored_weights)
+
+        return replace(
+            self,
+            weights=stored_weights.astype(self.weights.dtype),
+            scales=weights["scales"],
+        )
+
+
+@dataclass(frozen=True)
+class MLXQuantizedTiedEmbeddingConfig(EmbeddingConfigBase):
+    group_size: int
+    embedding_quantization_mode: QuantizationMode
+    activation_quantization_mode: QuantizationMode | None
+    activation_precision: DTypeLike
+
+    def random_init(
+        self,
+        vocab_size: int,
+        model_dim: int,
+        *,
+        key: PRNGKeyArray,
+    ) -> "QuantizedTiedEmbedding":
+        raise NotImplementedError
+
+    def empty(
+        self,
+        vocab_size: int,
+        model_dim: int,
+    ) -> "MLXQuantizedTiedEmbedding":
+        assert model_dim % self.group_size == 0
+        model_groups = model_dim // self.group_size
+        weights = dummy_array((vocab_size, model_dim), dtype=self.activation_precision)
+        scales = dummy_array((vocab_size, model_groups), dtype=self.activation_precision)
+        biases = dummy_array((vocab_size, model_groups), dtype=self.activation_precision)
+        return MLXQuantizedTiedEmbedding(config=self, weights=weights, scales=scales, biases=biases)
+
+
+class MLXQuantizedTiedEmbedding(EmbeddingBase[MLXQuantizedTiedEmbeddingConfig]):
+    weights: Float[Array, "vocabulary channels"]
+    scales: Float[Array, "vocabulary groups"]
+    biases: Float[Array, "vocabulary groups"]
+
+    @property
+    def activation_precision(self) -> DTypeLike:
+        return self.config.activation_precision
+
+    @property
+    def model_dim(self) -> int:
+        _, model_dim = self.weights.shape
+        return model_dim
+
+    @property
+    def vocab_size(self) -> int:
+        vocab_size, _ = self.weights.shape
+        return vocab_size
+
+    @property
+    def int_weights(self) -> Int[Array, "vocabulary channels"]:
+        quantized = quantize_weights(self.weights, self.config.embedding_quantization_mode)
+        casted = quantized.astype(self.config.embedding_quantization_mode.dtype)
+
+        if self.config.embedding_quantization_mode == QuantizationMode.UINT4:
+            packed = jax_uint4_to_packed_uint8(casted)
+        else:
+            packed = casted
+
+        return packed
+
+    def _prepare_weights(self) -> Float[Array, "vocabulary channels"]:
+        quantized_weights = quantize_weights(self.weights, self.config.embedding_quantization_mode)
+        grouped_weights = rearrange(
+            quantized_weights,
+            "vocab (groups elements) -> vocab groups elements",
+            elements=self.config.group_size,
+        )
+
+        scales = rearrange(self.scales, "vocab groups -> vocab groups 1")
+
+        biases = rearrange(self.biases, "vocab groups -> vocab groups 1")
+
+        scaled_grouped_weights = grouped_weights * scales + biases
+
+        result = rearrange(
+            scaled_grouped_weights,
+            "vocab groups elements -> vocab (groups elements)",
+        )
+        return result
+
+    def _prepare_input_weights(self) -> Float[Array, "vocabulary channels"]:
+        return self._prepare_weights()
+
+    def _prepare_output_weights(self) -> Float[Array, "vocabulary channels"]:
+        return self._prepare_weights()
+
+    @eqx.filter_jit
+    def readout(self, x: Float[Array, " channels"]) -> Float[Array, " vocabulary"]:
+        if self.config.activation_quantization_mode is not None:
+            x = dynamically_quantize_activations(x, self.config.activation_quantization_mode)
+        return super().readout(x)
+
+    def export_weights(self) -> ParameterTree:
+        return {
+            "weights": self.int_weights,
+            "scales": self.scales,
+            "biases": self.biases,
+        }
+
+    def import_weights(
+        self,
+        weights: ParameterTree[Array],
+    ) -> Self:
+        assert isinstance(weights, Mapping)
+        assert isinstance(weights["weights"], Array)
+        assert isinstance(weights["scales"], Array)
+        assert isinstance(weights["biases"], Array)
+
+        unpacked_weights = weights["weights"]
+
+        if self.config.embedding_quantization_mode == QuantizationMode.UINT4:
+            unpacked_weights = jax_uint8_to_unpacked_uint4(weights["weights"])
+
         return replace(
             self,
-            weights=weights["weights"].astype(self.weights.dtype),
+            weights=unpacked_weights.astype(self.weights.dtype),
             scales=weights["scales"],
+            biases=weights["biases"],
+        )
+
+
+@dataclass(frozen=True)
+class MLXSemiQuantizedUntiedEmbeddingConfig(EmbeddingConfigBase):
+    group_size: int
+    embedding_quantization_mode: QuantizationMode
+    activation_quantization_mode: QuantizationMode | None
+    activation_precision: DTypeLike
+
+    def random_init(
+        self,
+        vocab_size: int,
+        model_dim: int,
+        *,
+        key: PRNGKeyArray,
+    ) -> "MLXSemiQuantizedUntiedEmbedding":
+        raise NotImplementedError
+
+    def empty(
+        self,
+        vocab_size: int,
+        model_dim: int,
+    ) -> "MLXSemiQuantizedUntiedEmbedding":
+        assert model_dim % self.group_size == 0
+        model_groups = model_dim // self.group_size
+        input_weights = dummy_array((vocab_size, model_dim), dtype=self.activation_precision)
+        output_weights = dummy_array((vocab_size, model_dim), dtype=self.activation_precision)
+        output_scales = dummy_array((vocab_size, model_groups), dtype=self.activation_precision)
+        output_biases = dummy_array((vocab_size, model_groups), dtype=self.activation_precision)
+        return MLXSemiQuantizedUntiedEmbedding(
+            config=self,
+            input_weights=input_weights,
+            output_weights=output_weights,
+            output_scales=output_scales,
+            output_biases=output_biases,
+        )
+
+
+class MLXSemiQuantizedUntiedEmbedding(EmbeddingBase[MLXSemiQuantizedUntiedEmbeddingConfig]):
+    input_weights: Float[Array, "vocabulary channels"]
+    output_weights: Float[Array, "vocabulary channels"]
+    output_scales: Float[Array, "vocabulary groups"]
+    output_biases: Float[Array, "vocabulary groups"]
+
+    @property
+    def activation_precision(self) -> DTypeLike:
+        return self.config.activation_precision
+
+    @property
+    def model_dim(self) -> int:
+        _, model_dim = self.input_weights.shape
+        return model_dim
+
+    @property
+    def vocab_size(self) -> int:
+        vocab_size, _ = self.input_weights.shape
+        return vocab_size
+
+    @property
+    def int_output_weights(self) -> Int[Array, "vocabulary channels"]:
+        quantized = quantize_weights(self.output_weights, self.config.embedding_quantization_mode)
+        casted = quantized.astype(self.config.embedding_quantization_mode.dtype)
+
+        if self.config.embedding_quantization_mode == QuantizationMode.UINT4:
+            packed = jax_uint4_to_packed_uint8(casted)
+        else:
+            packed = casted
+
+        return packed
+
+    def _prepare_input_weights(self) -> Float[Array, "vocabulary channels"]:
+        return self.input_weights
+
+    def _prepare_output_weights(self) -> Float[Array, "vocabulary channels"]:
+        quantized_weights = quantize_weights(self.output_weights, self.config.embedding_quantization_mode)
+        grouped_weights = rearrange(
+            quantized_weights,
+            "vocab (groups elements) -> vocab groups elements",
+            elements=self.config.group_size,
+        )
+
+        scales = rearrange(self.output_scales, "vocab groups -> vocab groups 1")
+
+        biases = rearrange(self.output_biases, "vocab groups -> vocab groups 1")
+
+        scaled_grouped_weights = grouped_weights * scales + biases
+
+        result = rearrange(
+            scaled_grouped_weights,
+            "vocab groups elements -> vocab (groups elements)",
+        )
+        return result
+
+    @eqx.filter_jit
+    def readout(self, x: Float[Array, " channels"]) -> Float[Array, " vocabulary"]:
+        if self.config.activation_quantization_mode is not None:
+            x = dynamically_quantize_activations(x, self.config.activation_quantization_mode)
+        return super().readout(x)
+
+    def export_weights(self) -> ParameterTree:
+        return {
+            "input_weights": self.input_weights,
+            "output_weights": self.int_output_weights,
+            "output_scales": self.output_scales,
+            "output_biases": self.output_biases,
+        }
+
+    def import_weights(
+        self,
+        weights: ParameterTree[Array],
+    ) -> Self:
+        assert isinstance(weights, Mapping)
+        assert isinstance(weights["input_weights"], Array)
+        assert isinstance(weights["output_weights"], Array)
+        assert isinstance(weights["output_scales"], Array)
+        assert isinstance(weights["output_biases"], Array)
+
+        unpacked_output_weights = weights["output_weights"]
+
+        if self.config.embedding_quantization_mode == QuantizationMode.UINT4:
+            unpacked_output_weights = jax_uint8_to_unpacked_uint4(weights["output_weights"])
+
+        return replace(
+            self,
+            input_weights=weights["input_weights"],
+            output_weights=unpacked_output_weights.astype(self.output_weights.dtype),
+            output_scales=weights["output_scales"],
+            output_biases=weights["output_biases"],
         )
 
 
-EmbeddingConfig = TiedEmbeddingConfig | UntiedEmbeddingConfig | QuantizedTiedEmbeddingConfig
+EmbeddingConfig = (
+    TiedEmbeddingConfig
+    | UntiedEmbeddingConfig
+    | QuantizedTiedEmbeddingConfig
+    | MLXQuantizedTiedEmbeddingConfig
+    | MLXSemiQuantizedUntiedEmbeddingConfig
+)
 
 
-register_config_union(EmbeddingConfig)
+register_config_union(EmbeddingConfig)  # type: ignore (pyright bug)
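
Note: the UINT4 branches in the embedding hunks rely on jax_uint4_to_packed_uint8 and jax_uint8_to_unpacked_uint4 from lalamo.utils, whose implementations are not part of this diff. A rough sketch of what such a nibble pack/unpack round trip typically looks like; the low-nibble-first layout and function names below are assumptions for illustration, not taken from the package:

    import jax.numpy as jnp

    # Illustrative only: the real helpers live in lalamo.utils and may differ.
    # Assumes two 4-bit codes per byte, with even indices stored in the low nibble.
    def pack_uint4_to_uint8(values: jnp.ndarray) -> jnp.ndarray:
        values = values.astype(jnp.uint8)
        low, high = values[..., 0::2], values[..., 1::2]
        return low | (high << 4)

    def unpack_uint8_to_uint4(packed: jnp.ndarray) -> jnp.ndarray:
        low = packed & 0x0F
        high = (packed >> 4) & 0x0F
        return jnp.stack([low, high], axis=-1).reshape(*packed.shape[:-1], -1)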