lalamo 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- lalamo/__init__.py +1 -1
- lalamo/language_model.py +22 -23
- lalamo/main.py +2 -16
- lalamo/model_import/common.py +24 -6
- lalamo/model_import/decoder_configs/__init__.py +2 -0
- lalamo/model_import/decoder_configs/common.py +4 -4
- lalamo/model_import/decoder_configs/executorch.py +17 -10
- lalamo/model_import/decoder_configs/huggingface/__init__.py +2 -0
- lalamo/model_import/decoder_configs/huggingface/common.py +37 -2
- lalamo/model_import/decoder_configs/huggingface/gemma2.py +33 -28
- lalamo/model_import/decoder_configs/huggingface/gemma3.py +34 -26
- lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +36 -29
- lalamo/model_import/decoder_configs/huggingface/llama.py +14 -12
- lalamo/model_import/decoder_configs/huggingface/llamba.py +170 -0
- lalamo/model_import/decoder_configs/huggingface/mistral.py +31 -30
- lalamo/model_import/decoder_configs/huggingface/qwen2.py +33 -25
- lalamo/model_import/decoder_configs/huggingface/qwen3.py +55 -28
- lalamo/model_import/loaders/executorch.py +5 -4
- lalamo/model_import/loaders/huggingface.py +321 -69
- lalamo/model_import/model_specs/__init__.py +2 -0
- lalamo/model_import/model_specs/common.py +16 -5
- lalamo/model_import/model_specs/llamba.py +40 -0
- lalamo/model_import/model_specs/qwen.py +29 -1
- lalamo/modules/__init__.py +33 -6
- lalamo/modules/activations.py +9 -2
- lalamo/modules/common.py +10 -5
- lalamo/modules/decoder.py +93 -97
- lalamo/modules/decoder_layer.py +85 -103
- lalamo/modules/embedding.py +279 -5
- lalamo/modules/linear.py +335 -30
- lalamo/modules/mlp.py +6 -7
- lalamo/modules/mlx_interop.py +19 -0
- lalamo/modules/rope.py +1 -1
- lalamo/modules/token_mixers/__init__.py +30 -0
- lalamo/modules/{attention.py → token_mixers/attention.py} +72 -70
- lalamo/modules/token_mixers/common.py +78 -0
- lalamo/modules/token_mixers/mamba.py +553 -0
- lalamo/modules/token_mixers/state/__init__.py +12 -0
- lalamo/modules/token_mixers/state/common.py +26 -0
- lalamo/modules/{kv_cache.py → token_mixers/state/kv_cache.py} +5 -16
- lalamo/modules/token_mixers/state/mamba_state.py +51 -0
- lalamo/utils.py +24 -2
- {lalamo-0.4.1.dist-info → lalamo-0.5.0.dist-info}/METADATA +3 -2
- lalamo-0.5.0.dist-info/RECORD +80 -0
- lalamo-0.4.1.dist-info/RECORD +0 -71
- {lalamo-0.4.1.dist-info → lalamo-0.5.0.dist-info}/WHEEL +0 -0
- {lalamo-0.4.1.dist-info → lalamo-0.5.0.dist-info}/entry_points.txt +0 -0
- {lalamo-0.4.1.dist-info → lalamo-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {lalamo-0.4.1.dist-info → lalamo-0.5.0.dist-info}/top_level.txt +0 -0

lalamo/model_import/loaders/executorch.py

@@ -180,17 +180,18 @@ def load_decoder_layer(
     weights_dict: Mapping[str, Array],
     path: ParameterPath,
 ) -> DecoderLayer:
-    if module.
+    if module.post_mixer_norm is not None:
         raise ValueError("Post attention normalization is not supported")
     if module.post_mlp_norm is not None:
         raise ValueError("Post MLP normalization is not supported")
-    attention_norm = load_rmsnorm(module.
-
+    attention_norm = load_rmsnorm(module.pre_mixer_norm, weights_dict, path / "attention_norm")
+    assert isinstance(module.mixer, Attention)
+    attention = load_attention(module.mixer, weights_dict, path / "attention")
     mlp_norm = load_rmsnorm(module.pre_mlp_norm, weights_dict, path / "ffn_norm")
     assert isinstance(module.mlp, DenseMLP)
     mlp = load_mlp(module.mlp, weights_dict, path / "feed_forward")
     return load_parameters(
-        lambda m: (m.
+        lambda m: (m.pre_mixer_norm, m.mixer, m.pre_mlp_norm, m.mlp),
         module,
         (attention_norm, attention, mlp_norm, mlp),
     )
lalamo/model_import/loaders/huggingface.py

@@ -1,8 +1,9 @@
 from collections.abc import Mapping
+from dataclasses import dataclass
 
 import jax.numpy as jnp
 from einops import rearrange
-from jaxtyping import Array
+from jaxtyping import Array, DTypeLike
 
 from lalamo.common import ParameterPath
 from lalamo.modules import (
@@ -13,7 +14,12 @@ from lalamo.modules import (
     FullPrecisionLinear,
     GroupQuantizedLinear,
     LinearBase,
+    Mamba2,
+    MLXQuantizedLinear,
+    MLXQuantizedTiedEmbedding,
+    MLXSemiQuantizedUntiedEmbedding,
     RMSNorm,
+    SeparableCausalConv,
     TiedEmbedding,
     UntiedEmbedding,
 )
@@ -26,10 +32,10 @@ from .utils import decode_mxfp4, deinterleave_pairwise_columns
 __all__ = ["load_huggingface"]
 
 
-
+AWQ_UINT4_REVERSE_ORDER = jnp.array([0, 4, 1, 5, 2, 6, 3, 7], dtype=jnp.int32)
 
 
-def
+def _reverse_uint4_order(array: Array, reverse_order: Array) -> Array:
     """Reverses the AWQ packing order to get the logical order of channels for INT4."""
     pack_factor = 32 // 4
     *_, last_dim = array.shape
@@ -37,13 +43,13 @@ def _reverse_uint4_awq_order(array: Array) -> Array:
         return array
 
     array_reshaped = rearrange(array, "... (group pack_factor) -> ... group pack_factor", pack_factor=pack_factor)
-    array_reordered = array_reshaped[...,
+    array_reordered = array_reshaped[..., reverse_order]
     return rearrange(array_reordered, "... group pack_factor -> ... (group pack_factor)")
 
 
 def unpack_int32(packed_weights: Array, mode: QuantizationMode) -> Array:
-    assert packed_weights.dtype
-        f"Expected packed_weights to be of dtype jnp.int32, got {packed_weights.dtype}"
+    assert packed_weights.dtype in (jnp.int32, jnp.uint32), (
+        f"Expected packed_weights to be of dtype jnp.(u)int32, got {packed_weights.dtype}"
     )
     assert 32 % mode.bits == 0
 
@@ -58,29 +64,18 @@ def unpack_int32(packed_weights: Array, mode: QuantizationMode) -> Array:
     return unpacked
 
 
-def
-
-
-
-
-) ->
-
-
-
-
-    if mode == QuantizationMode.UINT4:
-        unpacked_weights = _reverse_uint4_awq_order(unpacked_weights)
-
-    assert qzeros.dtype == jnp.int32
-    unpacked_zero_points = unpack_int32(qzeros, mode)
-    if mode == QuantizationMode.UINT4:
-        unpacked_zero_points = _reverse_uint4_awq_order(unpacked_zero_points)
-
-    weights = unpacked_weights.astype(module.config.activation_precision)
-    zero_points = unpacked_zero_points.astype(module.config.activation_precision)
-    processed_scales = scales.astype(module.config.activation_precision)
+def _process_quantized_tensor(
+    quantized: Array,
+    weight_quantization: QuantizationMode,
+    activation_precision: DTypeLike,
+    reverse_order: Array | None = None,
+) -> Array:
+    unpacked = unpack_int32(quantized, weight_quantization)
+    if reverse_order is not None:
+        assert weight_quantization == QuantizationMode.UINT4, "reverse order only supported on uint4 quant type"
+        unpacked = _reverse_uint4_order(unpacked, reverse_order)
 
-    return
+    return unpacked.astype(activation_precision)
 
 
 def _fuse_full_precision_weights(
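For reference, a self-contained sketch of the bit layout these helpers deal with: each int32 word packs 32 // 4 = 8 unsigned 4-bit values (assumed least-significant nibble first, the usual convention for this kind of loader), and AWQ additionally interleaves channels within each group of eight, which is what `AWQ_UINT4_REVERSE_ORDER` undoes. The snippet uses toy values and plain `jax.numpy` / `einops` only; it is illustrative, not code from the package.

```python
import jax.numpy as jnp
from einops import rearrange

AWQ_UINT4_REVERSE_ORDER = jnp.array([0, 4, 1, 5, 2, 6, 3, 7], dtype=jnp.int32)


def unpack_uint4(packed: jnp.ndarray) -> jnp.ndarray:
    # Each int32 word holds eight 4-bit values; extract them LSB-first by shifting and masking.
    shifts = jnp.arange(8, dtype=jnp.int32) * 4
    nibbles = (packed[..., None] >> shifts) & 0xF
    return rearrange(nibbles, "... words nibbles -> ... (words nibbles)")


packed = jnp.array([[0x76543210]], dtype=jnp.int32)  # toy word whose nibbles are 0..7 from LSB to MSB
unpacked = unpack_uint4(packed)                      # [[0, 1, 2, 3, 4, 5, 6, 7]]

# Apply the reverse-order permutation within each group of eight channels,
# which is the indexing step _reverse_uint4_order performs on real AWQ data.
grouped = rearrange(unpacked, "... (g p) -> ... g p", p=8)
logical = rearrange(grouped[..., AWQ_UINT4_REVERSE_ORDER], "... g p -> ... (g p)")
# logical == [[0, 4, 1, 5, 2, 6, 3, 7]] for this toy input
```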
@@ -95,26 +90,39 @@ def _fuse_full_precision_weights(
     return jnp.concatenate(weights, axis=0)
 
 
+@dataclass(frozen=True)
+class QuantizedParamLayout:
+    weight: str
+    scale: str
+    bias: str
+    transposed: bool
+
+
+AWQ_QUANTIZED_WEIGHT_LAYOUT = QuantizedParamLayout("qweight", "scales", "qzeros", transposed=True)
+MLX_QUANTIZED_WEIGHT_LAYOUT = QuantizedParamLayout("weight", "scales", "biases", transposed=False)
+
+
 def _fuse_quantized_weights(
     weights_dict: Mapping[str, Array],
     path: ParameterPath,
     sublayers_to_fuse: list[str] | None,
+    quantized_param_layout: QuantizedParamLayout,
 ) -> tuple[Array, Array, Array]:
     # Note that AWQ quantized weights are stored transposed relative to full-precision weights
 
     if sublayers_to_fuse is None:
-        qweights = weights_dict[path /
-        qzeros = weights_dict[path /
-        scales = weights_dict[path /
+        qweights = weights_dict[path / quantized_param_layout.weight]
+        qzeros = weights_dict[path / quantized_param_layout.bias]
+        scales = weights_dict[path / quantized_param_layout.scale]
         return qweights, qzeros, scales
 
-    qweights = [weights_dict[path / layer_name /
-    qzeros = [weights_dict[path / layer_name /
-    scales = [weights_dict[path / layer_name /
+    qweights = [weights_dict[path / layer_name / quantized_param_layout.weight] for layer_name in sublayers_to_fuse]
+    qzeros = [weights_dict[path / layer_name / quantized_param_layout.bias] for layer_name in sublayers_to_fuse]
+    scales = [weights_dict[path / layer_name / quantized_param_layout.scale] for layer_name in sublayers_to_fuse]
 
-    fused_qweights = jnp.concatenate(qweights, axis=
-    fused_qzeros = jnp.concatenate(qzeros, axis=
-    fused_scales = jnp.concatenate(scales, axis=
+    fused_qweights = jnp.concatenate(qweights, axis=int(quantized_param_layout.transposed))
+    fused_qzeros = jnp.concatenate(qzeros, axis=int(quantized_param_layout.transposed))
+    fused_scales = jnp.concatenate(scales, axis=int(quantized_param_layout.transposed))
 
     return fused_qweights, fused_qzeros, fused_scales
 
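The `transposed` flag is what picks the concatenation axis when several projections are fused: AWQ checkpoints store quantized weights transposed relative to full-precision weights (roughly in_features by packed out_features, as the comment in the diff notes), so fused sublayers concatenate along axis 1, while the MLX layout concatenates along axis 0. A tiny standalone illustration of the `int(transposed)` trick, with made-up shapes:

```python
import jax.numpy as jnp

# Hypothetical packed weights for two projections that get fused (shapes are illustrative).
up_q = jnp.zeros((256, 128), dtype=jnp.int32)
gate_q = jnp.zeros((256, 128), dtype=jnp.int32)

fused_transposed = jnp.concatenate([up_q, gate_q], axis=int(True))   # AWQ-style: grow axis 1
assert fused_transposed.shape == (256, 256)

fused_row_major = jnp.concatenate([up_q, gate_q], axis=int(False))   # MLX-style: grow axis 0
assert fused_row_major.shape == (512, 128)
```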
@@ -148,34 +156,85 @@ def load_linear(
         return load_parameters(lambda m: (m.weights, m.biases), module, (weights, bias))
 
     if isinstance(module, GroupQuantizedLinear):
-        qweights, qzeros, scales = _fuse_quantized_weights(
+        qweights, qzeros, scales = _fuse_quantized_weights(
+            weights_dict,
+            path,
+            sublayers_to_fuse,
+            AWQ_QUANTIZED_WEIGHT_LAYOUT,
+        )
+        weight_quantization = module.config.weight_quantization_mode
+        activation_precision = module.activation_precision
+
+        if weight_quantization == QuantizationMode.UINT4:
+            reverse_order = AWQ_UINT4_REVERSE_ORDER
+        else:
+            reverse_order = None
 
-        weights
+        weights = _process_quantized_tensor(
             qweights,
+            weight_quantization,
+            activation_precision,
+            reverse_order,
+        )
+        zeros = _process_quantized_tensor(
             qzeros,
-
-
+            weight_quantization,
+            activation_precision,
+            reverse_order,
         )
+        scales = scales.astype(activation_precision)
 
         return load_parameters(
             lambda m: (m.weights, m.scales, m.zero_points, m.biases),
             module,
-            (weights.T, scales.T,
+            (weights.T, scales.T, zeros.T, bias),
+        )
+
+    if isinstance(module, MLXQuantizedLinear):
+        qweights, deq_biases, scales = _fuse_quantized_weights(
+            weights_dict,
+            path,
+            sublayers_to_fuse,
+            MLX_QUANTIZED_WEIGHT_LAYOUT,
+        )
+        weight_quantization = module.config.weight_quantization_mode
+        activation_precision = module.activation_precision
+
+        weights = _process_quantized_tensor(
+            qweights,
+            weight_quantization,
+            activation_precision,
+            None,
+        )
+        scales = scales.astype(activation_precision)
+        deq_biases = deq_biases.astype(activation_precision)
+
+        return load_parameters(
+            lambda m: (m.weights, m.scales, m.deq_biases, m.biases),
+            module,
+            (weights, scales, deq_biases, bias),
         )
 
     raise TypeError(f"Unsupported module type for loading: {type(module)}")
 
 
-def load_mlp(
+def load_mlp(
+    module: MLPBase,
+    weights_dict: Mapping[str, Array],
+    path: ParameterPath,
+    up_proj_key: str,
+    gate_proj_key: str,
+    down_proj_key: str,
+) -> MLPBase:
     if isinstance(module, DenseMLP):
         # Standard dense MLP with separate sublayers.
         up_projection = load_linear(
             module.up_projection,
             weights_dict,
             path,
-            sublayers_to_fuse=[
+            sublayers_to_fuse=[up_proj_key, gate_proj_key],
         )
-        down_projection = load_linear(module.down_projection, weights_dict, path /
+        down_projection = load_linear(module.down_projection, weights_dict, path / down_proj_key)
         return load_parameters(
             lambda m: (m.up_projection, m.down_projection),
             module,
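The two quantized branches above mirror two dequantization conventions. As I read the parameter names, the AWQ-style `GroupQuantizedLinear` keeps zero points (roughly `w ≈ (q - zero_point) * scale`), while `MLXQuantizedLinear` keeps additive dequantization biases (`w ≈ q * scale + bias`, hence `deq_biases`). A standalone numeric sketch of how the two forms relate; this is my reading of the formats, not package code:

```python
import jax.numpy as jnp

q = jnp.array([0.0, 5.0, 15.0])          # toy 4-bit codes
scale, zero_point = 0.1, 8.0
bias = -zero_point * scale               # -0.8

awq_style = (q - zero_point) * scale     # [-0.8, -0.3, 0.7]
mlx_style = q * scale + bias             # [-0.8, -0.3, 0.7] -- identical when bias == -zero_point * scale
```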
@@ -250,7 +309,7 @@ def load_moe(module: MixtureOfExperts, weights_dict: Mapping[str, Array], path:
         )
     else:
         # Fallback: recursively load a standard DenseMLP experts module
-        experts = load_mlp(module.experts, weights_dict, experts_path)
+        experts = load_mlp(module.experts, weights_dict, experts_path, "up_proj", "gate_proj", "down_proj")
 
     return load_parameters(
         lambda m: (m.router, m.experts),
@@ -304,28 +363,107 @@ def load_attention(
     )
 
 
+def _load_mamba_conv(
+    conv_module: SeparableCausalConv,
+    weights_dict: Mapping[str, Array],
+    path: ParameterPath,
+) -> SeparableCausalConv:
+    weight_path = path / "conv1d" / "weight"
+    if weight_path not in weights_dict:
+        weight_path = path / "conv_weight"
+    if weight_path not in weights_dict:
+        weight_path = None
+
+    if weight_path is not None:
+        raw = weights_dict[weight_path]
+        conv_weight = raw.squeeze(1) if raw.ndim == 3 else raw
+    else:
+        conv_weight = conv_module.weights
+
+    bias_path = path / "conv1d" / "bias"
+    if bias_path not in weights_dict:
+        bias_path = path / "conv_bias"
+    if bias_path not in weights_dict:
+        bias_path = None
+
+    if bias_path is not None and conv_module.biases is not None:
+        conv_bias = weights_dict[bias_path]
+    else:
+        conv_bias = conv_module.biases
+
+    return load_parameters(
+        lambda m: (m.weights, m.biases),
+        conv_module,
+        (conv_weight, conv_bias),
+    )
+
+
+def load_mamba2(
+    module: Mamba2,
+    weights_dict: Mapping[str, Array],
+    path: ParameterPath,
+) -> Mamba2:
+    in_projection = load_linear(module.in_projection, weights_dict, path / "in_proj")
+    out_projection = load_linear(module.out_projection, weights_dict, path / "out_proj")
+    conv = _load_mamba_conv(module.conv, weights_dict, path)
+
+    skip_connection_weight_path = path / "D"
+    if skip_connection_weight_path in weights_dict:
+        skip_connection_weight = weights_dict[skip_connection_weight_path]
+    else:
+        skip_connection_weight = module.skip_connection_weight
+
+    gate_bias_path = path / "z_bias"
+    if gate_bias_path in weights_dict:
+        gate_bias = weights_dict[gate_bias_path]
+    else:
+        gate_bias = module.gate_bias
+
+    return load_parameters(
+        lambda m: (m.in_projection, m.out_projection, m.conv, m.skip_connection_weight, m.gate_bias),
+        module,
+        (in_projection, out_projection, conv, skip_connection_weight, gate_bias),
+    )
+
+
 def load_decoder_layer(
     module: DecoderLayer,
     weights_dict: Mapping[str, Array],
-
+    mixer_path: ParameterPath,
+    mlp_path: ParameterPath,
+    mixer_key: str,
+    mlp_key: str,
+    pre_mixer_norm_key: str,
+    pre_mlp_norm_key: str,
+    up_proj_key: str,
+    gate_proj_key: str,
+    down_proj_key: str,
 ) -> DecoderLayer:
     pre_attention_norm = load_rmsnorm(
-        module.
+        module.pre_mixer_norm,
         weights_dict,
-
+        mixer_path / pre_mixer_norm_key,
     )
-
-
+
+    # Load mixer (attention or mamba)
+    if isinstance(module.mixer, Attention):
+        mixer = load_attention(module.mixer, weights_dict, mixer_path / mixer_key)
+    elif isinstance(module.mixer, Mamba2):
+        mixer = load_mamba2(module.mixer, weights_dict, mixer_path / mixer_key)
+    else:
+        mixer = module.mixer
+
+    if module.post_mixer_norm is not None:
         post_attention_norm = load_rmsnorm(
-            module.
+            module.post_mixer_norm,
             weights_dict,
-
+            mixer_path / "post_attention_layernorm",
         )
 
         pre_mlp_norm = load_rmsnorm(
             module.pre_mlp_norm,
             weights_dict,
-
+            mlp_path / "pre_feedforward_layernorm",
         )
     else:
         post_attention_norm = None
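One detail worth noting in `_load_mamba_conv`: depthwise `Conv1d` checkpoints commonly store the kernel as `(channels, 1, kernel_size)`, and the `squeeze(1)` collapses the singleton axis into a 2-D layout, presumably the one `SeparableCausalConv` keeps internally. A standalone sketch with made-up shapes:

```python
import jax.numpy as jnp

# Hypothetical depthwise Conv1d weight as exported by torch: (channels, 1, kernel_size).
raw = jnp.ones((768, 1, 4))
conv_weight = raw.squeeze(1) if raw.ndim == 3 else raw  # drop the singleton input-channel axis
assert conv_weight.shape == (768, 4)
```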
@@ -333,41 +471,92 @@ def load_decoder_layer(
         pre_mlp_norm = load_rmsnorm(
             module.pre_mlp_norm,
             weights_dict,
-
+            mlp_path / pre_mlp_norm_key,
         )
 
-    mlp = load_mlp(module.mlp, weights_dict,
+    mlp = load_mlp(module.mlp, weights_dict, mlp_path / mlp_key, up_proj_key, gate_proj_key, down_proj_key)
+
     if module.post_mlp_norm is not None:
         post_mlp_norm = load_rmsnorm(
             module.post_mlp_norm,
             weights_dict,
-
+            mlp_path / "post_feedforward_layernorm",
         )
     else:
         post_mlp_norm = None
+
     return load_parameters(
-        lambda m: (m.
+        lambda m: (m.pre_mixer_norm, m.mixer, m.post_mixer_norm, m.pre_mlp_norm, m.mlp, m.post_mlp_norm),
         module,
-        (pre_attention_norm,
+        (pre_attention_norm, mixer, post_attention_norm, pre_mlp_norm, mlp, post_mlp_norm),
     )
 
 
 def load_tied_embedding(
     module: TiedEmbedding,
     weights_dict: Mapping[str, Array],
-
+    embedding_path: ParameterPath,
 ) -> TiedEmbedding:
-    weights = weights_dict[
+    weights = weights_dict[embedding_path / "weight"]
     return load_parameters(lambda m: (m.weights,), module, (weights,))
 
 
+def load_mlx_quantized_tied_embedding(
+    module: MLXQuantizedTiedEmbedding,
+    weights_dict: Mapping[str, Array],
+    embedding_path: ParameterPath,
+) -> MLXQuantizedTiedEmbedding:
+    qweights = weights_dict[embedding_path / "weight"]
+    qscales = weights_dict[embedding_path / "scales"]
+    qbiases = weights_dict[embedding_path / "biases"]
+
+    weights = _process_quantized_tensor(
+        qweights,
+        module.config.embedding_quantization_mode,
+        module.activation_precision,
+        None,
+    )
+    scales = qscales.astype(module.activation_precision)
+    biases = qbiases.astype(module.activation_precision)
+
+    return load_parameters(lambda m: (m.weights, m.scales, m.biases), module, (weights, scales, biases))
+
+
+def load_mlx_semi_quantized_untied_embedding(
+    module: MLXSemiQuantizedUntiedEmbedding,
+    weights_dict: Mapping[str, Array],
+    embedding_path: ParameterPath,
+    lm_head_path: ParameterPath,
+) -> MLXSemiQuantizedUntiedEmbedding:
+    input_weights = weights_dict[embedding_path / "weight"]
+
+    output_qweights = weights_dict[lm_head_path / "weight"]
+    output_qscales = weights_dict[lm_head_path / "scales"]
+    output_qbiases = weights_dict[lm_head_path / "biases"]
+
+    output_weights = _process_quantized_tensor(
+        output_qweights,
+        module.config.embedding_quantization_mode,
+        module.activation_precision,
+        None,
+    )
+    output_scales = output_qscales.astype(module.activation_precision)
+    output_biases = output_qbiases.astype(module.activation_precision)
+
+    return load_parameters(
+        lambda m: (m.input_weights, m.output_weights, m.output_scales, m.output_biases),
+        module,
+        (input_weights, output_weights, output_scales, output_biases),
+    )
+
+
 def load_untied_embedding(
     module: UntiedEmbedding,
     weights_dict: Mapping[str, Array],
-
+    embedding_path: ParameterPath,
     lm_head_path: ParameterPath,
 ) -> UntiedEmbedding:
-    input_weights = weights_dict[
+    input_weights = weights_dict[embedding_path / "weight"]
     output_weights = weights_dict[lm_head_path / "weight"]
     return load_parameters(lambda m: (m.input_weights, m.output_weights), module, (input_weights, output_weights))
 
@@ -381,19 +570,82 @@ def load_huggingface(
     else:
         base_path = ParameterPath()
 
-
-
+    is_llamba_full_precision = any(key.startswith("backbone.") for key in weights_dict)
+    is_llamba_mlx = any(key.startswith("embedding.encoder.") for key in weights_dict)
+    if is_llamba_full_precision:
+        decoder_path = base_path / "backbone"
+        embedding_path = decoder_path / "embedding"
+        pre_mixer_norm_key = "input_layernorm"
+        mixer_key = "mixer"
+        pre_mlp_norm_key = "post_attention_layernorm"
+        mlp_key = "mlp"
+        up_proj_key = "up_proj"
+        gate_proj_key = "gate_proj"
+        down_proj_key = "down_proj"
+        alternating_layers = False
+        norm_key = "final_layernorm"
+        lm_head_path = base_path / "lm_head"
+    elif is_llamba_mlx:
+        decoder_path = base_path / "model"
+        embedding_path = base_path / "embedding.encoder"
+        pre_mixer_norm_key = "norm"
+        mixer_key = "layer"
+        pre_mlp_norm_key = "norm"
+        mlp_key = "layer"
+        up_proj_key = "gate_proj"
+        gate_proj_key = "in_proj"
+        down_proj_key = "out_proj"
+        alternating_layers = True
+        norm_key = "norm"
+        lm_head_path = base_path / "head.linear"
+    else:
+        decoder_path = base_path / "model"
+        embedding_path = decoder_path / "embed_tokens"
+        pre_mixer_norm_key = "input_layernorm"
+        mixer_key = "self_attn"
+        pre_mlp_norm_key = "post_attention_layernorm"
+        mlp_key = "mlp"
+        up_proj_key = "up_proj"
+        gate_proj_key = "gate_proj"
+        down_proj_key = "down_proj"
+        alternating_layers = False
+        norm_key = "norm"
+        lm_head_path = base_path / "lm_head"
 
     if isinstance(module.embedding, TiedEmbedding):
-        embedding = load_tied_embedding(module.embedding, weights_dict,
+        embedding = load_tied_embedding(module.embedding, weights_dict, embedding_path)
+    elif isinstance(module.embedding, MLXQuantizedTiedEmbedding):
+        embedding = load_mlx_quantized_tied_embedding(module.embedding, weights_dict, embedding_path)
+    elif isinstance(module.embedding, MLXSemiQuantizedUntiedEmbedding):
+        embedding = load_mlx_semi_quantized_untied_embedding(
+            module.embedding,
+            weights_dict,
+            embedding_path,
+            lm_head_path,
+        )
     elif isinstance(module.embedding, UntiedEmbedding):
-        embedding = load_untied_embedding(module.embedding, weights_dict,
+        embedding = load_untied_embedding(module.embedding, weights_dict, embedding_path, lm_head_path)
     else:
         raise TypeError(f"Unsupported embedding type: {type(module.embedding)}")
+
     decoder_layers = tuple(
-        load_decoder_layer(
+        load_decoder_layer(
+            layer,
+            weights_dict,
+            decoder_path / "layers" / ((i * 2) if alternating_layers else i),
+            decoder_path / "layers" / ((i * 2 + 1) if alternating_layers else i),
+            mixer_key,
+            mlp_key,
+            pre_mixer_norm_key,
+            pre_mlp_norm_key,
+            up_proj_key,
+            gate_proj_key,
+            down_proj_key,
+        )
+        for i, layer in enumerate(module.layers)
     )
-
+
+    output_norm = load_rmsnorm(module.output_norm, weights_dict, decoder_path / norm_key)
     return load_parameters(
         lambda m: (m.embedding, m.layers, m.output_norm),
         module,
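The `alternating_layers` flag encodes how the MLX Llamba checkpoints appear to be laid out: each logical decoder layer is split across two consecutive `layers.*` entries, one read as the mixer and the next as the MLP, so layer `i` uses indices `2*i` and `2*i + 1`; the other formats use the same index for both. A standalone sketch of the index mapping only:

```python
alternating_layers = True
num_decoder_layers = 3  # arbitrary toy value

for i in range(num_decoder_layers):
    mixer_index = (i * 2) if alternating_layers else i
    mlp_index = (i * 2 + 1) if alternating_layers else i
    print(f"decoder layer {i}: mixer <- layers.{mixer_index}, mlp <- layers.{mlp_index}")
# decoder layer 0: mixer <- layers.0, mlp <- layers.1
# decoder layer 1: mixer <- layers.2, mlp <- layers.3
# decoder layer 2: mixer <- layers.4, mlp <- layers.5
```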
lalamo/model_import/model_specs/__init__.py

@@ -4,6 +4,7 @@ from .gemma import GEMMA_MODELS
 from .gpt_oss import GPT_OSS_MODELS
 from .huggingface import HUGGINGFACE_MODELS
 from .llama import LLAMA_MODELS
+from .llamba import LLAMBA_MODELS
 from .mistral import MISTRAL_MODELS
 
 # from .pleias import PLEIAS_MODELS
@@ -22,6 +23,7 @@ __all__ = [
 
 ALL_MODEL_LISTS = [
     LLAMA_MODELS,
+    LLAMBA_MODELS,
     DEEPSEEK_MODELS,
     GEMMA_MODELS,
     HUGGINGFACE_MODELS,
lalamo/model_import/model_specs/common.py

@@ -20,6 +20,7 @@ from lalamo.utils import MapDictValues, open_safetensors
 __all__ = [
     "ConfigMap",
     "FileSpec",
+    "JSONFieldSpec",
     "ModelSpec",
     "UseCase",
     "WeightsType",
@@ -39,17 +40,21 @@ class WeightsType(Enum):
     TORCH = "torch"
 
     @contextmanager
-    def load(
+    def load(
+        self,
+        filename: Path | str,
+        float_dtype: DTypeLike,
+    ) -> Iterator[tuple[Mapping[str, jnp.ndarray], Mapping[str, str]]]:
         if self == WeightsType.SAFETENSORS:
-            with open_safetensors(filename) as weights_dict:
-                yield MapDictValues(lambda v: cast_if_float(v, float_dtype), weights_dict)
+            with open_safetensors(filename) as (weights_dict, metadata_dict):
+                yield MapDictValues(lambda v: cast_if_float(v, float_dtype), weights_dict), metadata_dict or {}
         else:
             import torch
 
             from lalamo.modules.torch_interop import torch_to_jax
 
             torch_weights = torch.load(filename, map_location="cpu", weights_only=True)
-            yield MapDictValues(lambda v: cast_if_float(torch_to_jax(v), float_dtype), torch_weights)
+            yield MapDictValues(lambda v: cast_if_float(torch_to_jax(v), float_dtype), torch_weights), {}
 
 
 class UseCase(Enum):
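With this change, `WeightsType.load` yields a `(weights, metadata)` pair instead of bare weights: safetensors metadata is passed through, and the torch branch supplies an empty mapping. A hypothetical call site, with placeholder file name and dtype, assuming a direct import from the module shown in the file list:

```python
import jax.numpy as jnp

from lalamo.model_import.model_specs.common import WeightsType

# Unpack the new (weights, metadata) tuple yielded by the context manager.
with WeightsType.SAFETENSORS.load("model.safetensors", jnp.bfloat16) as (weights_dict, metadata_dict):
    print(len(weights_dict), "tensors; metadata:", dict(metadata_dict))
```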
@@ -62,13 +67,19 @@ class FileSpec:
     repo: str | None = None
 
 
+@dataclass(frozen=True)
+class JSONFieldSpec:
+    file_spec: FileSpec
+    field_name: str
+
+
 @dataclass(frozen=True)
 class ConfigMap:
     model_config: FileSpec = field(default=FileSpec("config.json"))
     tokenizer: FileSpec = field(default=FileSpec("tokenizer.json"))
     tokenizer_config: FileSpec = field(default=FileSpec("tokenizer_config.json"))
     generation_config: FileSpec | None = field(default=FileSpec("generation_config.json"))
-    chat_template: FileSpec | None = None
+    chat_template: FileSpec | JSONFieldSpec | None = None
 
 
 def _is_foreign_config_type(t: object) -> bool:
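`chat_template` can now point at a field inside a JSON config file rather than a standalone template file. A hypothetical spec using the new `JSONFieldSpec`; the file and field names are illustrative, chosen because HuggingFace tokenizers conventionally keep `chat_template` inside `tokenizer_config.json`:

```python
from lalamo.model_import.model_specs.common import ConfigMap, FileSpec, JSONFieldSpec

# Pull the chat template from a JSON field instead of a dedicated file.
config_map = ConfigMap(
    chat_template=JSONFieldSpec(FileSpec("tokenizer_config.json"), "chat_template"),
)
```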