lalamo 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- lalamo/__init__.py +20 -5
- lalamo/data/__init__.py +8 -0
- lalamo/data/huggingface_message.py +38 -0
- lalamo/data/lalamo_completions.py +43 -0
- lalamo/data/utils.py +8 -0
- lalamo/language_model.py +152 -69
- lalamo/main.py +271 -43
- lalamo/message_processor.py +11 -1
- lalamo/model_import/common.py +17 -7
- lalamo/model_import/decoder_configs/__init__.py +3 -0
- lalamo/model_import/decoder_configs/executorch.py +12 -6
- lalamo/model_import/decoder_configs/huggingface/__init__.py +2 -0
- lalamo/model_import/decoder_configs/huggingface/common.py +1 -3
- lalamo/model_import/decoder_configs/huggingface/gemma2.py +11 -5
- lalamo/model_import/decoder_configs/huggingface/gemma3.py +14 -5
- lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +195 -0
- lalamo/model_import/decoder_configs/huggingface/llama.py +38 -8
- lalamo/model_import/decoder_configs/huggingface/mistral.py +12 -6
- lalamo/model_import/decoder_configs/huggingface/qwen2.py +12 -6
- lalamo/model_import/decoder_configs/huggingface/qwen3.py +12 -6
- lalamo/model_import/huggingface_tokenizer_config.py +1 -4
- lalamo/model_import/loaders/executorch.py +10 -9
- lalamo/model_import/loaders/huggingface.py +104 -9
- lalamo/model_import/loaders/utils.py +92 -0
- lalamo/model_import/model_specs/__init__.py +4 -1
- lalamo/model_import/model_specs/common.py +15 -12
- lalamo/model_import/model_specs/gpt_oss.py +21 -0
- lalamo/modules/__init__.py +35 -7
- lalamo/modules/activations.py +24 -14
- lalamo/modules/attention.py +73 -20
- lalamo/modules/common.py +8 -57
- lalamo/modules/decoder.py +48 -34
- lalamo/modules/decoder_layer.py +57 -43
- lalamo/modules/embedding.py +13 -19
- lalamo/modules/kv_cache.py +53 -16
- lalamo/modules/linear.py +260 -79
- lalamo/modules/mlp.py +395 -23
- lalamo/modules/normalization.py +2 -3
- lalamo/modules/rope.py +32 -21
- lalamo/modules/utils.py +10 -0
- lalamo/speculator/__init__.py +11 -0
- lalamo/speculator/common.py +22 -0
- lalamo/speculator/inference.py +75 -0
- lalamo/speculator/ngram.py +154 -0
- lalamo/speculator/utils.py +52 -0
- lalamo/utils.py +27 -0
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/METADATA +11 -4
- lalamo-0.4.0.dist-info/RECORD +71 -0
- lalamo-0.3.3.dist-info/RECORD +0 -59
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/WHEEL +0 -0
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/entry_points.txt +0 -0
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/top_level.txt +0 -0
lalamo/model_import/loaders/huggingface.py
CHANGED

@@ -6,10 +6,10 @@ from jaxtyping import Array
 
 from lalamo.common import ParameterPath
 from lalamo.modules import (
-    MLP,
     Attention,
     Decoder,
     DecoderLayer,
+    DenseMLP,
     FullPrecisionLinear,
     GroupQuantizedLinear,
     LinearBase,
@@ -17,9 +17,11 @@ from lalamo.modules import (
     TiedEmbedding,
     UntiedEmbedding,
 )
+from lalamo.modules.mlp import MixtureOfExperts, MLPBase
 from lalamo.quantization import QuantizationMode
 
 from .common import load_parameters
+from .utils import decode_mxfp4, deinterleave_pairwise_columns
 
 __all__ = ["load_huggingface"]
 
@@ -78,7 +80,7 @@ def _process_quantized_tensors(
     zero_points = unpacked_zero_points.astype(module.config.activation_precision)
     processed_scales = scales.astype(module.config.activation_precision)
 
-    return weights
+    return weights, zero_points, processed_scales
 
 
 def _fuse_full_precision_weights(
@@ -158,16 +160,103 @@ def load_linear(
         return load_parameters(
             lambda m: (m.weights, m.scales, m.zero_points, m.biases),
             module,
-            (weights, scales, zero_points, bias),
+            (weights.T, scales.T, zero_points.T, bias),
         )
 
     raise TypeError(f"Unsupported module type for loading: {type(module)}")
 
 
-def load_mlp(module:
-
-
-
+def load_mlp(module: MLPBase, weights_dict: Mapping[str, Array], path: ParameterPath) -> MLPBase:
+    if isinstance(module, DenseMLP):
+        # Standard dense MLP with separate sublayers.
+        up_projection = load_linear(
+            module.up_projection,
+            weights_dict,
+            path,
+            sublayers_to_fuse=["up_proj", "gate_proj"],
+        )
+        down_projection = load_linear(module.down_projection, weights_dict, path / "down_proj")
+        return load_parameters(
+            lambda m: (m.up_projection, m.down_projection),
+            module,
+            (up_projection, down_projection),
+        )
+
+    if isinstance(module, MixtureOfExperts):
+        return load_moe(module, weights_dict, path)
+
+    raise TypeError(f"Unsupported module type for loading: {type(module)}")
+
+
+def load_moe(module: MixtureOfExperts, weights_dict: Mapping[str, Array], path: ParameterPath) -> MixtureOfExperts:
+    # Load router via the standard linear loader
+    router = load_linear(module.router, weights_dict, path / "router")
+
+    experts_path = path / "experts"
+    # Handle fused MXFP4 experts layout if present
+    if (experts_path / "gate_up_proj_blocks") in weights_dict:
+        # Decode fused gate/up (interleaved), split into (up, gate), and add +1.0 to up bias
+        fused = decode_mxfp4(
+            weights_dict[experts_path / "gate_up_proj_blocks"],
+            weights_dict[experts_path / "gate_up_proj_scales"],
+            dtype=module.activation_precision,
+            flatten=False,
+        )
+        # Stored as (experts, outputs=2*hidden_dim, input_blocks, input_block_elems)
+        # Merge blocks and move outputs last
+        fused_eio = rearrange(fused, "e o ib ie -> e (ib ie) o")
+        up_w, gate_w = deinterleave_pairwise_columns(fused_eio, first="odd")
+        combined_up_gate = jnp.concatenate([up_w, gate_w], axis=-1)
+        # Transpose to new layout: (experts, outputs, inputs)
+        combined_up_gate_w = jnp.swapaxes(combined_up_gate, -1, -2)
+
+        gub = weights_dict[experts_path / "gate_up_proj_bias"]
+        if gub.ndim == 1:
+            # Broadcast to (experts, 2*hidden_dim)
+            gub = jnp.broadcast_to(gub, (combined_up_gate_w.shape[0], gub.shape[0]))
+        up_b, gate_b = deinterleave_pairwise_columns(gub, first="odd")
+        combined_up_gate_b = jnp.concatenate([up_b + 1.0, gate_b], axis=-1)
+
+        up_projection = load_parameters(
+            lambda m: (m.weights, m.biases),  # type: ignore
+            module.experts.up_projection,
+            (combined_up_gate_w, combined_up_gate_b),
+        )
+
+        # Down projection: decode MXFP4 to dense
+        down_w = decode_mxfp4(
+            weights_dict[experts_path / "down_proj_blocks"],
+            weights_dict[experts_path / "down_proj_scales"],
+            dtype=module.activation_precision,
+            flatten=False,
+        )
+        # Stored as (experts, outputs=model_dim, input_blocks, input_block_elems)
+        # Merge blocks and move outputs last
+        down_w = rearrange(down_w, "e o ib ie -> e o (ib ie)")
+        down_b = weights_dict[experts_path / "down_proj_bias"]
+        if down_b.ndim == 1:
+            down_b = jnp.broadcast_to(down_b, down_w.shape[:-1] + (down_b.shape[0],))
+
+        down_projection = load_parameters(
+            lambda m: (m.weights, m.biases),  # type: ignore
+            module.experts.down_projection,
+            (down_w, down_b),
+        )
+
+        experts = load_parameters(
+            lambda m: (m.up_projection, m.down_projection),
+            module.experts,
+            (up_projection, down_projection),
+        )
+    else:
+        # Fallback: recursively load a standard DenseMLP experts module
+        experts = load_mlp(module.experts, weights_dict, experts_path)
+
+    return load_parameters(
+        lambda m: (m.router, m.experts),
+        module,
+        (router, experts),
+    )
 
 
 def load_rmsnorm(
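The `first="odd"` split above follows from the checkpoint layout: the fused tensor interleaves gate columns at even indices and up columns at odd indices, and the `up_b + 1.0` term appears to fold GPT-OSS's `(up + 1)` activation factor into the bias so the runtime MLP needs no special case. A minimal sketch of the layout transformation on hypothetical toy shapes, assuming einops' rearrange (which the loader appears to use):

import jax.numpy as jnp
from einops import rearrange

experts, hidden, blocks, block_elems = 2, 4, 3, 8  # toy dimensions
fused = jnp.zeros((experts, 2 * hidden, blocks, block_elems))  # decoded MXFP4 layout

fused_eio = rearrange(fused, "e o ib ie -> e (ib ie) o")  # merge input blocks
up_w = fused_eio[..., 1::2]    # odd output columns -> up projection
gate_w = fused_eio[..., 0::2]  # even output columns -> gate projection
combined = jnp.concatenate([up_w, gate_w], axis=-1)  # (experts, inputs, 2*hidden)
weights = jnp.swapaxes(combined, -1, -2)             # (experts, outputs, inputs)
assert weights.shape == (experts, 2 * hidden, blocks * block_elems)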
@@ -202,10 +291,16 @@ def load_attention(
     else:
         key_norm = None
 
+    # GPT-OSS adds per-head attention sinks; load them if present.
+    if (path / "sinks") in weights_dict:
+        sinks = weights_dict[path / "sinks"]
+    else:
+        sinks = module.sinks
+
     return load_parameters(
-        lambda m: (m.qkv_projection, m.out_projection, m.query_norm, m.key_norm),
+        lambda m: (m.qkv_projection, m.out_projection, m.query_norm, m.key_norm, m.sinks),
         module,
-        (qkv_projection, out_projection, query_norm, key_norm),
+        (qkv_projection, out_projection, query_norm, key_norm, sinks),
     )
 
 
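For context, GPT-OSS attention sinks are learned per-head logits that join the softmax normalization without contributing a value vector, letting a head attend "nowhere". A sketch of the usual formulation; lalamo's actual kernel lives in lalamo/modules/attention.py and may differ in detail:

import jax
import jax.numpy as jnp

def softmax_with_sinks(scores, sinks):
    # scores: (heads, q_len, kv_len) attention logits; sinks: (heads,) learned logits.
    heads, q_len, _ = scores.shape
    sink_col = jnp.broadcast_to(sinks[:, None, None], (heads, q_len, 1))
    weights = jax.nn.softmax(jnp.concatenate([scores, sink_col], axis=-1), axis=-1)
    return weights[..., :-1]  # drop the sink column; rows now sum to <= 1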
lalamo/model_import/loaders/utils.py
ADDED

@@ -0,0 +1,92 @@
+# MXFP4 decoding utilities for model loaders.
+# Based on OpenAI's reference implementation logic for GPT-OSS MXFP4 weights.
+# Converts packed FP4 blocks plus per-row scales into dense weights in the target dtype.
+
+import jax.numpy as jnp
+from jaxtyping import Array, DTypeLike
+
+__all__ = [
+    "decode_mxfp4",
+    "deinterleave_pairwise_columns",
+]
+
+
+# The 16 representable FP4 values used by MXFP4, in logical order (low nibble indices 0..15).
+# See: https://github.com/openai/gpt-oss/blob/main/gpt_oss/torch/weights.py
+_MXFP4_LUT_VALUES = (
+    0.0,
+    0.5,
+    1.0,
+    1.5,
+    2.0,
+    3.0,
+    4.0,
+    6.0,
+    -0.0,
+    -0.5,
+    -1.0,
+    -1.5,
+    -2.0,
+    -3.0,
+    -4.0,
+    -6.0,
+)
+
+
+def decode_mxfp4(
+    blocks: Array,
+    scales: Array,
+    *,
+    dtype: DTypeLike,
+    flatten: bool = False,
+) -> Array:
+    target_dtype = jnp.dtype(dtype)
+
+    # Prepare LUT in target dtype
+    lut = jnp.array(_MXFP4_LUT_VALUES, dtype=target_dtype)
+
+    *prefix, rows, packed_cols = blocks.shape
+    if scales.shape != (*prefix, rows):
+        raise ValueError(
+            f"MXFP4 scales shape {scales.shape} does not match blocks prefix/rows {(*prefix, rows)}",
+        )
+
+    # Extract low/high nibble indices
+    low_mask = jnp.array(0x0F, dtype=blocks.dtype)
+    idx_lo = (blocks & low_mask).astype(jnp.int32)
+    idx_hi = (blocks >> jnp.array(4, dtype=blocks.dtype)).astype(jnp.int32)
+
+    # Lookup FP4 base values
+    vals_lo = lut[idx_lo]
+    vals_hi = lut[idx_hi]
+
+    # Interleave into (..., rows, 2*packed_cols)
+    out_shape = (*prefix, rows, packed_cols * 2)
+    out = jnp.empty(out_shape, dtype=target_dtype)
+    out = out.at[..., 0::2].set(vals_lo)
+    out = out.at[..., 1::2].set(vals_hi)
+
+    # Apply exponent scaling: exponents are biased by 127 in checkpoints
+    exp = scales.astype(jnp.int32) - 127
+    out = jnp.ldexp(out, exp[..., None])
+
+    if flatten:
+        return out.reshape(*prefix, rows * (packed_cols * 2))
+    return out
+
+
+def deinterleave_pairwise_columns(
+    matrix: Array,
+    *,
+    first: str = "even",
+) -> tuple[Array, Array]:
+    if matrix.shape[-1] % 2 != 0:
+        raise ValueError(f"Last dimension must be even, got {matrix.shape[-1]}")
+
+    match first:
+        case "even":
+            return matrix[..., 0::2], matrix[..., 1::2]
+        case "odd":
+            return matrix[..., 1::2], matrix[..., 0::2]
+        case _:
+            raise ValueError("Parameter 'first' must be either 'even' or 'odd'")
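To make the packing concrete, here is a worked toy example (byte and scale values hand-picked for illustration): each byte holds two FP4 codes, low nibble first, and each row shares one power-of-two scale stored as a biased exponent:

import jax.numpy as jnp
from lalamo.model_import.loaders.utils import decode_mxfp4

# 0x72 -> nibbles (0x2, 0x7) -> LUT values (1.0, 6.0)
# 0x9C -> nibbles (0xC, 0x9) -> LUT values (-2.0, -0.5)
blocks = jnp.array([[0x72, 0x9C]], dtype=jnp.uint8)  # (rows=1, packed_cols=2)
scales = jnp.array([128], dtype=jnp.uint8)           # exponent 128 - 127 = 1

dense = decode_mxfp4(blocks, scales, dtype=jnp.float32)
print(dense)  # [[ 2. 12. -4. -1.]] -- each base value scaled by 2**1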
lalamo/model_import/model_specs/__init__.py
CHANGED

@@ -1,10 +1,12 @@
 from .common import FileSpec, ModelSpec, UseCase, build_quantized_models
 from .deepseek import DEEPSEEK_MODELS
 from .gemma import GEMMA_MODELS
+from .gpt_oss import GPT_OSS_MODELS
 from .huggingface import HUGGINGFACE_MODELS
 from .llama import LLAMA_MODELS
 from .mistral import MISTRAL_MODELS
-
+
+# from .pleias import PLEIAS_MODELS
 from .polaris import POLARIS_MODELS
 from .qwen import QWEN_MODELS
 from .reka import REKA_MODELS
@@ -23,6 +25,7 @@ ALL_MODEL_LISTS = [
     DEEPSEEK_MODELS,
     GEMMA_MODELS,
     HUGGINGFACE_MODELS,
+    GPT_OSS_MODELS,
     MISTRAL_MODELS,
     # PLEIAS_MODELS,  # TODO(norpadon): Add chat template
     POLARIS_MODELS,
lalamo/model_import/model_specs/common.py
CHANGED

@@ -1,7 +1,9 @@
 from collections.abc import (
     Callable,
+    Iterator,
     Mapping,
 )
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from enum import Enum
 from pathlib import Path
@@ -10,11 +12,10 @@ from typing import ClassVar, cast, get_args, get_origin
 import cattrs
 import jax.numpy as jnp
 from jaxtyping import Array, DTypeLike
-from safetensors.flax import load_file as load_safetensors
 
 from lalamo.model_import.decoder_configs import ForeignConfig
 from lalamo.quantization import QuantizationMode
-from lalamo.utils import MapDictValues
+from lalamo.utils import MapDictValues, open_safetensors
 
 __all__ = [
     "ConfigMap",
@@ -37,16 +38,18 @@ class WeightsType(Enum):
     SAFETENSORS = "safetensors"
     TORCH = "torch"
 
-
+    @contextmanager
+    def load(self, filename: Path | str, float_dtype: DTypeLike) -> Iterator[Mapping[str, jnp.ndarray]]:
         if self == WeightsType.SAFETENSORS:
-
+            with open_safetensors(filename) as weights_dict:
+                yield MapDictValues(lambda v: cast_if_float(v, float_dtype), weights_dict)
+        else:
+            import torch
 
-
+            from lalamo.modules.torch_interop import torch_to_jax
 
-
-
-            torch_weights = torch.load(filename, map_location="cpu", weights_only=True)
-            return MapDictValues(lambda v: cast_if_float(torch_to_jax(v), float_dtype), torch_weights)
+            torch_weights = torch.load(filename, map_location="cpu", weights_only=True)
+            yield MapDictValues(lambda v: cast_if_float(torch_to_jax(v), float_dtype), torch_weights)
 
 
 class UseCase(Enum):
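Since `load` is now a generator-based context manager, call sites switch from a plain call to a `with` block, which lets `open_safetensors` keep the file open (for example memory-mapped) only for the duration of the load. A hypothetical call site; the filename and tensor key are illustrative:

import jax.numpy as jnp

from lalamo.model_import.model_specs.common import WeightsType

with WeightsType.SAFETENSORS.load("model.safetensors", jnp.bfloat16) as weights:
    embeddings = weights["model.embed_tokens.weight"]  # illustrative key
# the mapping (and any underlying file handle) must not be used past this point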
@@ -75,8 +78,8 @@ def _is_foreign_config_type(t: object) -> bool:
 
 
 def _structure_foreign_config_factory(
-    t: object,
-    c: cattrs.Converter,
+    t: object,  # noqa: ARG001
+    c: cattrs.Converter,  # noqa: ARG001
 ) -> Callable[[object, object], type[ForeignConfig]]:
     name_to_type = {t.__name__: t for t in ForeignConfig.__descendants__()}
 
@@ -88,7 +91,7 @@ def _structure_foreign_config_factory(
     return _hook
 
 
-def _unstructure_foreign_config_factory(t: object, c: cattrs.Converter) -> Callable[[type[ForeignConfig]], str]:
+def _unstructure_foreign_config_factory(t: object, c: cattrs.Converter) -> Callable[[type[ForeignConfig]], str]:  # noqa: ARG001
     def _hook(v: type[ForeignConfig]) -> str:
         return v.__name__
 
lalamo/model_import/model_specs/gpt_oss.py
ADDED

@@ -0,0 +1,21 @@
+from lalamo.model_import.decoder_configs import HFGPTOssConfig
+
+from .common import ConfigMap, FileSpec, ModelSpec, WeightsType
+
+__all__ = ["GPT_OSS_MODELS"]
+
+GPT_OSS_MODELS = [
+    ModelSpec(
+        vendor="OpenAI",
+        family="GPT-OSS",
+        name="GPT-OSS-20B",
+        size="20B",
+        quantization=None,
+        repo="openai/gpt-oss-20b",
+        config_type=HFGPTOssConfig,
+        weights_type=WeightsType.SAFETENSORS,
+        configs=ConfigMap(
+            chat_template=FileSpec("chat_template.jinja"),
+        ),
+    ),
+]
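The new spec plugs into the existing registry via the `GPT_OSS_MODELS` import shown above; a hypothetical lookup, using only fields visible in this diff:

from lalamo.model_import.model_specs import GPT_OSS_MODELS

spec = GPT_OSS_MODELS[0]
print(spec.repo)          # "openai/gpt-oss-20b"
print(spec.weights_type)  # WeightsType.SAFETENSORS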
lalamo/modules/__init__.py
CHANGED
@@ -1,8 +1,14 @@
-from .activations import Activation
+from .activations import GELU, Activation, SiLU
 from .attention import Attention, AttentionConfig
-from .common import
-from .decoder import Decoder, DecoderActivationTrace, DecoderConfig, DecoderResult
-from .decoder_layer import
+from .common import AttentionType, ForwardPassMode, LalamoModule, config_converter
+from .decoder import Decoder, DecoderActivationTrace, DecoderConfig, DecoderForwardPassConfig, DecoderResult
+from .decoder_layer import (
+    DecoderLayer,
+    DecoderLayerActivationTrace,
+    DecoderLayerConfig,
+    DecoderLayerForwardPassConfig,
+    DecoderLayerResult,
+)
 from .embedding import (
     EmbeddingBase,
     EmbeddingConfig,
@@ -24,7 +30,17 @@ from .linear import (
     QLoRALinear,
     QLoRALinearConfig,
 )
-from .mlp import
+from .mlp import (
+    DenseMLP,
+    DenseMLPConfig,
+    MixtureOfExperts,
+    MixtureOfExpertsConfig,
+    MLPBase,
+    MLPConfig,
+    MLPForwardPassConfig,
+    RoutingFunction,
+    SoftmaxRouting,
+)
 from .normalization import RMSNorm, RMSNormConfig, UpcastMode
 from .rope import (
     LinearScalingRoPEConfig,
@@ -37,21 +53,27 @@ from .rope import (
 )
 
 __all__ = [
-    "
+    "GELU",
     "Activation",
     "Attention",
     "AttentionConfig",
+    "AttentionType",
     "Decoder",
     "DecoderActivationTrace",
     "DecoderConfig",
+    "DecoderForwardPassConfig",
     "DecoderLayer",
     "DecoderLayerActivationTrace",
     "DecoderLayerConfig",
+    "DecoderLayerForwardPassConfig",
     "DecoderLayerResult",
     "DecoderResult",
+    "DenseMLP",
+    "DenseMLPConfig",
     "DynamicKVCacheLayer",
     "EmbeddingBase",
     "EmbeddingConfig",
+    "ForwardPassMode",
     "FullPrecisionLinear",
     "FullPrecisionLinearConfig",
     "GroupQuantizedLinear",
@@ -63,7 +85,11 @@ __all__ = [
     "LinearConfig",
     "LinearScalingRoPEConfig",
     "LlamaRoPEConfig",
+    "MLPBase",
     "MLPConfig",
+    "MLPForwardPassConfig",
+    "MixtureOfExperts",
+    "MixtureOfExpertsConfig",
     "PositionalEmbeddings",
     "QLoRALinear",
     "QLoRALinearConfig",
@@ -73,6 +99,9 @@ __all__ = [
     "RMSNormConfig",
     "RoPE",
     "RoPEConfig",
+    "RoutingFunction",
+    "SiLU",
+    "SoftmaxRouting",
     "StaticKVCacheLayer",
     "TiedEmbedding",
     "TiedEmbeddingConfig",
@@ -80,7 +109,6 @@ __all__ = [
     "UntiedEmbedding",
     "UntiedEmbeddingConfig",
     "UpcastMode",
-    "WeightLayout",
     "YARNRoPEConfig",
     "config_converter",
 ]
lalamo/modules/activations.py
CHANGED
@@ -1,30 +1,40 @@
-from
+from abc import abstractmethod
 
 import jax
 import jax.numpy as jnp
-from
+from attr import dataclass
 from jaxtyping import Array, Float
 
+from lalamo.modules.common import register_config_union
+
 __all__ = [
+    "GELU",
     "Activation",
-    "
+    "SiLU",
 ]
 
 
-@
-
-
+@dataclass(frozen=True)
+class ActivationBase:
+    @abstractmethod
+    def __call__(self, x: Float[Array, "*dims"]) -> Float[Array, "*dims"]: ...
+
+
+@dataclass(frozen=True)
+class SiLU(ActivationBase):
+    alpha: float = 1.0
 
+    def __call__(self, x: Float[Array, "*dims"]) -> Float[Array, "*dims"]:
+        return x / (1 + jnp.exp(-x * self.alpha))
 
-class Activation(Enum):
-    SILU = "silu"
-    GELU = "gelu"
 
+@dataclass(frozen=True)
+class GELU(ActivationBase):
     def __call__(self, x: Float[Array, "*dims"]) -> Float[Array, "*dims"]:
-        return
+        return jax.nn.gelu(x)
+
+
+Activation = SiLU | GELU
 
 
-
-    Activation.SILU: silu,
-    Activation.GELU: jax.nn.gelu,
-}
+register_config_union(Activation)
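The enum-plus-function-table pairing is gone: `Activation` is now a union of frozen dataclasses registered with the config converter, so an activation can carry parameters (e.g. `SiLU.alpha`, presumably for GPT-OSS's scaled sigmoid) and still round-trip through config serialization. A quick check that the defaults reproduce the old behaviour:

import jax
import jax.numpy as jnp

from lalamo.modules import GELU, Activation, SiLU

x = jnp.linspace(-3.0, 3.0, 7)

silu: Activation = SiLU()  # alpha=1.0 matches the old SILU entry
assert jnp.allclose(silu(x), jax.nn.silu(x), atol=1e-6)

gelu: Activation = GELU()  # delegates to jax.nn.gelu, as before
assert jnp.allclose(gelu(x), jax.nn.gelu(x))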