lalamo 0.5.8__tar.gz → 0.5.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lalamo-0.5.8 → lalamo-0.5.10}/PKG-INFO +1 -1
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/__init__.py +1 -1
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/common.py +2 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/decoder_configs/__init__.py +2 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/decoder_configs/huggingface/__init__.py +2 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/decoder_configs/huggingface/gemma3.py +31 -9
- lalamo-0.5.10/lalamo/model_import/decoder_configs/huggingface/lfm2.py +174 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/loaders/huggingface.py +71 -10
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/model_specs/__init__.py +4 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/model_specs/common.py +1 -0
- lalamo-0.5.10/lalamo/model_import/model_specs/essential_ai.py +17 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/model_specs/huggingface.py +1 -1
- lalamo-0.5.10/lalamo/model_import/model_specs/lfm2.py +21 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/__init__.py +6 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/token_mixers/__init__.py +15 -2
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/token_mixers/common.py +1 -1
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/token_mixers/mamba.py +2 -2
- lalamo-0.5.10/lalamo/modules/token_mixers/short_conv.py +168 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/token_mixers/state/__init__.py +2 -0
- lalamo-0.5.10/lalamo/modules/token_mixers/state/short_conv_state.py +33 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/transformer.py +18 -6
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/transformer_layer.py +1 -1
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/utils.py +7 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo.egg-info/PKG-INFO +1 -1
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo.egg-info/SOURCES.txt +6 -0
- lalamo-0.5.10/tests/test_lfm2_models.py +14 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/LICENSE +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/README.md +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/common.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/data/__init__.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/data/huggingface_message.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/data/lalamo_completions.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/data/utils.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/main.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/message_processor.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/__init__.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/decoder_configs/common.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/decoder_configs/executorch.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/decoder_configs/huggingface/common.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/decoder_configs/huggingface/gemma2.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/decoder_configs/huggingface/llama.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/decoder_configs/huggingface/llamba.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/decoder_configs/huggingface/mistral.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/decoder_configs/huggingface/modern_bert.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/decoder_configs/huggingface/qwen2.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/decoder_configs/huggingface/qwen3.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/huggingface_generation_config.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/huggingface_tokenizer_config.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/loaders/__init__.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/loaders/common.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/loaders/executorch.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/loaders/utils.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/model_specs/deepseek.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/model_specs/gemma.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/model_specs/gpt_oss.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/model_specs/llama.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/model_specs/llamba.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/model_specs/mirai.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/model_specs/mistral.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/model_specs/pleias.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/model_specs/polaris.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/model_specs/qwen.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/model_import/model_specs/reka.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/models/__init__.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/models/classifier.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/models/common.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/models/language_model.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/activations.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/classifier.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/common.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/decoder.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/embedding.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/linear.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/mlp.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/mlx_interop.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/normalization.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/rope.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/token_mixers/attention.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/token_mixers/state/common.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/token_mixers/state/kv_cache.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/token_mixers/state/mamba_state.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/torch_interop.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/modules/utils.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/quantization.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/registry_abc.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/sampling.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/speculator/__init__.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/speculator/common.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/speculator/estimator.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/speculator/inference.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/speculator/ngram.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo/speculator/utils.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo.egg-info/dependency_links.txt +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo.egg-info/entry_points.txt +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo.egg-info/requires.txt +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/lalamo.egg-info/top_level.txt +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/pyproject.toml +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/setup.cfg +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/tests/test_cartesia_mlx_models.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/tests/test_chat_template.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/tests/test_generation.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/tests/test_huggingface_model_conversion.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/tests/test_huggingface_models.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/tests/test_mlx_models.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/tests/test_model_spec.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/tests/test_models.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/tests/test_moe.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/tests/test_parameter_tree.py +0 -0
- {lalamo-0.5.8 → lalamo-0.5.10}/tests/test_registry_abc.py +0 -0

--- lalamo-0.5.8/lalamo/model_import/common.py
+++ lalamo-0.5.10/lalamo/model_import/common.py
@@ -17,6 +17,7 @@ from lalamo.message_processor import MessageProcessor, MessageProcessorConfig
 from lalamo.models import ClassifierModel, ClassifierModelConfig, GenerationConfig, LanguageModel, LanguageModelConfig
 from lalamo.modules import Classifier, Decoder, LalamoModule
 from lalamo.quantization import QuantizationMode
+from lalamo.utils import process_chat_template

 from .decoder_configs import ForeignClassifierConfig, ForeignConfig, ForeignLMConfig
 from .huggingface_generation_config import HFGenerationConfig
@@ -154,6 +155,7 @@ def import_message_processor(
     if model_spec.configs.chat_template is not None:
         raise ValueError("Conflicting chat template specifications.")
     prompt_template = tokenizer_config.chat_template
+    prompt_template = process_chat_template(prompt_template)
     tokenizer = Tokenizer.from_file(str(tokenizer_file))

     added_tokens = tokenizer_config.added_tokens()

--- lalamo-0.5.8/lalamo/model_import/decoder_configs/__init__.py
+++ lalamo-0.5.10/lalamo/model_import/decoder_configs/__init__.py
@@ -6,6 +6,7 @@ from .huggingface import (
     HFGemma3Config,
     HFGemma3TextConfig,
     HFGPTOssConfig,
+    HFLFM2Config,
     HFLlamaConfig,
     HFLlambaConfig,
     HFMistralConfig,
@@ -22,6 +23,7 @@ __all__ = [
     "HFGemma2Config",
     "HFGemma3Config",
     "HFGemma3TextConfig",
+    "HFLFM2Config",
     "HFLlamaConfig",
     "HFLlambaConfig",
     "HFMistralConfig",

--- lalamo-0.5.8/lalamo/model_import/decoder_configs/huggingface/__init__.py
+++ lalamo-0.5.10/lalamo/model_import/decoder_configs/huggingface/__init__.py
@@ -2,6 +2,7 @@ from .common import HuggingFaceLMConfig
 from .gemma2 import HFGemma2Config
 from .gemma3 import HFGemma3Config, HFGemma3TextConfig
 from .gpt_oss import HFGPTOssConfig
+from .lfm2 import HFLFM2Config
 from .llama import HFLlamaConfig
 from .llamba import HFLlambaConfig
 from .mistral import HFMistralConfig
@@ -14,6 +15,7 @@ __all__ = [
     "HFGemma2Config",
     "HFGemma3Config",
     "HFGemma3TextConfig",
+    "HFLFM2Config",
     "HFLlamaConfig",
     "HFLlambaConfig",
     "HFMistralConfig",

--- lalamo-0.5.8/lalamo/model_import/decoder_configs/huggingface/gemma3.py
+++ lalamo-0.5.10/lalamo/model_import/decoder_configs/huggingface/gemma3.py
@@ -10,7 +10,7 @@ from lalamo.modules.activations import GELU
 from lalamo.modules.linear import FullPrecisionLinearConfig
 from lalamo.modules.mlp import DenseMLPConfig
 from lalamo.modules.normalization import NormalizationConfig, UpcastMode
-from lalamo.modules.rope import LinearScalingRoPEConfig, UnscaledRoPEConfig
+from lalamo.modules.rope import LinearScalingRoPEConfig, UnscaledRoPEConfig, YARNRoPEConfig
 from lalamo.modules.token_mixers.attention import AttentionConfig
 from lalamo.modules.transformer_layer import TransformerLayerConfig

@@ -19,9 +19,6 @@ from .common import HuggingFaceLMConfig
 __all__ = ["HFGemma3Config", "HFGemma3TextConfig"]


-NUM_SLIDING_WINDOW_LAYERS_PER_FULL_ATTENTION_LAYER = 6
-
-
 def _round_to_bfloat16(x: float) -> float:
     return jnp.asarray(x).astype(jnp.bfloat16).item()

@@ -32,6 +29,16 @@ class GemmaRoPEScalingConfig:
     rope_type: Literal["linear"]


+@dataclass(frozen=True)
+class YarnRopeScalingConfig:
+    factor: float
+    beta_fast: float
+    beta_slow: float
+    original_max_position_embeddings: int
+    rope_type: Literal["yarn"]
+    truncate: bool = False
+
+
 @dataclass(frozen=True)
 class HFGemma3TextConfigRaw:
     hidden_size: int
@@ -39,6 +46,7 @@ class HFGemma3TextConfigRaw:
     model_type: Literal["gemma3_text"]
     num_hidden_layers: int
     sliding_window: int
+    sliding_window_pattern: int
     rms_norm_eps: float = 1e-06
     query_pre_attn_scalar: float = 256.0
     attention_bias: bool = False
@@ -49,7 +57,7 @@ class HFGemma3TextConfigRaw:
     max_position_embeddings: int = 131072
     rope_theta: float = 1000000.0
     rope_local_base_freq: float = 10000.0
-    rope_scaling: GemmaRoPEScalingConfig | None = None
+    rope_scaling: GemmaRoPEScalingConfig | YarnRopeScalingConfig | None = None
     final_logit_softcapping: float | None = None
     vocab_size: int = 262208

@@ -57,7 +65,7 @@ class HFGemma3TextConfigRaw:
     def sliding_window_sizes(self) -> list[int | None]:
         result = []
         for i in range(self.num_hidden_layers):
-            if (i + 1) %
+            if (i + 1) % self.sliding_window_pattern == 0:
                 result.append(None)
             else:
                 result.append(self.sliding_window)
@@ -74,7 +82,7 @@ class HFGemma3TextConfigRaw:
         attention_scale = self.query_pre_attn_scalar**-0.5
         embedding_config = TiedEmbeddingConfig(
             input_scale=input_scale,
-            logit_soft_cap=
+            logit_soft_cap=self.final_logit_softcapping,
             precision=activation_precision,
         )
         rms_norm_config = NormalizationConfig(
@@ -86,19 +94,33 @@ class HFGemma3TextConfigRaw:
             subtract_mean=False,
         )

-        if self.rope_scaling
+        if isinstance(self.rope_scaling, GemmaRoPEScalingConfig):
             global_rope_config = LinearScalingRoPEConfig(
                 precision=activation_precision,
                 base=self.rope_theta,
                 max_sequence_length=self.max_position_embeddings,
                 scaling_factor=self.rope_scaling.factor,
             )
-
+        elif isinstance(self.rope_scaling, YarnRopeScalingConfig):
+            global_rope_config = YARNRoPEConfig(
+                precision=activation_precision,
+                base=self.rope_theta,
+                scaling_factor=self.rope_scaling.factor,
+                max_sequence_length=self.max_position_embeddings,
+                original_context_length=self.rope_scaling.original_max_position_embeddings,
+                beta_fast=self.rope_scaling.beta_fast,
+                beta_slow=self.rope_scaling.beta_slow,
+                truncate=self.rope_scaling.truncate,
+            )
+        elif self.rope_scaling is None:
             global_rope_config = UnscaledRoPEConfig(
                 precision=activation_precision,
                 base=self.rope_theta,
                 max_sequence_length=context_length or self.max_position_embeddings,
             )
+        else:
+            raise ValueError("Invalid rope scaling configuration")
+
         local_rope_config = UnscaledRoPEConfig(
             precision=activation_precision,
             base=self.rope_local_base_freq,
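
The Gemma 3 change above drops the hard-coded NUM_SLIDING_WINDOW_LAYERS_PER_FULL_ATTENTION_LAYER constant and instead reads sliding_window_pattern from the HuggingFace config: every pattern-th layer uses full attention, the rest use the sliding window. A minimal standalone sketch of that schedule (the helper name here is hypothetical, not the lalamo API):

```python
# Illustrative sketch of the per-layer attention schedule implied by
# `sliding_window_pattern`; the function name is hypothetical.
def sliding_window_sizes(num_layers: int, sliding_window: int, pattern: int) -> list[int | None]:
    # Every `pattern`-th layer (1-indexed) gets full attention (None);
    # all other layers attend within a window of `sliding_window` tokens.
    return [None if (i + 1) % pattern == 0 else sliding_window for i in range(num_layers)]


# With pattern=6 and 12 layers, layers 6 and 12 use full attention:
print(sliding_window_sizes(num_layers=12, sliding_window=512, pattern=6))
# [512, 512, 512, 512, 512, None, 512, 512, 512, 512, 512, None]
```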

--- /dev/null
+++ lalamo-0.5.10/lalamo/model_import/decoder_configs/huggingface/lfm2.py
@@ -0,0 +1,174 @@
+from collections.abc import Mapping
+from dataclasses import dataclass
+from typing import Literal
+
+from jaxtyping import DTypeLike
+
+from lalamo.modules import (
+    AttentionConfig,
+    DecoderConfig,
+    DenseMLPConfig,
+    FullPrecisionLinearConfig,
+    NormalizationConfig,
+    SeparableCausalConvConfig,
+    ShortConvConfig,
+    SiLU,
+    TiedEmbeddingConfig,
+    TransformerConfig,
+    TransformerLayerConfig,
+    UnscaledRoPEConfig,
+    UntiedEmbeddingConfig,
+    UpcastMode,
+)
+
+from .common import HuggingFaceLMConfig
+
+
+@dataclass(frozen=True)
+class HFLFM2Config(HuggingFaceLMConfig):
+    architectures: list[Literal["Lfm2ForCausalLM"]]
+    block_auto_adjust_ff_dim: Literal[False]
+    block_dim: int
+    block_ff_dim: int
+    block_ffn_dim_multiplier: float
+    block_mlp_init_scale: float
+    block_multiple_of: int
+    block_norm_eps: float
+    block_out_init_scale: float
+    block_use_swiglu: bool
+    block_use_xavier_init: bool
+    bos_token_id: int
+    conv_L_cache: int  # noqa: N815
+    conv_bias: int
+    conv_dim: int
+    conv_dim_out: int
+    conv_use_xavier_init: bool
+    eos_token_id: int
+    hidden_size: int
+    initializer_range: float
+    intermediate_size: int
+    layer_types: list[Literal["conv", "full_attention"]]
+    max_position_embeddings: int
+    model_type: Literal["lfm2"]
+    norm_eps: float
+    num_attention_heads: int
+    num_heads: int
+    num_hidden_layers: int
+    num_key_value_heads: int
+    pad_token_id: int
+    rope_theta: float
+    theta: float
+    tie_embedding: bool
+    torch_dtype: Literal["bfloat16"]
+    transformers_version: str
+    use_cache: bool
+    use_pos_enc: bool
+    vocab_size: int
+
+    def to_decoder_config(
+        self,
+        context_length: int | None,
+        activation_precision: DTypeLike,
+        accumulation_precision: DTypeLike,
+        metadata_dict: Mapping[str, str],  # noqa: ARG002
+    ) -> DecoderConfig:
+        assert self.num_attention_heads == self.num_heads
+
+        if self.tie_embedding:
+            embedding_config = TiedEmbeddingConfig(
+                input_scale=None,
+                logit_soft_cap=None,
+                precision=activation_precision,
+            )
+        else:
+            embedding_config = UntiedEmbeddingConfig(
+                input_scale=None,
+                logit_soft_cap=None,
+                precision=activation_precision,
+            )
+
+        rope_config = UnscaledRoPEConfig(
+            precision=activation_precision,
+            base=self.rope_theta,
+            max_sequence_length=context_length or self.max_position_embeddings,
+        )
+
+        linear_config = FullPrecisionLinearConfig(activation_precision)
+
+        block_norm_config = NormalizationConfig(
+            scale_precision=activation_precision,
+            accumulation_precision=accumulation_precision,
+            epsilon=self.block_norm_eps,
+            scale_offset=None,
+            upcast_mode=UpcastMode.ONLY_NORMALIZATION,
+            subtract_mean=False,
+        )
+
+        attention_config = AttentionConfig(
+            qkv_projection_config=linear_config,
+            out_projection_config=linear_config,
+            query_norm_config=block_norm_config,
+            key_norm_config=block_norm_config,
+            num_heads=self.num_attention_heads,
+            num_groups=self.num_key_value_heads,
+            head_dim=self.hidden_size // self.num_heads,
+            is_causal=True,
+            scale=None,
+            sliding_window_size=None,
+            logit_soft_cap=None,
+            has_sinks=False,
+            has_qkv_biases=False,
+            has_out_biases=False,
+        )
+
+        short_conv_config = ShortConvConfig(
+            in_projection_config=linear_config,
+            conv_config=SeparableCausalConvConfig(activation_precision, has_biases=False),
+            out_projection_config=linear_config,
+            kernel_size=self.conv_L_cache,
+        )
+
+        mlp_config = DenseMLPConfig(
+            linear_config=linear_config,
+            activation=SiLU(),
+            has_up_biases=False,
+            has_down_biases=False,
+            up_clipping=None,
+            gate_clipping=None,
+        )
+
+        layer_configs = [
+            TransformerLayerConfig(
+                pre_mixer_norm_config=block_norm_config,
+                mixer_config={"conv": short_conv_config, "full_attention": attention_config}[layer_type],
+                post_mixer_norm_config=None,
+                pre_mlp_norm_config=block_norm_config,
+                mlp_config=mlp_config,
+                post_mlp_norm_config=None,
+            ) for layer_type in self.layer_types
+        ]
+
+        output_norm_config = NormalizationConfig(
+            scale_precision=activation_precision,
+            accumulation_precision=accumulation_precision,
+            epsilon=self.norm_eps,
+            scale_offset=None,
+            upcast_mode=UpcastMode.ONLY_NORMALIZATION,
+            subtract_mean=False,
+        )
+
+        transformer_config = TransformerConfig(
+            global_rope_config=rope_config,
+            local_rope_config=None,
+            layer_configs=tuple(layer_configs),
+            output_norm_config=output_norm_config,
+            model_dim=self.hidden_size,
+            hidden_dim=self.intermediate_size,
+            context_length=context_length or self.max_position_embeddings,
+        )
+
+        return DecoderConfig(
+            embedding_config=embedding_config,
+            transformer_config=transformer_config,
+            vocab_size=self.vocab_size,
+        )
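
The new HFLFM2Config builds one transformer layer per entry in layer_types, choosing the short-convolution mixer for "conv" layers and regular attention for "full_attention" layers via a dict lookup. A toy version of that dispatch, using made-up placeholder configs rather than lalamo's config classes:

```python
# Toy illustration of per-layer mixer selection from `layer_types`;
# ConvMixer and AttentionMixer are hypothetical stand-ins.
from dataclasses import dataclass


@dataclass(frozen=True)
class ConvMixer:
    kernel_size: int


@dataclass(frozen=True)
class AttentionMixer:
    num_heads: int


def build_layer_mixers(layer_types: list[str]) -> list[object]:
    mixer_by_type = {"conv": ConvMixer(kernel_size=3), "full_attention": AttentionMixer(num_heads=32)}
    return [mixer_by_type[layer_type] for layer_type in layer_types]


print(build_layer_mixers(["conv", "conv", "full_attention"]))
# [ConvMixer(kernel_size=3), ConvMixer(kernel_size=3), AttentionMixer(num_heads=32)]
```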

--- lalamo-0.5.8/lalamo/model_import/loaders/huggingface.py
+++ lalamo-0.5.10/lalamo/model_import/loaders/huggingface.py
@@ -8,17 +8,21 @@ from jaxtyping import Array, DTypeLike
 from lalamo.common import ParameterPath
 from lalamo.modules import (
     Attention,
+    AttentionConfig,
     Decoder,
     DenseMLP,
     FullPrecisionLinear,
     GroupQuantizedLinear,
     LinearBase,
     Mamba2,
+    Mamba2Config,
     MLXQuantizedLinear,
     MLXQuantizedTiedEmbedding,
     MLXSemiQuantizedUntiedEmbedding,
     Normalization,
     SeparableCausalConv,
+    ShortConv,
+    ShortConvConfig,
     TiedEmbedding,
     TransformerLayer,
     UntiedEmbedding,
@@ -300,7 +304,7 @@ def load_moe(module: MixtureOfExperts, weights_dict: Mapping[str, Array], path:
     down_w = rearrange(down_w, "e o ib ie -> e o (ib ie)")
     down_b = weights_dict[experts_path / "down_proj_bias"]
     if down_b.ndim == 1:
-        down_b = jnp.broadcast_to(down_b, down_w.shape[:-1]
+        down_b = jnp.broadcast_to(down_b, (*down_w.shape[:-1], down_b.shape[0]))

     down_projection = load_parameters(
         lambda m: (m.weights, m.biases),  # type: ignore
@@ -345,21 +349,42 @@ def load_attention(
     weights_dict: Mapping[str, Array],
     path: ParameterPath,
 ) -> Attention:
+    if (path / "o_proj.weight") in weights_dict:
+        o_proj_name = "o_proj"
+    elif (path / "out_proj.weight") in weights_dict:
+        o_proj_name = "out_proj"
+    else:
+        raise NotImplementedError("Can't determine attention output projection name")
+
     qkv_projection = load_linear(
         module.qkv_projection,
         weights_dict,
         path,
         sublayers_to_fuse=["q_proj", "k_proj", "v_proj"],
     )
-    out_projection = load_linear(module.out_projection, weights_dict, path /
+    out_projection = load_linear(module.out_projection, weights_dict, path / o_proj_name)

     if module.query_norm is not None:
-
+        if (path / "q_norm.weight") in weights_dict:
+            q_norm_name = "q_norm"
+        elif (path / "q_layernorm.weight") in weights_dict:
+            q_norm_name = "q_layernorm"
+        else:
+            raise NotImplementedError("Can't determine attention query projection parameter name")
+
+        query_norm = load_rmsnorm(module.query_norm, weights_dict, path / q_norm_name)
     else:
         query_norm = None

     if module.key_norm is not None:
-
+        if (path / "k_norm.weight") in weights_dict:
+            k_norm_name = "k_norm"
+        elif (path / "k_layernorm.weight") in weights_dict:
+            k_norm_name = "k_layernorm"
+        else:
+            raise NotImplementedError("Can't determine attention key projection parameter name")
+
+        key_norm = load_rmsnorm(module.key_norm, weights_dict, path / k_norm_name)
     else:
         key_norm = None

@@ -382,7 +407,7 @@ def load_attention(
     )


-def
+def _load_conv(
     conv_module: SeparableCausalConv,
     weights_dict: Mapping[str, Array],
     path: ParameterPath,
@@ -390,6 +415,8 @@ def _load_mamba_conv(
     weight_path = path / "conv1d" / "weight"
     if weight_path not in weights_dict:
         weight_path = path / "conv_weight"
+    if weight_path not in weights_dict:
+        weight_path = path / "conv.weight"
     if weight_path not in weights_dict:
         weight_path = None

@@ -402,6 +429,8 @@ def _load_mamba_conv(
     bias_path = path / "conv1d" / "bias"
     if bias_path not in weights_dict:
         bias_path = path / "conv_bias"
+    if bias_path not in weights_dict:
+        bias_path = path / "conv.bias"
     if bias_path not in weights_dict:
         bias_path = None

@@ -424,7 +453,7 @@ def load_mamba2(
 ) -> Mamba2:
     in_projection = load_linear(module.in_projection, weights_dict, path / "in_proj")
     out_projection = load_linear(module.out_projection, weights_dict, path / "out_proj")
-    conv =
+    conv = _load_conv(module.conv, weights_dict, path)

     skip_connection_weight_path = path / "D"
     if skip_connection_weight_path in weights_dict:
@@ -451,6 +480,22 @@ def load_mamba2(
     )


+def load_short_conv(
+    module: ShortConv,
+    weights_dict: Mapping[str, Array],
+    path: ParameterPath,
+) -> ShortConv:
+    in_projection = load_linear(module.in_projection, weights_dict, path / "in_proj")
+    out_projection = load_linear(module.out_projection, weights_dict, path / "out_proj")
+    conv = _load_conv(module.conv, weights_dict, path)
+
+    return load_parameters(
+        lambda m: (m.in_projection, m.out_projection, m.conv),
+        module,
+        (in_projection, out_projection, conv),
+    )
+
+
 def load_transformer_layer(
     module: TransformerLayer,
     weights_dict: Mapping[str, Array],
@@ -478,6 +523,8 @@ def load_transformer_layer(
         mixer = load_attention(module.mixer, weights_dict, mixer_path / mixer_key)
     elif isinstance(module.mixer, Mamba2):
         mixer = load_mamba2(module.mixer, weights_dict, mixer_path / mixer_key)
+    elif isinstance(module.mixer, ShortConv):
+        mixer = load_short_conv(module.mixer, weights_dict, mixer_path / mixer_key)
     else:
         mixer = module.mixer

@@ -625,11 +672,12 @@ def load_huggingface_decoder(

     is_llamba_full_precision = any(key.startswith("backbone.") for key in weights_dict)
     is_llamba_mlx = any(key.startswith("embedding.encoder.") for key in weights_dict)
+    is_lfm2 = any(key.startswith("model.layers.0.operator_norm.weight") for key in weights_dict)
     if is_llamba_full_precision:
         decoder_path = base_path / "backbone"
         embedding_path = decoder_path / "embedding"
         pre_mixer_norm_key = "input_layernorm"
-        mixer_key = "mixer"
+        mixer_key = {Mamba2Config: "mixer"}
         pre_mlp_norm_key = "post_attention_layernorm"
         mlp_key = "mlp"
         up_proj_key = "up_proj"
@@ -642,7 +690,7 @@ def load_huggingface_decoder(
         decoder_path = base_path / "model"
         embedding_path = base_path / "embedding.encoder"
         pre_mixer_norm_key = "norm"
-        mixer_key = "layer"
+        mixer_key = {Mamba2Config: "layer"}
         pre_mlp_norm_key = "norm"
         mlp_key = "layer"
         up_proj_key = "gate_proj"
@@ -651,11 +699,24 @@ def load_huggingface_decoder(
         alternating_layers = True
         norm_key = "norm"
         lm_head_path = base_path / "head.linear"
+    elif is_lfm2:
+        decoder_path = base_path / "model"
+        embedding_path = decoder_path / "embed_tokens"
+        pre_mixer_norm_key = "operator_norm"
+        mixer_key = {ShortConvConfig: "conv", AttentionConfig: "self_attn"}
+        pre_mlp_norm_key = "ffn_norm"
+        mlp_key = "feed_forward"
+        up_proj_key = "w3"
+        gate_proj_key = "w1"
+        down_proj_key = "w2"
+        alternating_layers = False
+        norm_key = "embedding_norm"
+        lm_head_path = base_path / "lm_head"
     else:
         decoder_path = base_path / "model"
         embedding_path = decoder_path / "embed_tokens"
         pre_mixer_norm_key = "input_layernorm"
-        mixer_key = "self_attn"
+        mixer_key = {AttentionConfig: "self_attn"}
         pre_mlp_norm_key = "post_attention_layernorm"
         mlp_key = "mlp"
         up_proj_key = "up_proj"
@@ -687,7 +748,7 @@ def load_huggingface_decoder(
             weights_dict,
             decoder_path / "layers" / ((i * 2) if alternating_layers else i),
            decoder_path / "layers" / ((i * 2 + 1) if alternating_layers else i),
-            mixer_key,
+            mixer_key[type(layer.config.mixer_config)],  # type: ignore
             mlp_key,
             pre_mixer_norm_key,
             pre_mlp_norm_key,
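
The loader changes above make one code path tolerant of several checkpoint naming schemes: it probes o_proj vs. out_proj, q_norm vs. q_layernorm, k_norm vs. k_layernorm, and conv1d.weight vs. conv_weight vs. conv.weight, and the per-layer mixer key is now looked up by mixer config type instead of being a single string. A generic sketch of the name-probing pattern, using plain string keys instead of lalamo's ParameterPath:

```python
# Minimal sketch of probing alternative parameter names in a flat weights dict;
# the helper and example keys are illustrative, not lalamo's loader API.
from collections.abc import Mapping
from typing import Any


def resolve_key(weights: Mapping[str, Any], prefix: str, candidates: list[str]) -> str:
    # Return the first candidate key that exists under `prefix`.
    for name in candidates:
        key = f"{prefix}.{name}"
        if key in weights:
            return key
    raise NotImplementedError(f"None of {candidates} found under {prefix!r}")


weights = {"model.layers.0.self_attn.out_proj.weight": object()}
print(resolve_key(weights, "model.layers.0.self_attn", ["o_proj.weight", "out_proj.weight"]))
# model.layers.0.self_attn.out_proj.weight
```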

--- lalamo-0.5.8/lalamo/model_import/model_specs/__init__.py
+++ lalamo-0.5.10/lalamo/model_import/model_specs/__init__.py
@@ -1,8 +1,10 @@
 from .common import FileSpec, ModelSpec, ModelType, UseCase, build_quantized_models
 from .deepseek import DEEPSEEK_MODELS
+from .essential_ai import RNJ_MODELS
 from .gemma import GEMMA_MODELS
 from .gpt_oss import GPT_OSS_MODELS
 from .huggingface import HUGGINGFACE_MODELS
+from .lfm2 import LFM2_MODELS
 from .llama import LLAMA_MODELS
 from .llamba import LLAMBA_MODELS
 from .mirai import MIRAI_CLASSIFIER_MODELS
@@ -24,6 +26,7 @@ __all__ = [


 ALL_MODEL_LISTS = [
+    LFM2_MODELS,
     LLAMA_MODELS,
     LLAMBA_MODELS,
     DEEPSEEK_MODELS,
@@ -36,6 +39,7 @@ ALL_MODEL_LISTS = [
     QWEN_MODELS,
     REKA_MODELS,
     MIRAI_CLASSIFIER_MODELS,
+    RNJ_MODELS,
 ]

 ALL_MODELS = [model for model_list in ALL_MODEL_LISTS for model in model_list]

--- lalamo-0.5.8/lalamo/model_import/model_specs/common.py
+++ lalamo-0.5.10/lalamo/model_import/model_specs/common.py
@@ -56,6 +56,7 @@ class WeightsType(Enum):
             yield MapDictValues(lambda v: cast_if_float(v, float_dtype), weights_dict), metadata_dict or {}
         else:
             import torch
+
             from lalamo.modules.torch_interop import torch_to_jax

             torch_weights = torch.load(filename, map_location="cpu", weights_only=True)

--- /dev/null
+++ lalamo-0.5.10/lalamo/model_import/model_specs/essential_ai.py
@@ -0,0 +1,17 @@
+from lalamo.model_import.decoder_configs.huggingface import HFGemma3TextConfig
+
+from .common import ModelSpec
+
+__all__ = ["RNJ_MODELS"]
+
+RNJ_MODELS = [
+    ModelSpec(
+        vendor="EssentialAI",
+        family="Rnj-1",
+        name="Rnj-1-Instruct",
+        size="8B",
+        quantization=None,
+        repo="EssentialAI/rnj-1-instruct",
+        config_type=HFGemma3TextConfig,
+    ),
+]

--- /dev/null
+++ lalamo-0.5.10/lalamo/model_import/model_specs/lfm2.py
@@ -0,0 +1,21 @@
+from lalamo.model_import.decoder_configs import HFLFM2Config
+
+from .common import ConfigMap, FileSpec, ModelSpec
+
+__all__ = ["LFM2_MODELS"]
+
+LFM2_MODELS = [
+    ModelSpec(
+        vendor="LiquidAI",
+        family="LFM2",
+        name="LFM2-2.6B",
+        size="2.6B",
+        repo="LiquidAI/LFM2-2.6B",
+        config_type=HFLFM2Config,
+        quantization=None,
+        configs=ConfigMap(
+            chat_template=FileSpec("chat_template.jinja"),
+        ),
+        use_cases=tuple(),
+    ),
+]

--- lalamo-0.5.8/lalamo/modules/__init__.py
+++ lalamo-0.5.10/lalamo/modules/__init__.py
@@ -69,6 +69,9 @@ from .token_mixers import (
     Mamba2Config,
     SeparableCausalConv,
     SeparableCausalConvConfig,
+    ShortConv,
+    ShortConvConfig,
+    ShortConvStateLayer,
     State,
     StaticKVCacheLayer,
 )
@@ -136,6 +139,9 @@ __all__ = [
     "RoutingFunction",
     "SeparableCausalConv",
     "SeparableCausalConvConfig",
+    "ShortConv",
+    "ShortConvConfig",
+    "ShortConvStateLayer",
     "SiLU",
     "SoftmaxRouting",
     "State",

--- lalamo-0.5.8/lalamo/modules/token_mixers/__init__.py
+++ lalamo-0.5.10/lalamo/modules/token_mixers/__init__.py
@@ -3,9 +3,18 @@ from lalamo.modules.common import register_config_union
 from .attention import Attention, AttentionConfig, AttentionResult
 from .common import TokenMixerBase, TokenMixerResult
 from .mamba import Mamba2, Mamba2Config, Mamba2Result, SeparableCausalConv, SeparableCausalConvConfig
-from .
+from .short_conv import ShortConv, ShortConvConfig, ShortConvResult
+from .state import (
+    DynamicKVCacheLayer,
+    KVCacheLayer,
+    Mamba2StateLayer,
+    ShortConvStateLayer,
+    State,
+    StateLayerBase,
+    StaticKVCacheLayer,
+)

-TokenMixerConfig = AttentionConfig | Mamba2Config
+TokenMixerConfig = AttentionConfig | Mamba2Config | ShortConvConfig

 register_config_union(TokenMixerConfig)  # type: ignore (pyright bug)

@@ -21,6 +30,10 @@ __all__ = [
     "Mamba2StateLayer",
     "SeparableCausalConv",
     "SeparableCausalConvConfig",
+    "ShortConv",
+    "ShortConvConfig",
+    "ShortConvResult",
+    "ShortConvStateLayer",
     "State",
     "StateLayerBase",
     "StaticKVCacheLayer",
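
The new ShortConv token mixer (with its ShortConvStateLayer for incremental decoding) pairs input and output projections with a short depthwise causal convolution whose kernel length comes from conv_L_cache. As a rough, self-contained illustration of the core operation only, assuming nothing about lalamo's actual ShortConv implementation, here is a depthwise causal convolution over a token sequence in NumPy:

```python
# Rough sketch of a depthwise (per-channel) causal convolution, the core of a
# short-conv token mixer. Purely illustrative; not lalamo's ShortConv module.
import numpy as np


def depthwise_causal_conv(x: np.ndarray, kernels: np.ndarray) -> np.ndarray:
    """x: (seq_len, channels); kernels: (kernel_size, channels)."""
    kernel_size, channels = kernels.shape
    # Left-pad so position t only sees positions t - kernel_size + 1 .. t.
    padded = np.concatenate([np.zeros((kernel_size - 1, channels)), x], axis=0)
    out = np.zeros_like(x)
    for t in range(x.shape[0]):
        window = padded[t : t + kernel_size]       # (kernel_size, channels)
        out[t] = np.sum(window * kernels, axis=0)  # each channel convolved independently
    return out


x = np.random.default_rng(0).normal(size=(6, 4))
kernels = np.ones((3, 4)) / 3.0  # a 3-tap causal moving average per channel
print(depthwise_causal_conv(x, kernels).shape)  # (6, 4)
```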