lalamo 0.5.9__tar.gz → 0.5.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lalamo-0.5.9 → lalamo-0.5.11}/PKG-INFO +1 -1
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/__init__.py +1 -1
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/decoder_configs/__init__.py +2 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/decoder_configs/huggingface/__init__.py +2 -0
- lalamo-0.5.11/lalamo/model_import/decoder_configs/huggingface/lfm2.py +225 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/loaders/huggingface.py +83 -10
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/__init__.py +2 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/common.py +1 -0
- lalamo-0.5.11/lalamo/model_import/model_specs/lfm2.py +31 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/__init__.py +6 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/token_mixers/__init__.py +15 -2
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/token_mixers/common.py +1 -1
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/token_mixers/mamba.py +2 -2
- lalamo-0.5.11/lalamo/modules/token_mixers/short_conv.py +168 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/token_mixers/state/__init__.py +2 -0
- lalamo-0.5.11/lalamo/modules/token_mixers/state/short_conv_state.py +33 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/transformer.py +18 -6
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/transformer_layer.py +1 -1
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo.egg-info/PKG-INFO +1 -1
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo.egg-info/SOURCES.txt +5 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/tests/test_huggingface_model_conversion.py +3 -1
- lalamo-0.5.11/tests/test_lfm2_models.py +13 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/LICENSE +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/README.md +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/common.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/data/__init__.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/data/huggingface_message.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/data/lalamo_completions.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/data/utils.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/main.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/message_processor.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/__init__.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/common.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/decoder_configs/common.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/decoder_configs/executorch.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/decoder_configs/huggingface/common.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/decoder_configs/huggingface/gemma2.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/decoder_configs/huggingface/gemma3.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/decoder_configs/huggingface/llama.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/decoder_configs/huggingface/llamba.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/decoder_configs/huggingface/mistral.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/decoder_configs/huggingface/modern_bert.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/decoder_configs/huggingface/qwen2.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/decoder_configs/huggingface/qwen3.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/huggingface_generation_config.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/huggingface_tokenizer_config.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/loaders/__init__.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/loaders/common.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/loaders/executorch.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/loaders/utils.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/deepseek.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/essential_ai.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/gemma.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/gpt_oss.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/huggingface.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/llama.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/llamba.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/mirai.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/mistral.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/pleias.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/polaris.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/qwen.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/model_import/model_specs/reka.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/models/__init__.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/models/classifier.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/models/common.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/models/language_model.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/activations.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/classifier.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/common.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/decoder.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/embedding.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/linear.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/mlp.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/mlx_interop.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/normalization.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/rope.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/token_mixers/attention.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/token_mixers/state/common.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/token_mixers/state/kv_cache.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/token_mixers/state/mamba_state.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/torch_interop.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/modules/utils.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/quantization.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/registry_abc.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/sampling.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/speculator/__init__.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/speculator/common.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/speculator/estimator.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/speculator/inference.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/speculator/ngram.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/speculator/utils.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo/utils.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo.egg-info/dependency_links.txt +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo.egg-info/entry_points.txt +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo.egg-info/requires.txt +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/lalamo.egg-info/top_level.txt +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/pyproject.toml +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/setup.cfg +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/tests/test_cartesia_mlx_models.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/tests/test_chat_template.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/tests/test_generation.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/tests/test_huggingface_models.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/tests/test_mlx_models.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/tests/test_model_spec.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/tests/test_models.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/tests/test_moe.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/tests/test_parameter_tree.py +0 -0
- {lalamo-0.5.9 → lalamo-0.5.11}/tests/test_registry_abc.py +0 -0

lalamo/model_import/decoder_configs/__init__.py

@@ -6,6 +6,7 @@ from .huggingface import (
     HFGemma3Config,
     HFGemma3TextConfig,
     HFGPTOssConfig,
+    HFLFM2Config,
     HFLlamaConfig,
     HFLlambaConfig,
     HFMistralConfig,
@@ -22,6 +23,7 @@ __all__ = [
     "HFGemma2Config",
     "HFGemma3Config",
     "HFGemma3TextConfig",
+    "HFLFM2Config",
     "HFLlamaConfig",
     "HFLlambaConfig",
     "HFMistralConfig",

lalamo/model_import/decoder_configs/huggingface/__init__.py

@@ -2,6 +2,7 @@ from .common import HuggingFaceLMConfig
 from .gemma2 import HFGemma2Config
 from .gemma3 import HFGemma3Config, HFGemma3TextConfig
 from .gpt_oss import HFGPTOssConfig
+from .lfm2 import HFLFM2Config
 from .llama import HFLlamaConfig
 from .llamba import HFLlambaConfig
 from .mistral import HFMistralConfig
@@ -14,6 +15,7 @@ __all__ = [
     "HFGemma2Config",
     "HFGemma3Config",
     "HFGemma3TextConfig",
+    "HFLFM2Config",
     "HFLlamaConfig",
     "HFLlambaConfig",
     "HFMistralConfig",

lalamo/model_import/decoder_configs/huggingface/lfm2.py (new file)

@@ -0,0 +1,225 @@
+from collections.abc import Mapping
+from dataclasses import dataclass
+from typing import Literal
+
+from jaxtyping import DTypeLike
+
+from lalamo.modules import (
+    AttentionConfig,
+    DecoderConfig,
+    DenseMLPConfig,
+    FullPrecisionLinearConfig,
+    MLXQuantizedLinearConfig,
+    MLXQuantizedTiedEmbeddingConfig,
+    NormalizationConfig,
+    SeparableCausalConvConfig,
+    ShortConvConfig,
+    SiLU,
+    TiedEmbeddingConfig,
+    TransformerConfig,
+    TransformerLayerConfig,
+    UnscaledRoPEConfig,
+    UntiedEmbeddingConfig,
+    UpcastMode,
+)
+from lalamo.quantization import QuantizationMode
+
+from .common import HuggingFaceLMConfig
+
+
+@dataclass(frozen=True)
+class QuantizationConfig:
+    group_size: int
+    bits: int
+
+
+@dataclass(frozen=True)
+class HFLFM2Config(HuggingFaceLMConfig):
+    architectures: list[Literal["Lfm2ForCausalLM"]]
+    block_auto_adjust_ff_dim: bool
+    block_dim: int
+    block_ff_dim: int
+    block_ffn_dim_multiplier: float
+    block_mlp_init_scale: float
+    block_multiple_of: int
+    block_norm_eps: float
+    block_out_init_scale: float
+    block_use_swiglu: bool
+    block_use_xavier_init: bool
+    bos_token_id: int
+    conv_L_cache: int  # noqa: N815
+    conv_bias: bool
+    conv_dim: int
+    conv_dim_out: int
+    conv_use_xavier_init: bool
+    eos_token_id: int
+    hidden_size: int
+    initializer_range: float
+    max_position_embeddings: int
+    model_type: Literal["lfm2"]
+    norm_eps: float
+    num_attention_heads: int
+    num_heads: int
+    num_hidden_layers: int
+    num_key_value_heads: int
+    pad_token_id: int
+    rope_theta: float
+    torch_dtype: Literal["bfloat16"]
+    transformers_version: str
+    use_cache: bool
+    use_pos_enc: bool
+    vocab_size: int
+
+    intermediate_size: int | None = None
+    layer_types: list[Literal["conv", "full_attention"]] | None = None
+    full_attn_idxs: list[int] | None = None
+    tie_embedding: bool = True
+    theta: float | None = None
+
+    quantization: QuantizationConfig | None = None
+    quantization_config: QuantizationConfig | None = None
+
+    def to_decoder_config(
+        self,
+        context_length: int | None,
+        activation_precision: DTypeLike,
+        accumulation_precision: DTypeLike,
+        metadata_dict: Mapping[str, str],  # noqa: ARG002
+    ) -> DecoderConfig:
+        assert self.num_attention_heads == self.num_heads
+
+        if self.quantization_config is not None:
+            assert self.tie_embedding
+
+            embedding_config = MLXQuantizedTiedEmbeddingConfig(
+                input_scale=None,
+                logit_soft_cap=None,
+                group_size=self.quantization_config.group_size,
+                embedding_quantization_mode=QuantizationMode.from_num_bits(self.quantization_config.bits),
+                activation_quantization_mode=None,
+                activation_precision=activation_precision,
+            )
+        elif self.tie_embedding:
+            embedding_config = TiedEmbeddingConfig(
+                input_scale=None,
+                logit_soft_cap=None,
+                precision=activation_precision,
+            )
+        else:
+            embedding_config = UntiedEmbeddingConfig(
+                input_scale=None,
+                logit_soft_cap=None,
+                precision=activation_precision,
+            )
+
+        rope_config = UnscaledRoPEConfig(
+            precision=activation_precision,
+            base=self.rope_theta,
+            max_sequence_length=context_length or self.max_position_embeddings,
+        )
+
+        if self.quantization_config is None:
+            linear_config = FullPrecisionLinearConfig(activation_precision)
+        else:
+            linear_config = MLXQuantizedLinearConfig(
+                group_size=self.quantization_config.group_size,
+                weight_quantization_mode=QuantizationMode.from_num_bits(self.quantization_config.bits),
+                activation_quantization_mode=None,
+                activation_precision=activation_precision,
+            )
+
+        block_norm_config = NormalizationConfig(
+            scale_precision=activation_precision,
+            accumulation_precision=accumulation_precision,
+            epsilon=self.block_norm_eps,
+            scale_offset=None,
+            upcast_mode=UpcastMode.ONLY_NORMALIZATION,
+            subtract_mean=False,
+        )
+
+        attention_config = AttentionConfig(
+            qkv_projection_config=linear_config,
+            out_projection_config=linear_config,
+            query_norm_config=block_norm_config,
+            key_norm_config=block_norm_config,
+            num_heads=self.num_attention_heads,
+            num_groups=self.num_key_value_heads,
+            head_dim=self.hidden_size // self.num_heads,
+            is_causal=True,
+            scale=None,
+            sliding_window_size=None,
+            logit_soft_cap=None,
+            has_sinks=False,
+            has_qkv_biases=False,
+            has_out_biases=False,
+        )
+
+        short_conv_config = ShortConvConfig(
+            in_projection_config=linear_config,
+            conv_config=SeparableCausalConvConfig(activation_precision, has_biases=self.conv_bias),
+            out_projection_config=linear_config,
+            kernel_size=self.conv_L_cache,
+        )
+
+        mlp_config = DenseMLPConfig(
+            linear_config=linear_config,
+            activation=SiLU(),
+            has_up_biases=False,
+            has_down_biases=False,
+            up_clipping=None,
+            gate_clipping=None,
+        )
+
+        if self.layer_types is not None:
+            layer_types = self.layer_types
+        elif self.full_attn_idxs is not None:
+            layer_types = [
+                "full_attention" if i in self.full_attn_idxs else "conv" for i in range(self.num_hidden_layers)
+            ]
+        else:
+            raise RuntimeError("Either layer_types or full_attn_idxs must be present.")
+
+        layer_configs = [
+            TransformerLayerConfig(
+                pre_mixer_norm_config=block_norm_config,
+                mixer_config={"conv": short_conv_config, "full_attention": attention_config}[layer_type],
+                post_mixer_norm_config=None,
+                pre_mlp_norm_config=block_norm_config,
+                mlp_config=mlp_config,
+                post_mlp_norm_config=None,
+            )
+            for layer_type in layer_types
+        ]
+
+        output_norm_config = NormalizationConfig(
+            scale_precision=activation_precision,
+            accumulation_precision=accumulation_precision,
+            epsilon=self.norm_eps,
+            scale_offset=None,
+            upcast_mode=UpcastMode.ONLY_NORMALIZATION,
+            subtract_mean=False,
+        )
+
+        if self.intermediate_size is not None:
+            hidden_dim = self.intermediate_size
+        else:
+            hidden_dim_adjusted = self.block_ff_dim * self.block_ffn_dim_multiplier * (2 / 3)
+            hidden_dim = int(
+                (hidden_dim_adjusted + self.block_multiple_of - 1) // self.block_multiple_of * self.block_multiple_of,
+            )
+
+        transformer_config = TransformerConfig(
+            global_rope_config=rope_config,
+            local_rope_config=None,
+            layer_configs=tuple(layer_configs),
+            output_norm_config=output_norm_config,
+            model_dim=self.hidden_size,
+            hidden_dim=hidden_dim,
+            context_length=context_length or self.max_position_embeddings,
+        )
+
+        return DecoderConfig(
+            embedding_config=embedding_config,
+            transformer_config=transformer_config,
+            vocab_size=self.vocab_size,
+        )
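
The only non-obvious part of the new config mapping is the fallback MLP width used when `intermediate_size` is absent from the HuggingFace config: `block_ff_dim` is scaled by `block_ffn_dim_multiplier * 2/3` and rounded up to a multiple of `block_multiple_of`. A standalone sketch of that computation, using hypothetical values rather than ones taken from a released LFM2 checkpoint:

```python
# Fallback hidden_dim computation from HFLFM2Config.to_decoder_config,
# reproduced standalone. The concrete values below are hypothetical.
block_ff_dim = 7168
block_ffn_dim_multiplier = 1.0
block_multiple_of = 256

hidden_dim_adjusted = block_ff_dim * block_ffn_dim_multiplier * (2 / 3)  # ~4778.67
hidden_dim = int(
    (hidden_dim_adjusted + block_multiple_of - 1) // block_multiple_of * block_multiple_of,
)
print(hidden_dim)  # 4864: the next multiple of 256 at or above 4778.67
```
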

lalamo/model_import/loaders/huggingface.py

@@ -8,17 +8,22 @@ from jaxtyping import Array, DTypeLike
 from lalamo.common import ParameterPath
 from lalamo.modules import (
     Attention,
+    AttentionConfig,
     Decoder,
     DenseMLP,
     FullPrecisionLinear,
     GroupQuantizedLinear,
     LinearBase,
     Mamba2,
+    Mamba2Config,
     MLXQuantizedLinear,
     MLXQuantizedTiedEmbedding,
+    MLXQuantizedTiedEmbeddingConfig,
     MLXSemiQuantizedUntiedEmbedding,
     Normalization,
     SeparableCausalConv,
+    ShortConv,
+    ShortConvConfig,
     TiedEmbedding,
     TransformerLayer,
     UntiedEmbedding,
@@ -345,21 +350,42 @@ def load_attention(
     weights_dict: Mapping[str, Array],
     path: ParameterPath,
 ) -> Attention:
+    if (path / "o_proj.weight") in weights_dict:
+        o_proj_name = "o_proj"
+    elif (path / "out_proj.weight") in weights_dict:
+        o_proj_name = "out_proj"
+    else:
+        raise NotImplementedError("Can't determine attention output projection name")
+
     qkv_projection = load_linear(
         module.qkv_projection,
         weights_dict,
         path,
         sublayers_to_fuse=["q_proj", "k_proj", "v_proj"],
     )
-    out_projection = load_linear(module.out_projection, weights_dict, path / "o_proj")
+    out_projection = load_linear(module.out_projection, weights_dict, path / o_proj_name)
 
     if module.query_norm is not None:
-        query_norm = load_rmsnorm(module.query_norm, weights_dict, path / "q_norm")
+        if (path / "q_norm.weight") in weights_dict:
+            q_norm_name = "q_norm"
+        elif (path / "q_layernorm.weight") in weights_dict:
+            q_norm_name = "q_layernorm"
+        else:
+            raise NotImplementedError("Can't determine attention query projection parameter name")
+
+        query_norm = load_rmsnorm(module.query_norm, weights_dict, path / q_norm_name)
     else:
         query_norm = None
 
     if module.key_norm is not None:
-        key_norm = load_rmsnorm(module.key_norm, weights_dict, path / "k_norm")
+        if (path / "k_norm.weight") in weights_dict:
+            k_norm_name = "k_norm"
+        elif (path / "k_layernorm.weight") in weights_dict:
+            k_norm_name = "k_layernorm"
+        else:
+            raise NotImplementedError("Can't determine attention key projection parameter name")
+
+        key_norm = load_rmsnorm(module.key_norm, weights_dict, path / k_norm_name)
     else:
         key_norm = None
 
@@ -382,19 +408,24 @@ def load_attention(
 )
 
 
-def _load_mamba_conv(
+def _load_conv(
     conv_module: SeparableCausalConv,
     weights_dict: Mapping[str, Array],
     path: ParameterPath,
+    permute_conv: bool,
 ) -> SeparableCausalConv:
     weight_path = path / "conv1d" / "weight"
     if weight_path not in weights_dict:
         weight_path = path / "conv_weight"
+    if weight_path not in weights_dict:
+        weight_path = path / "conv.weight"
     if weight_path not in weights_dict:
         weight_path = None
 
     if weight_path is not None:
         raw = weights_dict[weight_path]
+        if permute_conv:
+            raw = jnp.matrix_transpose(raw)
         conv_weight = raw.squeeze(1) if raw.ndim == 3 else raw
     else:
         conv_weight = conv_module.weights
@@ -402,6 +433,8 @@ def _load_mamba_conv(
     bias_path = path / "conv1d" / "bias"
     if bias_path not in weights_dict:
         bias_path = path / "conv_bias"
+    if bias_path not in weights_dict:
+        bias_path = path / "conv.bias"
     if bias_path not in weights_dict:
         bias_path = None
 
@@ -421,10 +454,11 @@ def load_mamba2(
     module: Mamba2,
     weights_dict: Mapping[str, Array],
     path: ParameterPath,
+    permute_conv: bool,
 ) -> Mamba2:
     in_projection = load_linear(module.in_projection, weights_dict, path / "in_proj")
     out_projection = load_linear(module.out_projection, weights_dict, path / "out_proj")
-    conv = _load_mamba_conv(module.conv, weights_dict, path)
+    conv = _load_conv(module.conv, weights_dict, path, permute_conv)
 
     skip_connection_weight_path = path / "D"
     if skip_connection_weight_path in weights_dict:
@@ -451,6 +485,23 @@ def load_mamba2(
     )
 
 
+def load_short_conv(
+    module: ShortConv,
+    weights_dict: Mapping[str, Array],
+    path: ParameterPath,
+    permute_conv: bool,
+) -> ShortConv:
+    in_projection = load_linear(module.in_projection, weights_dict, path / "in_proj")
+    out_projection = load_linear(module.out_projection, weights_dict, path / "out_proj")
+    conv = _load_conv(module.conv, weights_dict, path, permute_conv)
+
+    return load_parameters(
+        lambda m: (m.in_projection, m.out_projection, m.conv),
+        module,
+        (in_projection, out_projection, conv),
+    )
+
+
 def load_transformer_layer(
     module: TransformerLayer,
     weights_dict: Mapping[str, Array],
@@ -463,6 +514,7 @@ def load_transformer_layer(
     up_proj_key: str,
     gate_proj_key: str,
     down_proj_key: str,
+    permute_conv: bool,
 ) -> TransformerLayer:
     if module.pre_mixer_norm is not None:
         pre_attention_norm = load_rmsnorm(
@@ -477,7 +529,9 @@ def load_transformer_layer(
     if isinstance(module.mixer, Attention):
         mixer = load_attention(module.mixer, weights_dict, mixer_path / mixer_key)
     elif isinstance(module.mixer, Mamba2):
-        mixer = load_mamba2(module.mixer, weights_dict, mixer_path / mixer_key)
+        mixer = load_mamba2(module.mixer, weights_dict, mixer_path / mixer_key, permute_conv)
+    elif isinstance(module.mixer, ShortConv):
+        mixer = load_short_conv(module.mixer, weights_dict, mixer_path / mixer_key, permute_conv)
     else:
         mixer = module.mixer
 
@@ -625,11 +679,13 @@ def load_huggingface_decoder(
 
     is_llamba_full_precision = any(key.startswith("backbone.") for key in weights_dict)
     is_llamba_mlx = any(key.startswith("embedding.encoder.") for key in weights_dict)
+    is_lfm2 = any(key.startswith("model.layers.0.operator_norm.weight") for key in weights_dict)
     if is_llamba_full_precision:
         decoder_path = base_path / "backbone"
         embedding_path = decoder_path / "embedding"
         pre_mixer_norm_key = "input_layernorm"
-        mixer_key = "mixer"
+        mixer_key = {Mamba2Config: "mixer"}
+        permute_conv = False
         pre_mlp_norm_key = "post_attention_layernorm"
         mlp_key = "mlp"
         up_proj_key = "up_proj"
@@ -642,7 +698,8 @@ def load_huggingface_decoder(
         decoder_path = base_path / "model"
         embedding_path = base_path / "embedding.encoder"
         pre_mixer_norm_key = "norm"
-        mixer_key = "layer"
+        mixer_key = {Mamba2Config: "layer"}
+        permute_conv = False
         pre_mlp_norm_key = "norm"
         mlp_key = "layer"
         up_proj_key = "gate_proj"
@@ -651,11 +708,26 @@ def load_huggingface_decoder(
         alternating_layers = True
         norm_key = "norm"
         lm_head_path = base_path / "head.linear"
+    elif is_lfm2:
+        decoder_path = base_path / "model"
+        embedding_path = decoder_path / "embed_tokens"
+        pre_mixer_norm_key = "operator_norm"
+        mixer_key = {ShortConvConfig: "conv", AttentionConfig: "self_attn"}
+        permute_conv = isinstance(module.config.embedding_config, MLXQuantizedTiedEmbeddingConfig)
+        pre_mlp_norm_key = "ffn_norm"
+        mlp_key = "feed_forward"
+        up_proj_key = "w3"
+        gate_proj_key = "w1"
+        down_proj_key = "w2"
+        alternating_layers = False
+        norm_key = "embedding_norm"
+        lm_head_path = base_path / "lm_head"
     else:
         decoder_path = base_path / "model"
         embedding_path = decoder_path / "embed_tokens"
         pre_mixer_norm_key = "input_layernorm"
-        mixer_key = "self_attn"
+        mixer_key = {AttentionConfig: "self_attn"}
+        permute_conv = False
         pre_mlp_norm_key = "post_attention_layernorm"
         mlp_key = "mlp"
         up_proj_key = "up_proj"
@@ -687,13 +759,14 @@ def load_huggingface_decoder(
             weights_dict,
             decoder_path / "layers" / ((i * 2) if alternating_layers else i),
             decoder_path / "layers" / ((i * 2 + 1) if alternating_layers else i),
-            mixer_key,
+            mixer_key[type(layer.config.mixer_config)],  # type: ignore
             mlp_key,
             pre_mixer_norm_key,
             pre_mlp_norm_key,
             up_proj_key,
             gate_proj_key,
             down_proj_key,
+            permute_conv,
         )
         for i, layer in enumerate(module.transformer.layers)
     )
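
Because LFM2 interleaves short-convolution and attention blocks in the same checkpoint, the loader's `mixer_key` changes from a single string into a mapping from mixer config type to HuggingFace submodule name. A minimal sketch of how that per-layer lookup resolves, using stand-in classes rather than the real lalamo config types:

```python
# Stand-ins for the real lalamo config classes, for illustration only.
class AttentionConfig: ...
class ShortConvConfig: ...

# As in the LFM2 branch above: conv mixers live under "conv",
# attention mixers under "self_attn".
mixer_key = {ShortConvConfig: "conv", AttentionConfig: "self_attn"}

layer_mixer_configs = [ShortConvConfig(), ShortConvConfig(), AttentionConfig()]
print([mixer_key[type(cfg)] for cfg in layer_mixer_configs])
# ['conv', 'conv', 'self_attn']
```
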

lalamo/model_import/model_specs/__init__.py

@@ -4,6 +4,7 @@ from .essential_ai import RNJ_MODELS
 from .gemma import GEMMA_MODELS
 from .gpt_oss import GPT_OSS_MODELS
 from .huggingface import HUGGINGFACE_MODELS
+from .lfm2 import LFM2_MODELS
 from .llama import LLAMA_MODELS
 from .llamba import LLAMBA_MODELS
 from .mirai import MIRAI_CLASSIFIER_MODELS
@@ -25,6 +26,7 @@ __all__ = [
 
 
 ALL_MODEL_LISTS = [
+    LFM2_MODELS,
     LLAMA_MODELS,
     LLAMBA_MODELS,
     DEEPSEEK_MODELS,

lalamo/model_import/model_specs/common.py

@@ -56,6 +56,7 @@ class WeightsType(Enum):
             yield MapDictValues(lambda v: cast_if_float(v, float_dtype), weights_dict), metadata_dict or {}
         else:
             import torch
+
             from lalamo.modules.torch_interop import torch_to_jax
 
             torch_weights = torch.load(filename, map_location="cpu", weights_only=True)

lalamo/model_import/model_specs/lfm2.py (new file)

@@ -0,0 +1,31 @@
+from lalamo.model_import.decoder_configs import HFLFM2Config
+from lalamo.quantization import QuantizationMode
+
+from .common import ConfigMap, FileSpec, ModelSpec
+
+__all__ = ["LFM2_MODELS"]
+
+
+def _lfm2_repo(size: str, quantization: QuantizationMode | None) -> tuple[str, str]:
+    organization = "LiquidAI" if quantization is None else "mlx-community"
+    name = f"LFM2-{size}{f'-{quantization.bits}bit' if quantization is not None else ''}"
+    return (organization, name)
+
+
+LFM2_MODELS = [
+    ModelSpec(
+        vendor="LiquidAI",
+        family="LFM2",
+        name=_lfm2_repo(size, quantization)[1],
+        size=size,
+        repo="/".join(_lfm2_repo(size, quantization)),
+        config_type=HFLFM2Config,
+        quantization=quantization,
+        configs=ConfigMap(
+            chat_template=FileSpec("chat_template.jinja"),
+        ),
+        use_cases=tuple(),
+    )
+    for size in ["350M", "700M", "1.2B", "2.6B"]
+    for quantization in [None, *([QuantizationMode.UINT4, QuantizationMode.UINT8] if size != "2.6B" else [])]
+]
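
The comprehension above yields one full-precision spec per size plus 4-bit and 8-bit MLX variants for every size except 2.6B. A quick sketch of enumerating the resulting repos, assuming `LFM2_MODELS` is importable from `lalamo.model_import.model_specs` as the updated `__init__.py` suggests, and that `QuantizationMode.UINT4`/`UINT8` report `bits` values of 4 and 8:

```python
# Illustrative only: list the HuggingFace repos the new LFM2 specs point at.
from lalamo.model_import.model_specs import LFM2_MODELS

for spec in LFM2_MODELS:
    print(spec.repo)
# Expected pattern: "LiquidAI/LFM2-350M", "mlx-community/LFM2-350M-4bit",
# "mlx-community/LFM2-350M-8bit", ..., "LiquidAI/LFM2-2.6B"
```
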

lalamo/modules/__init__.py

@@ -69,6 +69,9 @@ from .token_mixers import (
     Mamba2Config,
     SeparableCausalConv,
     SeparableCausalConvConfig,
+    ShortConv,
+    ShortConvConfig,
+    ShortConvStateLayer,
     State,
     StaticKVCacheLayer,
 )
@@ -136,6 +139,9 @@ __all__ = [
     "RoutingFunction",
     "SeparableCausalConv",
     "SeparableCausalConvConfig",
+    "ShortConv",
+    "ShortConvConfig",
+    "ShortConvStateLayer",
     "SiLU",
     "SoftmaxRouting",
     "State",

lalamo/modules/token_mixers/__init__.py

@@ -3,9 +3,18 @@ from lalamo.modules.common import register_config_union
 from .attention import Attention, AttentionConfig, AttentionResult
 from .common import TokenMixerBase, TokenMixerResult
 from .mamba import Mamba2, Mamba2Config, Mamba2Result, SeparableCausalConv, SeparableCausalConvConfig
-from .
+from .short_conv import ShortConv, ShortConvConfig, ShortConvResult
+from .state import (
+    DynamicKVCacheLayer,
+    KVCacheLayer,
+    Mamba2StateLayer,
+    ShortConvStateLayer,
+    State,
+    StateLayerBase,
+    StaticKVCacheLayer,
+)
 
-TokenMixerConfig = AttentionConfig | Mamba2Config
+TokenMixerConfig = AttentionConfig | Mamba2Config | ShortConvConfig
 
 register_config_union(TokenMixerConfig)  # type: ignore (pyright bug)
 
@@ -21,6 +30,10 @@ __all__ = [
     "Mamba2StateLayer",
     "SeparableCausalConv",
     "SeparableCausalConvConfig",
+    "ShortConv",
+    "ShortConvConfig",
+    "ShortConvResult",
+    "ShortConvStateLayer",
     "State",
     "StateLayerBase",
     "StaticKVCacheLayer",