lalamo 0.4.1__tar.gz → 0.5.0__tar.gz
This diff shows the changes between publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- {lalamo-0.4.1 → lalamo-0.5.0}/PKG-INFO +3 -2
- {lalamo-0.4.1 → lalamo-0.5.0}/README.md +2 -1
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/__init__.py +1 -1
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/language_model.py +22 -23
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/main.py +2 -16
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/common.py +24 -6
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/__init__.py +2 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/common.py +4 -4
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/executorch.py +17 -10
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/huggingface/__init__.py +2 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/huggingface/common.py +37 -2
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/huggingface/gemma2.py +33 -28
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/huggingface/gemma3.py +34 -26
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +36 -29
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/huggingface/llama.py +14 -12
- lalamo-0.5.0/lalamo/model_import/decoder_configs/huggingface/llamba.py +170 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/huggingface/mistral.py +31 -30
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/huggingface/qwen2.py +33 -25
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/huggingface/qwen3.py +55 -28
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/loaders/executorch.py +5 -4
- lalamo-0.5.0/lalamo/model_import/loaders/huggingface.py +653 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/model_specs/__init__.py +2 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/model_specs/common.py +16 -5
- lalamo-0.5.0/lalamo/model_import/model_specs/llamba.py +40 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/model_specs/qwen.py +29 -1
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/modules/__init__.py +33 -6
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/modules/activations.py +9 -2
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/modules/common.py +10 -5
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/modules/decoder.py +93 -97
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/modules/decoder_layer.py +85 -103
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/modules/embedding.py +279 -5
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/modules/linear.py +335 -30
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/modules/mlp.py +6 -7
- lalamo-0.5.0/lalamo/modules/mlx_interop.py +19 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/modules/rope.py +1 -1
- lalamo-0.5.0/lalamo/modules/token_mixers/__init__.py +30 -0
- {lalamo-0.4.1/lalamo/modules → lalamo-0.5.0/lalamo/modules/token_mixers}/attention.py +72 -70
- lalamo-0.5.0/lalamo/modules/token_mixers/common.py +78 -0
- lalamo-0.5.0/lalamo/modules/token_mixers/mamba.py +553 -0
- lalamo-0.5.0/lalamo/modules/token_mixers/state/__init__.py +12 -0
- lalamo-0.5.0/lalamo/modules/token_mixers/state/common.py +26 -0
- {lalamo-0.4.1/lalamo/modules → lalamo-0.5.0/lalamo/modules/token_mixers/state}/kv_cache.py +5 -16
- lalamo-0.5.0/lalamo/modules/token_mixers/state/mamba_state.py +51 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/utils.py +24 -2
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo.egg-info/PKG-INFO +3 -2
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo.egg-info/SOURCES.txt +13 -2
- {lalamo-0.4.1 → lalamo-0.5.0}/pyproject.toml +1 -1
- {lalamo-0.4.1 → lalamo-0.5.0}/tests/test_generation.py +4 -4
- lalamo-0.5.0/tests/test_huggingface_models.py +24 -0
- lalamo-0.5.0/tests/test_mlx_models.py +20 -0
- lalamo-0.5.0/tests/test_models.py +456 -0
- lalamo-0.4.1/lalamo/model_import/loaders/huggingface.py +0 -401
- lalamo-0.4.1/tests/test_huggingface_models.py +0 -87
- {lalamo-0.4.1 → lalamo-0.5.0}/LICENSE +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/common.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/data/__init__.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/data/huggingface_message.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/data/lalamo_completions.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/data/utils.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/message_processor.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/__init__.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/huggingface_generation_config.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/huggingface_tokenizer_config.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/loaders/__init__.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/loaders/common.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/loaders/utils.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/model_specs/deepseek.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/model_specs/gemma.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/model_specs/gpt_oss.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/model_specs/huggingface.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/model_specs/llama.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/model_specs/mistral.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/model_specs/pleias.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/model_specs/polaris.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/model_specs/reka.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/modules/normalization.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/modules/torch_interop.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/modules/utils.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/quantization.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/registry_abc.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/sampling.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/speculator/__init__.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/speculator/common.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/speculator/inference.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/speculator/ngram.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo/speculator/utils.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo.egg-info/dependency_links.txt +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo.egg-info/entry_points.txt +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo.egg-info/requires.txt +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/lalamo.egg-info/top_level.txt +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/setup.cfg +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/tests/test_model_spec.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/tests/test_moe.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/tests/test_parameter_tree.py +0 -0
- {lalamo-0.4.1 → lalamo-0.5.0}/tests/test_registry_abc.py +0 -0
{lalamo-0.4.1 → lalamo-0.5.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lalamo
-Version: 0.4.1
+Version: 0.5.0
 Summary: JAX library for optimization and export of models for use with the UZU inference engine.
 Requires-Python: <4,>=3.12
 Description-Content-Type: text/markdown

@@ -38,7 +38,8 @@ Dynamic: license-file
 
 <a href="https://artifacts.trymirai.com/social/about_us.mp3"><img src="https://img.shields.io/badge/Listen-Podcast-red" alt="Listen to our podcast"></a>
 <a href="https://docsend.com/v/76bpr/mirai2025"><img src="https://img.shields.io/badge/View-Deck-red" alt="View our deck"></a>
-<a href="
+<a href="https://discord.com/invite/trymirai"><img src="https://img.shields.io/discord/1377764166764462120?label=Discord" alt="Discord"></a>
+<a href="mailto:contact@getmirai.co?subject=Interested%20in%20Mirai"><img src="https://img.shields.io/badge/Send-Email-green" alt="Contact us"></a>
 <a href="https://docs.trymirai.com/overview/lalamo"><img src="https://img.shields.io/badge/Read-Docs-blue" alt="Read docs"></a>
 [](LICENSE)
 
{lalamo-0.4.1 → lalamo-0.5.0}/README.md

@@ -6,7 +6,8 @@
 
 <a href="https://artifacts.trymirai.com/social/about_us.mp3"><img src="https://img.shields.io/badge/Listen-Podcast-red" alt="Listen to our podcast"></a>
 <a href="https://docsend.com/v/76bpr/mirai2025"><img src="https://img.shields.io/badge/View-Deck-red" alt="View our deck"></a>
-<a href="
+<a href="https://discord.com/invite/trymirai"><img src="https://img.shields.io/discord/1377764166764462120?label=Discord" alt="Discord"></a>
+<a href="mailto:contact@getmirai.co?subject=Interested%20in%20Mirai"><img src="https://img.shields.io/badge/Send-Email-green" alt="Contact us"></a>
 <a href="https://docs.trymirai.com/overview/lalamo"><img src="https://img.shields.io/badge/Read-Docs-blue" alt="Read docs"></a>
 [](LICENSE)
 
{lalamo-0.4.1 → lalamo-0.5.0}/lalamo/language_model.py

@@ -14,8 +14,7 @@ from tokenizers import Tokenizer
 
 from lalamo.common import DTypeLike, ParameterTree, unflatten_parameters
 from lalamo.message_processor import AssistantMessage, Message, MessageProcessor, MessageProcessorConfig
-from lalamo.modules import Decoder, DecoderConfig,
-from lalamo.modules.common import ForwardPassMode
+from lalamo.modules import Decoder, DecoderConfig, ForwardPassMode, LalamoModule, State, config_converter
 from lalamo.modules.decoder import DecoderForwardPassConfig
 from lalamo.sampling import SamplingPolicy, make_policy
 from lalamo.utils import open_safetensors

@@ -37,13 +36,13 @@ type ForwardPassConfig = DecoderForwardPassConfig
 class PrefillResults(NamedTuple):
     last_token_logits: Float[Array, "batch vocabulary"]
     last_token_indices: Int[Array, " batch"]
-
+    state: State
 
 
 class DecodingState(NamedTuple):
     last_token_logits: Float[Array, "batch vocabulary"]
     last_token_indices: Int[Array, " batch"]
-
+    state: State
     stop_flags: Bool[Array, " batch"]
 
 

@@ -89,7 +88,7 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
         with open(path / "config.json") as config_file:
             config_json = json.load(config_file)
         config = config_converter.structure(config_json["model_config"], LanguageModelConfig)
-        with open_safetensors(path / "model.safetensors") as weights_dict:
+        with open_safetensors(path / "model.safetensors") as (weights_dict, _):
             weights = unflatten_parameters(weights_dict)
             decoder = config.decoder_config.empty().import_weights(weights)
         tokenizer = Tokenizer.from_file(str(path / "tokenizer.json"))

@@ -124,21 +123,21 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
         self,
         token_ids: Int[Array, "batch tokens"],
         lengths_without_padding: Int[Array, " batch"] | None = None,
-
+        state_capacity: int | None = None,
         forward_pass_config: ForwardPassConfig | None = None,
     ) -> PrefillResults:
         batch_size, sequence_length = token_ids.shape
         token_positions = jnp.repeat(jnp.arange(sequence_length, dtype=jnp.int32)[None, ...], batch_size, axis=0)
-        if
-
+        if state_capacity is not None:
+            state = self.decoder.init_static_state(batch_size, state_capacity)
         else:
-
+            state = None
 
         decoder_outputs = self.decoder(
             token_ids,
             token_positions,
-
-
+            state,
+            return_updated_state=True,
             lengths_without_padding=lengths_without_padding,
             forward_pass_mode=ForwardPassMode.MULTI_TOKEN,
             forward_pass_config=forward_pass_config,

@@ -151,11 +150,11 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
 
         last_token_logits = vmap(lambda logits, index: logits[index])(decoder_outputs.logits, last_logits_indices)
 
-        assert decoder_outputs.
+        assert decoder_outputs.updated_state is not None
         return PrefillResults(
             last_token_logits=last_token_logits,
             last_token_indices=last_logits_indices,
-
+            state=decoder_outputs.updated_state,
         )
 
     @eqx.filter_jit

@@ -187,7 +186,7 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
         initial_state = DecodingState(
             prefill_results.last_token_logits,
             prefill_results.last_token_indices,
-            prefill_results.
+            prefill_results.state,
             jnp.zeros(batch_size, dtype=jnp.bool),
         )
 

@@ -224,16 +223,16 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
         decoder_outputs = self.decoder(
             next_token_ids[:, None],
             next_token_indices[:, None],
-            state.
-
+            state.state,
+            return_updated_state=True,
             forward_pass_mode=forward_pass_mode,
             forward_pass_config=forward_pass_config,
         )
-        assert decoder_outputs.
+        assert decoder_outputs.updated_state is not None, "updated_state should not be None"
         new_state = DecodingState(
             decoder_outputs.logits.squeeze(1),
             next_token_indices,
-            decoder_outputs.
+            decoder_outputs.updated_state,
             stop_flags,
         )
         return new_state, GenerationStepResults(next_token_ids, next_top_k_token_ids, next_top_k_token_logits)

@@ -338,7 +337,7 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
         state = DecodingState(
             prefill_results.last_token_logits,
             prefill_results.last_token_indices,
-            prefill_results.
+            prefill_results.state,
             jnp.array([0], dtype=jnp.bool),
         )
 

@@ -356,14 +355,14 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
             decoder_outputs = self.decoder(
                 next_token_id.reshape(1, 1),
                 next_token_indices.reshape(1, 1),
-                state.
-
+                state.state,
+                return_updated_state=True,
                 forward_pass_config=forward_pass_config,
             )
-            assert decoder_outputs.
+            assert decoder_outputs.updated_state is not None, "updated_state should not be None"
            state = DecodingState(
                 decoder_outputs.logits.squeeze(1),
                 next_token_indices,
-                decoder_outputs.
+                decoder_outputs.updated_state,
                 state.stop_flags,
             )
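Taken together, the `language_model.py` hunks replace KV-cache-specific plumbing with a generic `State` that is threaded from prefill into the decode loop. A minimal usage sketch of the new call shape; the model path and the method names `load` and `prefill` are assumptions, since the diff only shows the changed bodies, not the surrounding signatures:

```python
import jax.numpy as jnp

from lalamo.language_model import LanguageModel

# Hypothetical converted-model directory and loader name.
model = LanguageModel.load("path/to/converted-model")

token_ids = jnp.array([[1, 2, 3, 4]], dtype=jnp.int32)

# `state_capacity` preallocates a static decoding state via
# decoder.init_static_state(batch_size, state_capacity); passing None
# runs prefill without a preallocated state.
prefill_results = model.prefill(token_ids, state_capacity=512)

# The generic `state` field replaces the old KV-cache field and is fed
# back into each decode step together with return_updated_state=True.
print(prefill_results.last_token_logits.shape)
```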
{lalamo-0.4.1 → lalamo-0.5.0}/lalamo/main.py

@@ -27,7 +27,6 @@ from rich.progress import (
     TextColumn,
     TimeElapsedColumn,
     TimeRemainingColumn,
-    track,
 )
 from rich.table import Table
 from safetensors.flax import save_file

@@ -50,7 +49,6 @@ from lalamo.modules import config_converter
 from lalamo.speculator.inference import CollectTracesEvent, inference_collect_traces
 from lalamo.speculator.ngram import NGramSpeculator
 from lalamo.speculator.utils import SpeculatorTrainingEvent, test_speculator, train_speculator
-from lalamo.utils import jax_uint4_to_packed_uint8
 
 SCRIPT_NAME = Path(sys.argv[0]).name
 

@@ -109,16 +107,6 @@ def _error(message: str) -> None:
     raise Exit(1)
 
 
-def _pack_uint4_weights(weights: dict[str, jnp.ndarray]) -> dict[str, jnp.ndarray]:
-    packed_weights = {}
-    for key, value in weights.items():
-        if value.dtype == jnp.uint4:
-            packed_weights[key] = jax_uint4_to_packed_uint8(value)
-        else:
-            packed_weights[key] = value
-    return packed_weights
-
-
 @app.command(help="Chat with a converted model.")
 def chat(
     model_path: Annotated[

@@ -274,7 +262,7 @@ def convert(
     result = model.decoder(
         token_ids,
         token_positions,
-
+        return_updated_state=True,
         return_activation_trace=True,
     )
     traces = flatten_parameters(result.export())

@@ -286,8 +274,7 @@ def convert(
     weights = flatten_parameters(model.export_weights())
     del model
 
-
-    save_file(packed_weights, output_dir / "model.safetensors")
+    save_file(weights, output_dir / "model.safetensors")
 
     config_json = config_converter.unstructure(metadata, ModelMetadata)
     with open(output_dir / "config.json", "w") as file:

@@ -511,7 +498,6 @@ def train(
     ) as progress:
         inference_task = progress.add_task("🔮 [cyan]Training speculator...[/cyan]", total=subsample_size)
 
-
         def progress_callback(event: SpeculatorTrainingEvent) -> None:
             progress.update(inference_task, completed=event.trained_tokens)
 
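The `convert` command no longer packs uint4 tensors before saving: the `_pack_uint4_weights` helper and its `jax_uint4_to_packed_uint8` import are removed, and the exported weights are written as-is. For reference, a self-contained numpy sketch of the kind of nibble packing the deleted helper performed; the low-nibble-first byte order here is an assumption, not taken from lalamo:

```python
import numpy as np

def pack_uint4_to_uint8(values: np.ndarray) -> np.ndarray:
    """Pack pairs of 4-bit values (0..15) into single bytes, low nibble first."""
    assert values.size % 2 == 0
    lo, hi = values[0::2], values[1::2]
    return ((hi << 4) | lo).astype(np.uint8)

print(pack_uint4_to_uint8(np.array([1, 2, 3, 4])))  # -> [33 67]
```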
{lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/common.py

@@ -1,4 +1,5 @@
 import importlib.metadata
+import json
 from collections import ChainMap
 from collections.abc import Callable
 from contextlib import ExitStack

@@ -14,6 +15,7 @@ from tokenizers import Tokenizer
 
 from lalamo.language_model import GenerationConfig, LanguageModel, LanguageModelConfig
 from lalamo.message_processor import MessageProcessor, MessageProcessorConfig
+from lalamo.model_import.model_specs.common import JSONFieldSpec
 from lalamo.quantization import QuantizationMode
 
 from .huggingface_generation_config import HFGenerationConfig

@@ -130,10 +132,17 @@ def import_message_processor(
     )
     tokenizer_config = HFTokenizerConfig.from_json(tokenizer_config_file)
     if tokenizer_config.chat_template is None:
-
-
-
-
+        match model_spec.configs.chat_template:
+            case JSONFieldSpec(file_spec, field_name):
+                json_file = download_file(file_spec, model_spec.repo, output_dir)
+                with open(json_file) as file:
+                    json_dict = json.load(file)
+                prompt_template = json_dict[field_name]
+            case FileSpec(_) as file_spec:
+                chat_template_file = download_file(file_spec, model_spec.repo, output_dir)
+                prompt_template = chat_template_file.read_text()
+            case None:
+                raise ValueError("No chat template specified.")
     else:
         if model_spec.configs.chat_template is not None:
             raise ValueError("Conflicting chat template specifications.")
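The new chat-template resolution dispatches on the spec type with structural pattern matching: a `JSONFieldSpec` points at a field inside a downloaded JSON file, while a plain `FileSpec` is read verbatim. A self-contained sketch of the same dispatch pattern; the dataclasses below are simplified stand-ins for lalamo's real spec types:

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class FileSpec:
    filename: str

@dataclass(frozen=True)
class JSONFieldSpec:
    file_spec: FileSpec
    field_name: str

def resolve_template(spec: JSONFieldSpec | FileSpec | None) -> str:
    match spec:
        case JSONFieldSpec(file_spec, field_name):
            return f"field {field_name!r} of {file_spec.filename}"
        case FileSpec(filename):
            return f"full contents of {filename}"
        case None:
            raise ValueError("No chat template specified.")

print(resolve_template(JSONFieldSpec(FileSpec("tokenizer_config.json"), "chat_template")))
```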
{lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/common.py

@@ -180,15 +189,24 @@ def import_model(
     weights_paths = download_weights(model_spec, progress_callback=progress_callback)
     with ExitStack() as stack:
         weights_shards = []
+        metadata_shards = []
         for weights_path in weights_paths:
-            weights_shard = stack.enter_context(model_spec.weights_type.load(weights_path, precision))
+            weights_shard, metadata_shard = stack.enter_context(model_spec.weights_type.load(weights_path, precision))
             weights_shards.append(weights_shard)
+            metadata_shards.append(metadata_shard)
         weights_dict: ChainMap[str, Array] = ChainMap(*weights_shards)
+        metadata_dict: ChainMap[str, str] = ChainMap(*metadata_shards)
 
         if progress_callback is not None:
             progress_callback(InitializingModelEvent())
 
-        decoder = foreign_decoder_config.load_decoder(
+        decoder = foreign_decoder_config.load_decoder(
+            context_length,
+            precision,
+            accumulation_precision,
+            weights_dict,
+            metadata_dict,
+        )
 
         if progress_callback is not None:
             progress_callback(FinishedInitializingModelEvent())
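Weight loaders now return a `(weights, metadata)` pair per shard, and the merged `metadata_dict` travels all the way into `to_decoder_config`. Safetensors files carry exactly this kind of string-to-string header metadata; a sketch of where such a pair could come from, using the public `safetensors` API (the framework choice and path are illustrative, and this is not necessarily how lalamo's loaders are implemented):

```python
from safetensors import safe_open

def load_shard(path: str) -> tuple[dict, dict[str, str]]:
    """Load tensors plus the file-level string metadata from one shard."""
    with safe_open(path, framework="flax") as f:
        tensors = {name: f.get_tensor(name) for name in f.keys()}
        metadata = f.metadata() or {}  # e.g. quantization info written at export time
    return tensors, metadata

weights, metadata = load_shard("model.safetensors")
print(metadata)
```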
{lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/__init__.py

@@ -7,6 +7,7 @@ from .huggingface import (
     HFGemma3TextConfig,
     HFGPTOssConfig,
     HFLlamaConfig,
+    HFLlambaConfig,
     HFMistralConfig,
     HFQwen2Config,
     HFQwen3Config,

@@ -20,6 +21,7 @@ __all__ = [
     "HFGemma3Config",
     "HFGemma3TextConfig",
     "HFLlamaConfig",
+    "HFLlambaConfig",
     "HFMistralConfig",
     "HFQwen2Config",
     "HFQwen3Config",
{lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/common.py

@@ -19,11 +19,9 @@ class ForeignConfig(RegistryABC):
     _converter: ClassVar[cattrs.Converter] = cattrs.Converter()
     _converter.register_structure_hook(int | list[int], lambda v, _: v)
 
-    eos_token_id: int | list[int]
-
     @property
     def eos_token_ids(self) -> list[int]:
-
+        raise NotImplementedError
 
     @property
     @abstractmethod

@@ -41,6 +39,7 @@ class ForeignConfig(RegistryABC):
         context_length: int | None,
         activation_precision: DTypeLike,
         accumulation_precision: DTypeLike,
+        metadata_dict: Mapping[str, str],
     ) -> DecoderConfig:
         raise NotImplementedError
 

@@ -58,7 +57,8 @@ class ForeignConfig(RegistryABC):
         activation_precision: DTypeLike,
         accumulation_precision: DTypeLike,
         weights_dict: Mapping[str, Array],
+        metadata_dict: Mapping[str, str],
     ) -> Decoder:
-        config = self.to_decoder_config(context_length, activation_precision, accumulation_precision)
+        config = self.to_decoder_config(context_length, activation_precision, accumulation_precision, metadata_dict)
         model = config.empty()
         return self._load_weights(model, weights_dict)
{lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/executorch.py

@@ -51,6 +51,12 @@ class LoraConfig:
 
 @dataclass(frozen=True)
 class ExecutorchConfig(ForeignConfig):
+    eos_token_id: int | list[int]
+
+    @property
+    def eos_token_ids(self) -> list[int]:
+        return [self.eos_token_id] if isinstance(self.eos_token_id, int) else self.eos_token_id
+
     @property
     def default_precision(self) -> DTypeLike:
         return jnp.bfloat16

@@ -89,6 +95,7 @@ class ETLlamaConfig(ExecutorchConfig):
         context_length: int | None,
         activation_precision: DTypeLike,
         accumulation_precision: DTypeLike,
+        metadata_dict: Mapping[str, str],  # noqa: ARG002
     ) -> DecoderConfig:
         if self.lora_args is None:
             raise ValueError("We only support QLoRA models for now.")

@@ -136,6 +143,12 @@ class ETLlamaConfig(ExecutorchConfig):
             has_sinks=False,
             has_qkv_biases=False,
             has_out_biases=False,
+            num_heads=self.n_heads,
+            num_groups=self.n_kv_heads,
+            head_dim=self.dim // self.n_heads,
+            is_causal=True,
+            scale=None,
+            sliding_window_size=None,
         )
         mlp_config = DenseMLPConfig(
             linear_config=linear_config,

@@ -146,9 +159,9 @@ class ETLlamaConfig(ExecutorchConfig):
             gate_clipping=None,
         )
         decoder_layer_config = DecoderLayerConfig(
-
-
-
+            pre_mixer_norm_config=rmsnorm_config,
+            mixer_config=attention_config,
+            post_mixer_norm_config=None,
             pre_mlp_norm_config=rmsnorm_config,
             mlp_config=mlp_config,
             post_mlp_norm_config=None,

@@ -157,16 +170,10 @@ class ETLlamaConfig(ExecutorchConfig):
             embedding_config=embedding_config,
             global_rope_config=rope_config,
             local_rope_config=None,
-
+            layer_configs=(decoder_layer_config,) * self.n_layers,
             output_norm_config=rmsnorm_config,
             vocab_size=self.vocab_size,
             model_dim=self.dim,
             hidden_dim=self._find_hidden_size(),
-            num_heads=self.n_heads,
-            num_groups=self.n_kv_heads,
-            head_dim=self.dim // self.n_heads,
-            attention_scale=None,
-            num_layers=self.n_layers,
-            sliding_window_sizes=None,
             context_length=context_length or MAX_SEQUENCE_LENGTH,
         )
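These hunks show the recurring 0.5.0 refactor: attention geometry (`num_heads`, `num_groups`, `head_dim`, scale, sliding window) moves from `DecoderConfig` into each `AttentionConfig`, and `DecoderLayerConfig` now wraps a generic token mixer (`pre_mixer_norm_config`/`mixer_config`), so attention and Mamba layers (see `modules/token_mixers/` in the file list) can share one layer type. A toy sketch of that shape, using stand-in types rather than lalamo's real classes:

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class AttentionMixer:  # stand-in for AttentionConfig
    num_heads: int
    num_groups: int
    head_dim: int
    sliding_window_size: int | None

@dataclass(frozen=True)
class MambaMixer:  # stand-in for the new Mamba token-mixer config
    state_dim: int

@dataclass(frozen=True)
class LayerConfig:  # stand-in for DecoderLayerConfig
    mixer: AttentionMixer | MambaMixer

# Homogeneous attention stack, as in ETLlamaConfig above:
layers = (LayerConfig(AttentionMixer(32, 8, 128, None)),) * 4
# A hybrid stack is just a different tuple of per-layer configs:
hybrid = (LayerConfig(MambaMixer(16)), LayerConfig(AttentionMixer(32, 8, 128, None)))
print(len(layers), len(hybrid))
```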
{lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/huggingface/__init__.py

@@ -3,6 +3,7 @@ from .gemma2 import HFGemma2Config
 from .gemma3 import HFGemma3Config, HFGemma3TextConfig
 from .gpt_oss import HFGPTOssConfig
 from .llama import HFLlamaConfig
+from .llamba import HFLlambaConfig
 from .mistral import HFMistralConfig
 from .qwen2 import HFQwen2Config
 from .qwen3 import HFQwen3Config

@@ -13,6 +14,7 @@ __all__ = [
     "HFGemma3Config",
     "HFGemma3TextConfig",
     "HFLlamaConfig",
+    "HFLlambaConfig",
     "HFMistralConfig",
     "HFQwen2Config",
     "HFQwen3Config",
{lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/huggingface/common.py

@@ -1,7 +1,8 @@
 from collections.abc import Mapping
 from dataclasses import dataclass
-from typing import Literal
+from typing import ClassVar, Literal
 
+import cattrs
 import jax.numpy as jnp
 from jaxtyping import Array, DTypeLike
 

@@ -56,11 +57,45 @@ class GPTQQuantizationConfig:
     sym: bool
 
 
+@dataclass(frozen=True)
+class MLXQuantizationConfig:
+    group_size: int
+    bits: int
+
+
+QuantizationConfigType = AWQQuantizationConfig | GPTQQuantizationConfig | MLXQuantizationConfig | None
+
+
+def _structure_quantization_config(v: object, _: object) -> QuantizationConfigType:
+    match v:
+        case None:
+            return None
+
+        case {"quant_method": "awq", **_other}:
+            return cattrs.structure(v, AWQQuantizationConfig)
+
+        case {"quant_method": "gptq", **_other}:
+            return cattrs.structure(v, GPTQQuantizationConfig)
+
+        case {**_other}:
+            return cattrs.structure(v, MLXQuantizationConfig)
+
+        case _:
+            raise RuntimeError(f"Cannot structure {v}field")
+
+
 @dataclass(frozen=True)
 class HuggingFaceConfig(ForeignConfig):
+    _converter: ClassVar[cattrs.Converter] = cattrs.Converter()
+    _converter.register_structure_hook(int | list[int], lambda v, _: v)
+    _converter.register_structure_hook(QuantizationConfigType, _structure_quantization_config)
+
     @property
     def eos_token_ids(self) -> list[int]:
-
+        if not hasattr(self, "eos_token_id"):
+            raise RuntimeError("model doesn't havve eos_token_id, override eos_token_ids in model config")
+
+        return [self.eos_token_id] if isinstance(self.eos_token_id, int) else self.eos_token_id  # type: ignore (This is a bug in pyright)
 
     @property
     def default_precision(self) -> DTypeLike:
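HuggingFace configs now parse the `quantization_config` block by dispatching on `quant_method`, treating AWQ and GPTQ explicitly and assuming any other mapping is MLX-style. A self-contained sketch of the same cattrs hook pattern, with field sets trimmed for brevity:

```python
from dataclasses import dataclass

import cattrs

@dataclass(frozen=True)
class AWQConfig:
    bits: int

@dataclass(frozen=True)
class MLXConfig:
    group_size: int
    bits: int

QuantConfig = AWQConfig | MLXConfig | None

def structure_quant(v, _type) -> QuantConfig:
    match v:
        case None:
            return None
        case {"quant_method": "awq"}:
            return cattrs.structure(v, AWQConfig)
        case dict():
            # No quant_method key: assume an MLX-style config block.
            return cattrs.structure(v, MLXConfig)
        case _:
            raise RuntimeError(f"Cannot structure {v!r}")

converter = cattrs.Converter()
converter.register_structure_hook(QuantConfig, structure_quant)

print(converter.structure({"group_size": 64, "bits": 4}, QuantConfig))
# -> MLXConfig(group_size=64, bits=4)
```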
{lalamo-0.4.1 → lalamo-0.5.0}/lalamo/model_import/decoder_configs/huggingface/gemma2.py

@@ -1,3 +1,4 @@
+from collections.abc import Mapping
 from dataclasses import dataclass
 from typing import Literal
 

@@ -57,10 +58,8 @@ class HFGemma2Config(HuggingFaceConfig):
         context_length: int | None,
         activation_precision: DTypeLike,
         accumulation_precision: DTypeLike,
+        metadata_dict: Mapping[str, str],  # noqa: ARG002
     ) -> DecoderConfig:
-        sliding_window_sizes = tuple(
-            self.sliding_window if not bool(i % 2) else None for i in range(self.num_hidden_layers)
-        )
         embedding_input_scale = self.hidden_size**0.5
         attention_scale = self.query_pre_attn_scalar**-0.5
         embedding_config = TiedEmbeddingConfig(

@@ -83,16 +82,6 @@ class HFGemma2Config(HuggingFaceConfig):
         linear_config = FullPrecisionLinearConfig(
             precision=activation_precision,
         )
-        attention_config = AttentionConfig(
-            qkv_projection_config=linear_config,
-            out_projection_config=linear_config,
-            query_norm_config=None,
-            key_norm_config=None,
-            logit_soft_cap=self.attn_logit_softcapping,
-            has_sinks=False,
-            has_qkv_biases=self.attention_bias,
-            has_out_biases=False,
-        )
         mlp_config = DenseMLPConfig(
             linear_config=linear_config,
             activation=GELU(),

@@ -101,28 +90,44 @@ class HFGemma2Config(HuggingFaceConfig):
             up_clipping=None,
             gate_clipping=None,
         )
-
-
-
-
-
-
-
-
+
+        layer_configs = []
+        for i in range(self.num_hidden_layers):
+            sliding_window_size = self.sliding_window if not bool(i % 2) else None
+            attention_config = AttentionConfig(
+                qkv_projection_config=linear_config,
+                out_projection_config=linear_config,
+                query_norm_config=None,
+                key_norm_config=None,
+                logit_soft_cap=self.attn_logit_softcapping,
+                has_sinks=False,
+                has_qkv_biases=self.attention_bias,
+                has_out_biases=False,
+                num_heads=self.num_attention_heads,
+                num_groups=self.num_key_value_heads,
+                head_dim=self.head_dim,
+                is_causal=True,
+                scale=attention_scale,
+                sliding_window_size=sliding_window_size,
+            )
+            decoder_layer_config = DecoderLayerConfig(
+                pre_mixer_norm_config=rmsnorm_config,
+                mixer_config=attention_config,
+                post_mixer_norm_config=rmsnorm_config,
+                pre_mlp_norm_config=rmsnorm_config,
+                mlp_config=mlp_config,
+                post_mlp_norm_config=rmsnorm_config,
+            )
+            layer_configs.append(decoder_layer_config)
+
         return DecoderConfig(
             embedding_config=embedding_config,
             global_rope_config=rope_config,
             local_rope_config=None,
-
+            layer_configs=tuple(layer_configs),
             output_norm_config=rmsnorm_config,
             vocab_size=self.vocab_size,
             model_dim=self.hidden_size,
             hidden_dim=self.intermediate_size,
-            num_heads=self.num_attention_heads,
-            num_groups=self.num_key_value_heads,
-            head_dim=self.head_dim,
-            attention_scale=attention_scale,
-            num_layers=self.num_hidden_layers,
-            sliding_window_sizes=sliding_window_sizes,
             context_length=context_length or self.max_position_embeddings,
         )