lalamo 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lalamo/__init__.py +20 -5
- lalamo/data/__init__.py +8 -0
- lalamo/data/huggingface_message.py +38 -0
- lalamo/data/lalamo_completions.py +43 -0
- lalamo/data/utils.py +8 -0
- lalamo/language_model.py +152 -69
- lalamo/main.py +271 -43
- lalamo/message_processor.py +11 -1
- lalamo/model_import/common.py +17 -7
- lalamo/model_import/decoder_configs/__init__.py +3 -0
- lalamo/model_import/decoder_configs/executorch.py +12 -6
- lalamo/model_import/decoder_configs/huggingface/__init__.py +2 -0
- lalamo/model_import/decoder_configs/huggingface/common.py +1 -3
- lalamo/model_import/decoder_configs/huggingface/gemma2.py +11 -5
- lalamo/model_import/decoder_configs/huggingface/gemma3.py +14 -5
- lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +195 -0
- lalamo/model_import/decoder_configs/huggingface/llama.py +38 -8
- lalamo/model_import/decoder_configs/huggingface/mistral.py +12 -6
- lalamo/model_import/decoder_configs/huggingface/qwen2.py +12 -6
- lalamo/model_import/decoder_configs/huggingface/qwen3.py +12 -6
- lalamo/model_import/huggingface_tokenizer_config.py +1 -4
- lalamo/model_import/loaders/executorch.py +10 -9
- lalamo/model_import/loaders/huggingface.py +104 -9
- lalamo/model_import/loaders/utils.py +92 -0
- lalamo/model_import/model_specs/__init__.py +4 -1
- lalamo/model_import/model_specs/common.py +15 -12
- lalamo/model_import/model_specs/gpt_oss.py +21 -0
- lalamo/modules/__init__.py +35 -7
- lalamo/modules/activations.py +24 -14
- lalamo/modules/attention.py +73 -20
- lalamo/modules/common.py +8 -57
- lalamo/modules/decoder.py +48 -34
- lalamo/modules/decoder_layer.py +57 -43
- lalamo/modules/embedding.py +13 -19
- lalamo/modules/kv_cache.py +53 -16
- lalamo/modules/linear.py +260 -79
- lalamo/modules/mlp.py +395 -23
- lalamo/modules/normalization.py +2 -3
- lalamo/modules/rope.py +32 -21
- lalamo/modules/utils.py +10 -0
- lalamo/speculator/__init__.py +11 -0
- lalamo/speculator/common.py +22 -0
- lalamo/speculator/inference.py +75 -0
- lalamo/speculator/ngram.py +154 -0
- lalamo/speculator/utils.py +52 -0
- lalamo/utils.py +27 -0
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/METADATA +11 -4
- lalamo-0.4.0.dist-info/RECORD +71 -0
- lalamo-0.3.3.dist-info/RECORD +0 -59
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/WHEEL +0 -0
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/entry_points.txt +0 -0
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/top_level.txt +0 -0
lalamo/main.py
CHANGED
@@ -1,12 +1,16 @@
 import json
+import random
 import re
 import shutil
 import sys
 from enum import Enum
+from itertools import chain
 from pathlib import Path
 from typing import Annotated

+import jax
 import jax.numpy as jnp
+import jax.profiler
 import thefuzz.process
 from click import Context as ClickContext
 from click import Parameter as ClickParameter
@@ -14,13 +18,24 @@ from click import ParamType
 from jaxtyping import DTypeLike
 from rich import box
 from rich.console import Console
+from rich.live import Live
 from rich.panel import Panel
-from rich.progress import
+from rich.progress import (
+    MofNCompleteColumn,
+    Progress,
+    SpinnerColumn,
+    TextColumn,
+    TimeElapsedColumn,
+    TimeRemainingColumn,
+    track,
+)
 from rich.table import Table
 from safetensors.flax import save_file
-from typer import Argument, Exit, Option, Typer
+from typer import Argument, Context, Exit, Option, Typer

 from lalamo.common import flatten_parameters
+from lalamo.data import import_hf_parquet
+from lalamo.data.lalamo_completions import LalamoCompletion
 from lalamo.language_model import LanguageModel
 from lalamo.message_processor import UserMessage
 from lalamo.model_import import REPO_TO_MODEL, ModelMetadata, ModelSpec, import_model
@@ -31,7 +46,10 @@ from lalamo.model_import.common import (
     InitializingModelEvent,
     StatusEvent,
 )
-from lalamo.modules import
+from lalamo.modules import config_converter
+from lalamo.speculator.inference import CollectTracesEvent, inference_collect_traces
+from lalamo.speculator.ngram import NGramSpeculator
+from lalamo.speculator.utils import SpeculatorTrainingEvent, test_speculator, train_speculator
 from lalamo.utils import jax_uint4_to_packed_uint8

 SCRIPT_NAME = Path(sys.argv[0]).name
@@ -110,27 +128,19 @@ def chat(
             metavar="MODEL_PATH",
         ),
     ],
-    weight_layout: Annotated[
-        WeightLayout | None,
-        Option(
-            help=(
-                "(EXPERIMENTAL) Order of dimensions in the weights of linear layers."
-                "\n\n\n\n"
-                "If set to AUTO, the layout will depend on the model."
-            ),
-            show_default="auto",
-        ),
-    ] = None,
 ) -> None:
-    if weight_layout is None:
-        weight_layout = WeightLayout.AUTO
     with Progress(
         SpinnerColumn(),
         TextColumn("[progress.description]{task.description}"),
         transient=True,
     ) as progress:
-        progress.add_task("🚀 [cyan]Loading model...[/cyan]")
-        model = LanguageModel.load(model_path
+        loading_task = progress.add_task("🚀 [cyan]Loading model...[/cyan]")
+        model = LanguageModel.load(model_path)
+        progress.remove_task(loading_task)
+        warmup_task = progress.add_task("🔥 Warming up compilation cache...")
+        list(model.stream_reply_text([UserMessage("")], max_output_length=1))
+        progress.remove_task(warmup_task)
+    console.print(f"🤖 Chatting with [blue]{model_path}[/blue]:")
     messages = []
     while True:
         user_text = console.input("[cyan]user> [/cyan]")
@@ -170,17 +180,6 @@ def convert(
             show_default="Native precision of the model",
         ),
     ] = None,
-    weight_layout: Annotated[
-        WeightLayout | None,
-        Option(
-            help=(
-                "(EXPERIMENTAL) Order of dimensions in the weights of linear layers."
-                "\n\n\n\n"
-                "If set to AUTO, the layout will depend on the model."
-            ),
-            show_default="auto",
-        ),
-    ] = None,
     output_dir: Annotated[
         Path | None,
         Option(
@@ -213,18 +212,10 @@ def convert(
     else:
         precision_dtype = None

-    if weight_layout is not None:
-        weight_layout = WeightLayout(weight_layout)
-    else:
-        weight_layout = WeightLayout.AUTO
-
     if output_dir is None:
         output_dir = DEFAULT_OUTPUT_DIR / model_repo.name

-
-    conversion_strs = [
-        f"⚙️ Using weight layout [cyan]{weight_layout}[/cyan]",
-    ]
+    conversion_strs = [f"🚀 Converting [cyan]{model_repo.name}[/cyan] by [cyan]{model_repo.vendor}[/cyan]"]
     if precision is not None:
         conversion_strs.append(
             f" and converting floating-point weights into [cyan]{precision.name.lower()}[/cyan] precision",
@@ -292,7 +283,7 @@ def convert(
         progress.remove_task(main_task)

         model.message_processor.tokenizer.save(str(output_dir / "tokenizer.json"))
-        weights = flatten_parameters(model.export_weights(
+        weights = flatten_parameters(model.export_weights())
         del model

         packed_weights = _pack_uint4_weights(weights)
@@ -312,10 +303,10 @@ def _model_size_string_to_int(
 ) -> float:
     match = _regex.match(size_str)
     factors = {
-        "K":
-        "M":
-        "B":
-        "T":
+        "K": 1000**1,
+        "M": 1000**2,
+        "B": 1000**3,
+        "T": 1000**4,
     }
     if match:
         return float(match.group("number")) * factors[match.group("suffix")]
@@ -368,5 +359,242 @@ def list_models(
     console.print(table)


+speculator_app = Typer()
+app.add_typer(speculator_app, name="speculator", help="Train a speculator for a model.")
+
+
+@speculator_app.command(help="Run model inference and collect traces for speculator training")
+def collect_traces(
+    model_path: Annotated[
+        Path,
+        Argument(
+            help="Path to the model directory",
+            metavar="MODEL_PATH",
+        ),
+    ],
+    dataset_path: Annotated[
+        Path,
+        Argument(
+            help="Path to the dataset with prompts",
+            metavar="DATASET_PATH",
+        ),
+    ],
+    output_path: Annotated[
+        Path,
+        Option(
+            help="File to save the trace to",
+            metavar="OUTPUT_PATH",
+        ),
+    ],
+    num_logits_per_token: Annotated[
+        int,
+        Option(help="Record logits for this number of most probable tokens"),
+    ] = 8,
+    max_input_length: Annotated[
+        int,
+        Option(help="Filter prompts that have more than this number of tokens in context"),
+    ] = 1024,
+    max_output_length: Annotated[
+        int,
+        Option(help="Maximum number of tokens to generate in one completion"),
+    ] = 1024,
+    batch_size: Annotated[
+        int,
+        Option(help="Number of sequences in one batch"),
+    ] = 1,
+    num_tokens_to_generate: Annotated[
+        int | None,
+        Option(
+            help="Exit early after generating this number of output tokens",
+            show_default="all",
+        ),
+    ] = None,
+) -> None:
+    with Live(refresh_per_second=10) as live:
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            transient=True,
+            disable=True,
+        ) as progress:
+            live.update(progress, refresh=True)
+            loading_model_task = progress.add_task("🧠 [cyan]Loading model...[/cyan]")
+            model = LanguageModel.load(model_path)
+            progress.remove_task(loading_model_task)
+
+            loading_dataset_task = progress.add_task("🗂️ [cyan]Loading dataset...[/cyan]")
+            dataset = iter(import_hf_parquet(dataset_path))
+            dataset = chain([next(dataset)], dataset)  # iterator is lazy, force it to actually open the file
+            progress.remove_task(loading_dataset_task)
+
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            MofNCompleteColumn(),
+            TimeElapsedColumn(),
+            TimeRemainingColumn(),
+            disable=True,
+        ) as progress:
+            live.update(progress, refresh=True)
+            inference_task = progress.add_task("🔮 [cyan]Running inference...[/cyan]", total=num_tokens_to_generate)
+
+            def progress_callback(event: CollectTracesEvent) -> None:
+                progress.update(inference_task, completed=event.tokens_generated)
+
+            traces = inference_collect_traces(
+                model,
+                dataset,
+                num_logits_per_token,
+                batch_size,
+                max_input_length,
+                max_output_length,
+                num_tokens_to_generate,
+                progress_callback,
+            )
+
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(output_path, "wb+") as output_fd:
+                for trace in traces:
+                    blob = trace.serialize()
+                    output_fd.write(blob)
+
+            progress.update(inference_task, description="✅ Completed")
+
+
+@speculator_app.command(help="Train a speculator from inference traces")
+def train(
+    trace_path: Annotated[
+        Path,
+        Argument(
+            help="File of llm inference traces to train the speculator on",
+            metavar="TRACE_PATH",
+        ),
+    ],
+    output_path: Annotated[
+        Path,
+        Option(
+            help="File to save the output to",
+            metavar="OUTPUT_PATH",
+        ),
+    ],
+    hashtable_size: Annotated[
+        int,
+        Option(help="Size of ngram hashtable"),
+    ] = 65536,
+    num_logits_per_token: Annotated[
+        int,
+        Option(help="Top K tokens to keep in ngram hashtable"),
+    ] = 8,
+    ngram_size: Annotated[
+        int,
+        Option(help="Length of ngrams"),
+    ] = 2,
+    subsample_size: Annotated[
+        int | None,
+        Option(
+            help="Exit early after training the model on this number of tokens",
+            show_default="all",
+        ),
+    ] = None,
+) -> None:
+    with open(trace_path, "rb") as trace_fd:
+        traces = LalamoCompletion.deserialize_many(trace_fd)
+
+        speculator = NGramSpeculator.new(hashtable_size, num_logits_per_token, ngram_size)
+
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            MofNCompleteColumn(),
+            TimeElapsedColumn(),
+            TimeRemainingColumn(),
+        ) as progress:
+            inference_task = progress.add_task("🔮 [cyan]Training speculator...[/cyan]", total=subsample_size)
+
+            def progress_callback(event: SpeculatorTrainingEvent) -> None:
+                progress.update(inference_task, completed=event.trained_tokens)
+
+            train_speculator(speculator, traces, subsample_size, progress_callback)
+
+            progress.update(inference_task, description="✅ Completed")
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "wb+") as fd:
+        fd.write(speculator.serialize())
+
+
+@speculator_app.command(help="Run speculator as an autoregressive llm")
+def test(
+    speculator_path: Annotated[
+        Path,
+        Argument(
+            help="Path to the speculator file.",
+            metavar="SPECULATOR_PATH",
+        ),
+    ],
+    model_path: Annotated[
+        Path,
+        Argument(
+            help="Path to the model directory for detokenization.",
+            metavar="MODEL_PATH",
+        ),
+    ],
+    seed: Annotated[
+        int | None,
+        Option(help="Set seed for deterministic sampling"),
+    ] = None,
+    num_sequences: Annotated[
+        int,
+        Option(help="Number of sequences to generate"),
+    ] = 8,
+) -> None:
+    model = LanguageModel.load(model_path)
+
+    with open(speculator_path, "rb") as fd:
+        speculator = NGramSpeculator.deserialize(fd.read())
+
+    table = Table(
+        show_header=False,
+        show_lines=True,
+        box=box.ROUNDED,
+    )
+
+    if seed is not None:
+        random.seed(seed)
+
+    for _ in range(num_sequences):
+        sequence = test_speculator(speculator)
+        detokenized = model.message_processor.detokenize(sequence)
+        table.add_row(detokenized)
+
+    console.print(table)
+
+
+@app.callback()
+def _profile_memory(
+    ctx: Context,
+    profile_memory: Annotated[
+        Path | None,
+        Option(
+            help="Record and save the XLA memory profile to specified path",
+            show_default="Don't save the XLA memory profile",
+            envvar="LALAMO_PROFILE_MEMORY",
+        ),
+    ] = None,
+) -> None:
+    if profile_memory is None:
+        return
+
+    if profile_memory.is_dir():
+        profile_memory /= "lalamo-memory.prof"
+
+    def _save_memory_profile() -> None:
+        console.print(f"Saving XLA memory profile to {profile_memory}")
+        jax.profiler.save_device_memory_profile(profile_memory)
+
+    ctx.call_on_close(_save_memory_profile)
+
+
 if __name__ == "__main__":
     app()
lalamo/message_processor.py
CHANGED
@@ -1,6 +1,7 @@
 import re
 from collections.abc import Iterable
 from dataclasses import dataclass
+from datetime import datetime
 from functools import cached_property
 from re import Pattern
 from typing import NotRequired, TypedDict
@@ -24,6 +25,10 @@ type ToolSchema = None  # WIP
 type Image = None  # WIP


+def _strftime_now(format_string: str) -> str:
+    return datetime.now().strftime(format_string)  # noqa: DTZ005
+
+
 class HuggingFaceMessage(TypedDict):
     role: str
     content: str
@@ -141,7 +146,7 @@ class MessageProcessor:

     def render_request(self, messages: Iterable[Message]) -> str:
         request_dict = self.request_to_dict(messages)
-        return self.prompt_template.render(request_dict)
+        return self.prompt_template.render({**request_dict, "strftime_now": _strftime_now})

     def parse_response(self, response: str) -> AssistantMessage:
         if self.output_parser_regex is None:
@@ -154,6 +159,11 @@ class MessageProcessor:
     def tokenize(self, text: str) -> list[int]:
         return self.tokenizer.encode(text, add_special_tokens=False).ids

+    def tokenize_request(self, messages: Iterable[Message]) -> list[int]:
+        rendered = self.render_request(messages)
+        tokenized = self.tokenize(rendered)
+        return tokenized
+
     def detokenize(self, tokens: list[int]) -> str:
         return self.tokenizer.decode(tokens, skip_special_tokens=False)
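The two additions above go together: newer HuggingFace chat templates can call strftime_now(...) to stamp the current date into the prompt, so render_request now injects that helper into the Jinja render context, and tokenize_request is a convenience that chains rendering and tokenization. A standalone sketch of the injection pattern using plain Jinja2 follows; the template string is illustrative and not taken from any lalamo model.

from datetime import datetime

from jinja2 import Template  # plain Jinja2, used only to illustrate the pattern


def _strftime_now(format_string: str) -> str:
    return datetime.now().strftime(format_string)


# A toy template in the style of chat templates that reference strftime_now.
template = Template("Today Date: {{ strftime_now('%d %b %Y') }}\n{{ text }}")

# Pass the helper alongside the request fields, exactly as render_request now does.
print(template.render({"text": "user: hello", "strftime_now": _strftime_now}))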
lalamo/model_import/common.py
CHANGED
@@ -1,6 +1,7 @@
 import importlib.metadata
 from collections import ChainMap
 from collections.abc import Callable
+from contextlib import ExitStack
 from dataclasses import dataclass
 from pathlib import Path
 from typing import NamedTuple
@@ -138,7 +139,13 @@ def import_message_processor(
         raise ValueError("Conflicting chat template specifications.")
     prompt_template = tokenizer_config.chat_template
     tokenizer = Tokenizer.from_file(str(tokenizer_file))
-
+
+    added_tokens = tokenizer_config.added_tokens()
+    added_special_tokens = [token for token in added_tokens if token.special]
+    added_not_special_tokens = [token for token in added_tokens if not token.special]
+    tokenizer.add_special_tokens(added_special_tokens)
+    tokenizer.add_tokens(added_not_special_tokens)
+
     message_processor_config = MessageProcessorConfig(
         prompt_template=prompt_template,
         output_parser_regex=model_spec.output_parser_regex,
@@ -171,14 +178,17 @@ def import_model(
         precision = foreign_decoder_config.default_precision

     weights_paths = download_weights(model_spec, progress_callback=progress_callback)
-
-
-
+    with ExitStack() as stack:
+        weights_shards = []
+        for weights_path in weights_paths:
+            weights_shard = stack.enter_context(model_spec.weights_type.load(weights_path, precision))
+            weights_shards.append(weights_shard)
+        weights_dict: ChainMap[str, Array] = ChainMap(*weights_shards)

-
-
+        if progress_callback is not None:
+            progress_callback(InitializingModelEvent())

-
+        decoder = foreign_decoder_config.load_decoder(context_length, precision, accumulation_precision, weights_dict)

     if progress_callback is not None:
         progress_callback(FinishedInitializingModelEvent())
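The rewritten loading path opens every weights shard inside a single ExitStack, so all shards stay open while the decoder is initialized and are closed together afterwards, and ChainMap presents them as one read-only mapping without copying any tensors. A self-contained sketch of that pattern with toy shards follows; the Shard class is illustrative, not a lalamo type.

from collections import ChainMap
from collections.abc import Mapping
from contextlib import ExitStack


class Shard:
    # Toy stand-in for a lazily opened weights shard (illustrative only).
    def __init__(self, name: str, tensors: Mapping[str, int]) -> None:
        self.name = name
        self.tensors = tensors

    def __enter__(self) -> Mapping[str, int]:
        print(f"opening {self.name}")
        return self.tensors

    def __exit__(self, *exc: object) -> None:
        print(f"closing {self.name}")


shard_files = [Shard("model-00001", {"a": 1}), Shard("model-00002", {"b": 2})]

with ExitStack() as stack:
    # Every shard stays open for the whole block; ExitStack closes them all on exit.
    shards = [stack.enter_context(shard) for shard in shard_files]
    # ChainMap searches the shards in order, without merging or copying them.
    weights = ChainMap(*shards)
    print(weights["a"], weights["b"])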

lalamo/model_import/decoder_configs/__init__.py
CHANGED

@@ -1,9 +1,11 @@
 from .common import ForeignConfig
+
 # from .executorch import ETLlamaConfig
 from .huggingface import (
     HFGemma2Config,
     HFGemma3Config,
     HFGemma3TextConfig,
+    HFGPTOssConfig,
     HFLlamaConfig,
     HFMistralConfig,
     HFQwen2Config,
@@ -13,6 +15,7 @@ from .huggingface import (
 __all__ = [
     # "ETLlamaConfig",
     "ForeignConfig",
+    "HFGPTOssConfig",
     "HFGemma2Config",
     "HFGemma3Config",
     "HFGemma3TextConfig",

lalamo/model_import/decoder_configs/executorch.py
CHANGED

@@ -1,3 +1,4 @@
+from collections.abc import Mapping
 from dataclasses import dataclass

 import jax.numpy as jnp
@@ -5,18 +6,18 @@ from jaxtyping import Array, DTypeLike

 from lalamo.model_import.loaders.executorch import load_executorch
 from lalamo.modules import (
-    Activation,
     AttentionConfig,
     Decoder,
     DecoderConfig,
     DecoderLayerConfig,
+    DenseMLPConfig,
     LlamaRoPEConfig,
-    MLPConfig,
     QLoRALinearConfig,
     QuantizedTiedEmbeddingConfig,
     RMSNormConfig,
     UpcastMode,
 )
+from lalamo.modules.activations import SiLU
 from lalamo.quantization import QuantizationMode

 from .common import ForeignConfig
@@ -58,7 +59,7 @@ class ExecutorchConfig(ForeignConfig):
     def _load_weights(
         cls,
         model: Decoder,
-        weights_dict:
+        weights_dict: Mapping[str, Array],
     ) -> Decoder:
         return load_executorch(model, weights_dict)

@@ -97,7 +98,7 @@ class ETLlamaConfig(ExecutorchConfig):

         embedding_config = QuantizedTiedEmbeddingConfig(
             input_scale=None,
-
+            logit_soft_cap=None,
             embedding_quantization_mode=EMBEDDING_QUANTIZATION_MODE,
             activation_quantization_mode=ACTIVATION_QUANTIZATION_MODE,
             activation_precision=activation_precision,
@@ -132,12 +133,17 @@ class ETLlamaConfig(ExecutorchConfig):
             query_norm_config=None,
             key_norm_config=None,
             logit_soft_cap=None,
+            has_sinks=False,
             has_qkv_biases=False,
             has_out_biases=False,
         )
-        mlp_config =
+        mlp_config = DenseMLPConfig(
             linear_config=linear_config,
-            activation=
+            activation=SiLU(),
+            has_up_biases=False,
+            has_down_biases=False,
+            up_clipping=None,
+            gate_clipping=None,
         )
         decoder_layer_config = DecoderLayerConfig(
             pre_attention_norm_config=rmsnorm_config,

lalamo/model_import/decoder_configs/huggingface/__init__.py
CHANGED

@@ -1,12 +1,14 @@
 from .common import HuggingFaceConfig
 from .gemma2 import HFGemma2Config
 from .gemma3 import HFGemma3Config, HFGemma3TextConfig
+from .gpt_oss import HFGPTOssConfig
 from .llama import HFLlamaConfig
 from .mistral import HFMistralConfig
 from .qwen2 import HFQwen2Config
 from .qwen3 import HFQwen3Config

 __all__ = [
+    "HFGPTOssConfig",
     "HFGemma2Config",
     "HFGemma3Config",
     "HFGemma3TextConfig",

lalamo/model_import/decoder_configs/huggingface/common.py
CHANGED

@@ -58,15 +58,13 @@ class GPTQQuantizationConfig:

 @dataclass(frozen=True)
 class HuggingFaceConfig(ForeignConfig):
-    torch_dtype: Literal["bfloat16", "float16", "float32"]
-
     @property
     def eos_token_ids(self) -> list[int]:
         return [self.eos_token_id] if isinstance(self.eos_token_id, int) else self.eos_token_id

     @property
     def default_precision(self) -> DTypeLike:
-        return jnp.dtype(self
+        return jnp.dtype(getattr(self, "torch_dtype", "bfloat16"))

     @classmethod
     def _load_weights(

lalamo/model_import/decoder_configs/huggingface/gemma2.py
CHANGED

@@ -4,17 +4,17 @@ from typing import Literal
 from jaxtyping import DTypeLike

 from lalamo.modules import (
-    Activation,
     AttentionConfig,
     DecoderConfig,
     DecoderLayerConfig,
+    DenseMLPConfig,
     FullPrecisionLinearConfig,
-    MLPConfig,
     RMSNormConfig,
     TiedEmbeddingConfig,
     UnscaledRoPEConfig,
     UpcastMode,
 )
+from lalamo.modules.activations import GELU

 from .common import HuggingFaceConfig

@@ -50,6 +50,7 @@ class HFGemma2Config(HuggingFaceConfig):
     transformers_version: str
     use_cache: bool
     vocab_size: int
+    torch_dtype: Literal["bfloat16", "float16", "float32"]

     def to_decoder_config(
         self,
@@ -64,7 +65,7 @@ class HFGemma2Config(HuggingFaceConfig):
         attention_scale = self.query_pre_attn_scalar**-0.5
         embedding_config = TiedEmbeddingConfig(
             input_scale=embedding_input_scale,
-
+            logit_soft_cap=self.final_logit_softcapping,
             precision=activation_precision,
         )
         rope_config = UnscaledRoPEConfig(
@@ -88,12 +89,17 @@ class HFGemma2Config(HuggingFaceConfig):
             query_norm_config=None,
             key_norm_config=None,
             logit_soft_cap=self.attn_logit_softcapping,
+            has_sinks=False,
             has_qkv_biases=self.attention_bias,
             has_out_biases=False,
         )
-        mlp_config =
+        mlp_config = DenseMLPConfig(
             linear_config=linear_config,
-            activation=
+            activation=GELU(),
+            has_up_biases=False,
+            has_down_biases=False,
+            up_clipping=None,
+            gate_clipping=None,
         )
         decoder_layer_config = DecoderLayerConfig(
             pre_attention_norm_config=rmsnorm_config,