lalamo 0.5.2.tar.gz → 0.5.3.tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- {lalamo-0.5.2 → lalamo-0.5.3}/PKG-INFO +1 -1
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/__init__.py +3 -2
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/data/__init__.py +0 -1
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/data/huggingface_message.py +1 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/main.py +167 -18
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/message_processor.py +2 -3
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/common.py +120 -27
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/decoder_configs/__init__.py +4 -2
- lalamo-0.5.3/lalamo/model_import/decoder_configs/common.py +105 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/decoder_configs/executorch.py +14 -9
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/decoder_configs/huggingface/__init__.py +4 -2
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/decoder_configs/huggingface/common.py +38 -12
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/decoder_configs/huggingface/gemma2.py +15 -10
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/decoder_configs/huggingface/gemma3.py +19 -16
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +16 -10
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/decoder_configs/huggingface/llama.py +16 -11
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/decoder_configs/huggingface/llamba.py +23 -14
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/decoder_configs/huggingface/mistral.py +16 -11
- lalamo-0.5.3/lalamo/model_import/decoder_configs/huggingface/modern_bert.py +241 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/decoder_configs/huggingface/qwen2.py +17 -10
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/decoder_configs/huggingface/qwen3.py +15 -10
- lalamo-0.5.3/lalamo/model_import/loaders/__init__.py +8 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/loaders/executorch.py +24 -12
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/loaders/huggingface.py +258 -30
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/model_specs/__init__.py +4 -2
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/model_specs/common.py +8 -2
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/model_specs/gemma.py +5 -1
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/model_specs/huggingface.py +1 -1
- lalamo-0.5.3/lalamo/model_import/model_specs/mirai.py +20 -0
- lalamo-0.5.3/lalamo/models/__init__.py +10 -0
- lalamo-0.5.3/lalamo/models/common.py +81 -0
- {lalamo-0.5.2/lalamo → lalamo-0.5.3/lalamo/models}/language_model.py +32 -49
- lalamo-0.5.3/lalamo/models/router.py +59 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/__init__.py +33 -16
- lalamo-0.5.3/lalamo/modules/classifier.py +339 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/common.py +6 -3
- lalamo-0.5.3/lalamo/modules/decoder.py +208 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/mlp.py +28 -5
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/normalization.py +13 -8
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/token_mixers/attention.py +10 -6
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/token_mixers/state/kv_cache.py +14 -4
- lalamo-0.5.2/lalamo/modules/decoder.py → lalamo-0.5.3/lalamo/modules/transformer.py +75 -138
- lalamo-0.5.2/lalamo/modules/decoder_layer.py → lalamo-0.5.3/lalamo/modules/transformer_layer.py +62 -45
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/speculator/__init__.py +2 -0
- lalamo-0.5.3/lalamo/speculator/estimator.py +91 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/speculator/inference.py +28 -9
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/speculator/ngram.py +7 -3
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/speculator/utils.py +4 -2
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo.egg-info/PKG-INFO +1 -1
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo.egg-info/SOURCES.txt +12 -2
- {lalamo-0.5.2 → lalamo-0.5.3}/pyproject.toml +5 -0
- lalamo-0.5.3/tests/test_cartesia_mlx_models.py +22 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/tests/test_generation.py +6 -4
- lalamo-0.5.3/tests/test_huggingface_model_conversion.py +97 -0
- lalamo-0.5.3/tests/test_huggingface_models.py +41 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/tests/test_mlx_models.py +1 -1
- {lalamo-0.5.2 → lalamo-0.5.3}/tests/test_models.py +101 -65
- lalamo-0.5.2/lalamo/model_import/decoder_configs/common.py +0 -64
- lalamo-0.5.2/lalamo/model_import/loaders/__init__.py +0 -7
- lalamo-0.5.2/tests/test_huggingface_models.py +0 -24
- {lalamo-0.5.2 → lalamo-0.5.3}/LICENSE +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/README.md +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/common.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/data/lalamo_completions.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/data/utils.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/__init__.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/huggingface_generation_config.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/huggingface_tokenizer_config.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/loaders/common.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/loaders/utils.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/model_specs/deepseek.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/model_specs/gpt_oss.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/model_specs/llama.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/model_specs/llamba.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/model_specs/mistral.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/model_specs/pleias.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/model_specs/polaris.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/model_specs/qwen.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/model_import/model_specs/reka.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/activations.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/embedding.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/linear.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/mlx_interop.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/rope.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/token_mixers/__init__.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/token_mixers/common.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/token_mixers/mamba.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/token_mixers/state/__init__.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/token_mixers/state/common.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/token_mixers/state/mamba_state.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/torch_interop.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/modules/utils.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/quantization.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/registry_abc.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/sampling.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/speculator/common.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo/utils.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo.egg-info/dependency_links.txt +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo.egg-info/entry_points.txt +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo.egg-info/requires.txt +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/lalamo.egg-info/top_level.txt +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/setup.cfg +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/tests/test_model_spec.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/tests/test_moe.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/tests/test_parameter_tree.py +0 -0
- {lalamo-0.5.2 → lalamo-0.5.3}/tests/test_registry_abc.py +0 -0
```diff
--- lalamo-0.5.2/lalamo/__init__.py
+++ lalamo-0.5.3/lalamo/__init__.py
@@ -1,4 +1,3 @@
-from lalamo.language_model import LanguageModel
 from lalamo.message_processor import (
     AssistantMessage,
     ContentBlock,
@@ -9,8 +8,9 @@ from lalamo.message_processor import (
     UserMessage,
 )
 from lalamo.model_import import ModelSpec, import_model
+from lalamo.models import LanguageModel, Router

-__version__ = "0.5.2"
+__version__ = "0.5.3"

 __all__ = [
     "AssistantMessage",
@@ -19,6 +19,7 @@ __all__ = [
     "LanguageModel",
     "Message",
     "ModelSpec",
+    "Router",
     "SystemMessage",
     "ToolSchema",
     "UserMessage",
```
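In 0.5.3 the top-level package re-exports `LanguageModel` from the new `lalamo.models` subpackage and adds `Router`. A minimal sketch of the updated import surface, using only the exports visible above:

```python
# Minimal sketch of the 0.5.3 top-level API, based on the exports above.
from lalamo import LanguageModel, Router, __version__

assert __version__ == "0.5.3"
# Both classes now live in (and are re-exported from) lalamo.models.
```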
```diff
--- lalamo-0.5.2/lalamo/main.py
+++ lalamo-0.5.3/lalamo/main.py
@@ -4,12 +4,11 @@ import re
 import shutil
 import sys
 from enum import Enum
-from itertools import chain
+from itertools import chain, islice
 from pathlib import Path
 from typing import Annotated

 import jax
-import jax.numpy as jnp
 import jax.profiler
 import thefuzz.process
 from click import Context as ClickContext
@@ -35,7 +34,6 @@ from typer import Argument, Context, Exit, Option, Typer
 from lalamo.common import flatten_parameters
 from lalamo.data import import_hf_parquet
 from lalamo.data.lalamo_completions import LalamoCompletion
-from lalamo.language_model import LanguageModel
 from lalamo.message_processor import UserMessage
 from lalamo.model_import import REPO_TO_MODEL, ModelMetadata, ModelSpec, import_model
 from lalamo.model_import.common import (
@@ -45,10 +43,16 @@ from lalamo.model_import.common import (
     InitializingModelEvent,
     StatusEvent,
 )
+from lalamo.models import LanguageModelConfig, RouterConfig
 from lalamo.modules import config_converter
+from lalamo.speculator.estimator import EstimateBatchsizeFromMemoryEvent, estimate_batchsize_from_memory
 from lalamo.speculator.inference import CollectTracesEvent, inference_collect_traces
 from lalamo.speculator.ngram import NGramSpeculator
-from lalamo.speculator.utils import
+from lalamo.speculator.utils import (
+    SpeculatorTrainingEvent,
+    test_speculator,
+    train_speculator,
+)

 SCRIPT_NAME = Path(sys.argv[0]).name

@@ -123,7 +127,7 @@ def chat(
         transient=True,
     ) as progress:
         loading_task = progress.add_task("🚀 [cyan]Loading model...[/cyan]")
-        model =
+        model = LanguageModelConfig.load_model(model_path)
         progress.remove_task(loading_task)
         warmup_task = progress.add_task("🔥 Warming up compilation cache...")
         list(model.stream_reply_text([UserMessage("")], max_output_length=1))
@@ -145,6 +149,39 @@
         messages.append(model.message_processor.parse_response(model_response_text))


+@app.command(help="Classify given message with a Router type of model.")
+def classify(
+    model_path: Annotated[
+        Path,
+        Argument(
+            help="Path to the model directory.",
+            metavar="MODEL_PATH",
+        ),
+    ],
+) -> None:
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        transient=True,
+    ) as progress:
+        loading_task = progress.add_task("🚀 [cyan]Loading model...[/cyan]")
+        model = RouterConfig.load_model(model_path)
+        progress.remove_task(loading_task)
+        warmup_task = progress.add_task("🔥 Warming up...")
+        model.classify_chat([UserMessage(content="warmup message")])
+        progress.remove_task(warmup_task)
+    console.print(f"🤖 Classifying input with [blue]{model_path}[/blue]:")
+    while True:
+        user_text = console.input("[cyan]user> [/cyan]")
+        user_message = UserMessage(user_text)
+
+        console.print("[red]assistant> [/red]", end="")
+        result = model.classify_chat([user_message])
+        for label, confidence in result.items():
+            console.print(f"{label} : {confidence}", end="")
+        console.print()
+
+
 @app.command(help="Convert the model for use with the Uzu inference engine.")
 def convert(
     model_repo: Annotated[
@@ -194,6 +231,12 @@ def convert(
             help="Overwrite existing model files.",
         ),
     ] = False,
+    message_for_trace: Annotated[
+        str | None,
+        Option(
+            help="Text message to use as prompt when recording trace",
+        ),
+    ] = None,
 ) -> None:
     if precision is not None:
         precision_dtype = config_converter.structure(precision.value, DTypeLike)  # type: ignore
@@ -224,6 +267,8 @@
         console.print("Exiting...")
         raise Exit

+    message = None if message_for_trace is None else [UserMessage(content=message_for_trace)]
+
     with Progress(
         SpinnerColumn(),
         TextColumn("[progress.description]{task.description}"),
@@ -254,17 +299,7 @@

         if include_traces:
             trace_task = progress.add_task("🚁 Generating traces...")
-
-            num_tokens = 512
-            token_stride = 8
-            token_ids = jnp.arange(0, num_tokens, dtype=jnp.int32)[None, :]
-            token_positions = jnp.arange(0, num_tokens * token_stride, token_stride, dtype=jnp.int32)[None, :]
-            result = model.decoder(
-                token_ids,
-                token_positions,
-                return_updated_state=True,
-                return_activation_trace=True,
-            )
+            result = model.record_trace(message)
             traces = flatten_parameters(result.export())
             save_file(traces, output_dir / "traces.safetensors")
             progress.remove_task(trace_task)
@@ -350,6 +385,77 @@ speculator_app = Typer()
 app.add_typer(speculator_app, name="speculator", help="Train a speculator for a model.")


+@speculator_app.command(help="Estimate maximum batch size at which a model can be run.")
+def estimate_batchsize(
+    model_path: Annotated[
+        Path,
+        Argument(
+            help="Path to the model directory",
+            metavar="MODEL_PATH",
+        ),
+    ],
+    max_input_length: Annotated[
+        int,
+        Option(help="Max input length of a model."),
+    ] = 1024,
+    max_output_length: Annotated[
+        int,
+        Option(help="Max output length of a model."),
+    ] = 1024,
+    num_logits_per_token: Annotated[
+        int,
+        Option(help="Number of top logits that will be recorded."),
+    ] = 8,
+    vram_gb: Annotated[
+        int | None,
+        Option(
+            help="Maximum vram size in gb allowed.",
+            show_default="max on default device",
+        ),
+    ] = None,
+) -> None:
+    if vram_gb is not None:
+        mem = vram_gb * 1024 * 1024 * 1024
+    else:
+        memory_stats = jax.local_devices()[0].memory_stats()
+        if memory_stats is None:
+            err_console.print("Cannot get the default device's memory stats, use --vram-gb")
+            raise Exit(1)
+        if "bytes_limit" not in memory_stats:
+            err_console.print("Cannot get the default device's bytes limit, use --vram-gb")
+            raise Exit(1)
+        mem = memory_stats["bytes_limit"]
+
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        transient=True,
+    ) as progress:
+        loading_model_task = progress.add_task("[cyan]Loading model...[/cyan]")
+        model = LanguageModelConfig.load_model(model_path)
+        progress.remove_task(loading_model_task)
+
+        estimating_batchsize_task = progress.add_task("[cyan]Estimating batch size...[/cyan]")
+
+        def progress_callback(event: EstimateBatchsizeFromMemoryEvent) -> None:
+            lo = str(event.lo)
+            hi = str(event.hi) if event.hi is not None else "?"
+            description = f"[cyan]Estimating batch size... ({lo}..{hi})[/cyan]"
+            progress.update(estimating_batchsize_task, description=description)
+
+        bs = estimate_batchsize_from_memory(
+            model,
+            max_input_length,
+            max_output_length,
+            num_logits_per_token,
+            mem,
+            progress_callback,
+        )
+        progress.remove_task(estimating_batchsize_task)
+
+    console.print(f"Found maximum batch size: [cyan]{bs}[/cyan]")
+
+
 @speculator_app.command(help="Run model inference and collect traces for speculator training")
 def collect_traces(
     model_path: Annotated[
@@ -406,7 +512,7 @@ def collect_traces(
     ) as progress:
         live.update(progress, refresh=True)
         loading_model_task = progress.add_task("🧠 [cyan]Loading model...[/cyan]")
-        model =
+        model = LanguageModelConfig.load_model(model_path)
         progress.remove_task(loading_model_task)

         loading_dataset_task = progress.add_task("🗂️ [cyan]Loading dataset...[/cyan]")
@@ -448,6 +554,49 @@ def collect_traces(
         progress.update(inference_task, description="✅ Completed")


+@speculator_app.command(help="View model inference traces")
+def view_traces(
+    trace_path: Annotated[
+        Path,
+        Argument(
+            help="File of inference traces to view.",
+            metavar="TRACE_PATH",
+        ),
+    ],
+    model_path: Annotated[
+        Path,
+        Argument(
+            help="Path to the model directory for detokenization.",
+            metavar="MODEL_PATH",
+        ),
+    ],
+    num_completions: Annotated[
+        int | None,
+        Option(
+            help="Number of completions to show.",
+        ),
+    ] = None,
+) -> None:
+    model = LanguageModelConfig.load_model(model_path)
+
+    with open(trace_path, "rb") as trace_fd:
+        traces = LalamoCompletion.deserialize_many(trace_fd)
+
+    table = Table(
+        show_lines=True,
+        box=box.ROUNDED,
+    )
+    table.add_column("Prefix")
+    table.add_column("Completion")
+
+    for completion in islice(traces, num_completions):
+        detokenized_prefix = model.message_processor.detokenize(completion.prefix_token_ids)
+        detokenized_completion = model.message_processor.detokenize(completion.completion_token_ids)
+        table.add_row(detokenized_prefix, detokenized_completion)
+
+    console.print(table)
+
+
 @speculator_app.command(help="Train a speculator from inference traces")
 def train(
     trace_path: Annotated[
@@ -535,7 +684,7 @@ def test(
         Option(help="Number of sequences to generate"),
     ] = 8,
 ) -> None:
-    model =
+    model = LanguageModelConfig.load_model(model_path)

     with open(speculator_path, "rb") as fd:
         speculator = NGramSpeculator.deserialize(fd.read())
```
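The new `classify` command is a thin interactive wrapper over the `Router` API. A hedged sketch of the equivalent library calls, using only names that appear in this diff (`RouterConfig.load_model`, `classify_chat`, `UserMessage`); the model directory path is hypothetical:

```python
# Hedged sketch: programmatic equivalent of the new `classify` command.
# "models/my-router" is a hypothetical path, not part of this release.
from pathlib import Path

from lalamo.message_processor import UserMessage
from lalamo.models import RouterConfig

model = RouterConfig.load_model(Path("models/my-router"))
scores = model.classify_chat([UserMessage("Where is my order?")])
for label, confidence in scores.items():  # label-to-confidence mapping
    print(f"{label}: {confidence}")
```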
```diff
--- lalamo-0.5.2/lalamo/message_processor.py
+++ lalamo-0.5.3/lalamo/message_processor.py
@@ -156,13 +156,12 @@ class MessageProcessor:
             raise ValueError(f"Invalid response format: {response}")
         return AssistantMessage(**match.groupdict())

-    def
+    def tokenize_text(self, text: str) -> list[int]:
         return self.tokenizer.encode(text, add_special_tokens=False).ids

     def tokenize_request(self, messages: Iterable[Message]) -> list[int]:
         rendered = self.render_request(messages)
-
-        return tokenized
+        return self.tokenize_text(rendered)

     def detokenize(self, tokens: list[int]) -> str:
         return self.tokenizer.decode(tokens, skip_special_tokens=False)
```
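The `MessageProcessor` change splits plain-text tokenization into a public `tokenize_text` helper, so `tokenize_request` is now just `render_request` followed by `tokenize_text`. A hedged sketch of that relationship, assuming `processor` is an already-constructed `MessageProcessor` (building one requires a tokenizer and config not shown here):

```python
# Hedged sketch; `processor` is assumed to be a constructed MessageProcessor.
from lalamo.message_processor import UserMessage

messages = [UserMessage("hi")]
request_ids = processor.tokenize_request(messages)
# tokenize_request == render_request piped through the new tokenize_text:
assert request_ids == processor.tokenize_text(processor.render_request(messages))
```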
```diff
--- lalamo-0.5.2/lalamo/model_import/common.py
+++ lalamo-0.5.3/lalamo/model_import/common.py
@@ -13,14 +13,16 @@ from jax import Array
 from jaxtyping import DTypeLike
 from tokenizers import Tokenizer

-from lalamo.language_model import GenerationConfig, LanguageModel, LanguageModelConfig
 from lalamo.message_processor import MessageProcessor, MessageProcessorConfig
-from lalamo.
+from lalamo.models import GenerationConfig, LanguageModel, LanguageModelConfig, Router, RouterConfig
+from lalamo.modules import Classifier, Decoder, LalamoModule
 from lalamo.quantization import QuantizationMode

+from .decoder_configs import ForeignClassifierConfig, ForeignConfig, ForeignLMConfig
 from .huggingface_generation_config import HFGenerationConfig
 from .huggingface_tokenizer_config import HFTokenizerConfig
-from .model_specs import REPO_TO_MODEL, FileSpec, ModelSpec, UseCase
+from .model_specs import REPO_TO_MODEL, FileSpec, ModelSpec, ModelType, UseCase
+from .model_specs.common import JSONFieldSpec

 __all__ = [
     "REPO_TO_MODEL",
@@ -29,6 +31,7 @@ __all__ = [
     "InitializingModelEvent",
     "ModelMetadata",
     "ModelSpec",
+    "ModelType",
     "StatusEvent",
     "import_model",
 ]
@@ -68,7 +71,8 @@ class ModelMetadata:
     quantization: QuantizationMode | None
     repo: str
     use_cases: tuple[UseCase, ...]
-
+    model_type: ModelType
+    model_config: LanguageModelConfig | RouterConfig


 def download_file(
@@ -114,7 +118,7 @@ def download_config_file(


 class ImportResults(NamedTuple):
-    model: LanguageModel
+    model: LanguageModel | Router
     metadata: ModelMetadata


@@ -166,26 +170,14 @@ def import_message_processor(
     return MessageProcessor(config=message_processor_config, tokenizer=tokenizer)


-def
-    model_spec: ModelSpec
-
+def _load_main_processing_module(
+    model_spec: ModelSpec,
+    precision: DTypeLike,
+    foreign_config: ForeignConfig,
+    progress_callback: Callable[[StatusEvent], None] | None = None,
     context_length: int | None = None,
-    precision: DTypeLike | None = None,
     accumulation_precision: DTypeLike = jnp.float32,
-
-) -> ImportResults:
-    if isinstance(model_spec, str):
-        try:
-            model_spec = REPO_TO_MODEL[model_spec]
-        except KeyError as e:
-            raise ValueError(f"Unknown model: {model_spec}") from e
-
-    foreign_decoder_config_file = download_config_file(model_spec)
-    foreign_decoder_config = model_spec.config_type.from_json(foreign_decoder_config_file)
-
-    if precision is None:
-        precision = foreign_decoder_config.default_precision
-
+) -> LalamoModule:
     weights_paths = download_weights(model_spec, progress_callback=progress_callback)
     with ExitStack() as stack:
         weights_shards = []
@@ -200,7 +192,7 @@ def import_model(
         if progress_callback is not None:
             progress_callback(InitializingModelEvent())

-
+        processing_module = foreign_config.load(
             context_length,
             precision,
             accumulation_precision,
@@ -208,6 +200,33 @@
             metadata_dict,
         )

+    return processing_module
+
+
+def _import_language_model(
+    model_spec: ModelSpec,
+    *,
+    context_length: int | None = None,
+    precision: DTypeLike | None = None,
+    accumulation_precision: DTypeLike = jnp.float32,
+    progress_callback: Callable[[StatusEvent], None] | None = None,
+) -> tuple[LanguageModel, LanguageModelConfig]:
+    foreign_decoder_config_file = download_config_file(model_spec)
+    foreign_decoder_config = model_spec.config_type.from_json(foreign_decoder_config_file)
+    assert isinstance(foreign_decoder_config, ForeignLMConfig)
+
+    if precision is None:
+        precision = foreign_decoder_config.default_precision
+    decoder = _load_main_processing_module(
+        model_spec,
+        precision,
+        foreign_decoder_config,
+        progress_callback,
+        context_length,
+        accumulation_precision,
+    )
+    assert isinstance(decoder, Decoder)
+
     if progress_callback is not None:
         progress_callback(FinishedInitializingModelEvent())

@@ -235,12 +254,85 @@
     )

     language_model_config = LanguageModelConfig(
-
+        model_config=decoder.config,
         message_processor_config=message_processor.config,
         generation_config=generation_config,
     )

     language_model = LanguageModel(language_model_config, decoder, message_processor)
+    return language_model, language_model_config
+
+
+def _import_router(
+    model_spec: ModelSpec,
+    *,
+    context_length: int | None = None,
+    precision: DTypeLike | None = None,
+    accumulation_precision: DTypeLike = jnp.float32,
+    progress_callback: Callable[[StatusEvent], None] | None = None,
+) -> tuple[Router, RouterConfig]:
+    foreign_classifier_config_file = download_config_file(model_spec)
+    foreign_classifier_config = model_spec.config_type.from_json(foreign_classifier_config_file)
+    assert isinstance(foreign_classifier_config, ForeignClassifierConfig)
+
+    if precision is None:
+        precision = foreign_classifier_config.default_precision
+
+    classifier = _load_main_processing_module(
+        model_spec,
+        precision,
+        foreign_classifier_config,
+        progress_callback,
+        context_length,
+        accumulation_precision,
+    )
+    assert isinstance(classifier, Classifier)
+
+    if progress_callback is not None:
+        progress_callback(FinishedInitializingModelEvent())
+
+    message_processor = import_message_processor(model_spec)
+
+    router_config = RouterConfig(
+        model_config=classifier.config,
+        message_processor_config=message_processor.config,
+    )
+    router_model = Router(router_config, classifier, message_processor)
+    return router_model, router_config
+
+
+def import_model(
+    model_spec: ModelSpec | str,
+    *,
+    context_length: int | None = None,
+    precision: DTypeLike | None = None,
+    accumulation_precision: DTypeLike = jnp.float32,
+    progress_callback: Callable[[StatusEvent], None] | None = None,
+) -> ImportResults:
+    if isinstance(model_spec, str):
+        try:
+            model_spec = REPO_TO_MODEL[model_spec]
+        except KeyError as e:
+            raise ValueError(f"Unknown model: {model_spec}") from e
+
+    match model_spec.model_type:
+        case ModelType.LANGUAGE_MODEL:
+            model, config = _import_language_model(
+                model_spec,
+                context_length=context_length,
+                precision=precision,
+                accumulation_precision=accumulation_precision,
+                progress_callback=progress_callback,
+            )
+        case ModelType.ROUTER_MODEL:
+            model, config = _import_router(
+                model_spec,
+                context_length=context_length,
+                precision=precision,
+                accumulation_precision=accumulation_precision,
+                progress_callback=progress_callback,
+            )
+
     metadata = ModelMetadata(
         toolchain_version=LALAMO_VERSION,
         vendor=model_spec.vendor,
@@ -250,6 +342,7 @@ def import_model(
         quantization=model_spec.quantization,
         repo=model_spec.repo,
         use_cases=model_spec.use_cases,
-
+        model_type=model_spec.model_type,
+        model_config=config,
     )
-    return ImportResults(
+    return ImportResults(model, metadata)
```
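`import_model` now dispatches on `ModelSpec.model_type`, so `ImportResults.model` may be either a `LanguageModel` or a `Router`, with the resolved config recorded in the metadata. A hedged sketch of a call site (the repo id is hypothetical):

```python
# Hedged sketch of the reworked entry point; the repo id is hypothetical.
from lalamo.model_import import import_model

model, metadata = import_model("vendor/some-model")
# model is a LanguageModel or a Router depending on the spec's model_type:
print(type(model).__name__, metadata.model_type)
```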
```diff
--- lalamo-0.5.2/lalamo/model_import/decoder_configs/__init__.py
+++ lalamo-0.5.3/lalamo/model_import/decoder_configs/__init__.py
@@ -1,4 +1,4 @@
-from .common import ForeignConfig
+from .common import ForeignClassifierConfig, ForeignConfig, ForeignLMConfig

 # from .executorch import ETLlamaConfig
 from .huggingface import (
@@ -14,8 +14,10 @@ from .huggingface import (
 )

 __all__ = [
-
+    "ForeignClassifierConfig",
     "ForeignConfig",
+    # "ETLlamaConfig",
+    "ForeignLMConfig",
     "HFGPTOssConfig",
     "HFGemma2Config",
     "HFGemma3Config",
```
```diff
--- /dev/null
+++ lalamo-0.5.3/lalamo/model_import/decoder_configs/common.py
@@ -0,0 +1,105 @@
+import json
+from abc import abstractmethod
+from collections.abc import Mapping
+from dataclasses import dataclass
+from pathlib import Path
+from typing import ClassVar, Self
+
+import cattrs
+from jaxtyping import Array, DTypeLike
+
+from lalamo.modules import ClassifierConfig, DecoderConfig
+from lalamo.modules.common import LalamoModule
+from lalamo.registry_abc import RegistryABC
+
+__all__ = ["ForeignClassifierConfig", "ForeignLMConfig"]
+
+
+@dataclass(frozen=True)
+class ForeignConfig[ConfigT: DecoderConfig | ClassifierConfig](RegistryABC):
+    _converter: ClassVar[cattrs.Converter] = cattrs.Converter()
+    _converter.register_structure_hook(int | list[int], lambda v, _: v)
+
+    @property
+    @abstractmethod
+    def default_precision(self) -> DTypeLike: ...
+
+    @classmethod
+    def from_json(cls, json_path: Path | str) -> Self:
+        json_path = Path(json_path)
+        with open(json_path) as f:
+            config = json.load(f)
+        return cls._converter.structure(config, cls)
+
+    @abstractmethod
+    def _load_weights(
+        self,
+        model: LalamoModule,
+        weights_dict: Mapping[str, Array],
+    ) -> LalamoModule: ...
+
+    @abstractmethod
+    def to_lalamo_config(
+        self,
+        context_length: int | None,
+        activation_precision: DTypeLike,
+        accumulation_precision: DTypeLike,
+        metadata_dict: Mapping[str, str],
+    ) -> ConfigT: ...
+
+    def load(
+        self,
+        context_length: int | None,
+        activation_precision: DTypeLike,
+        accumulation_precision: DTypeLike,
+        weights_dict: Mapping[str, Array],
+        metadata_dict: Mapping[str, str],
+    ) -> LalamoModule[ConfigT]:
+        config = self.to_lalamo_config(context_length, activation_precision, accumulation_precision, metadata_dict)
+        model = config.empty()
+        return self._load_weights(model, weights_dict)
+
+
+@dataclass(frozen=True)
+class ForeignLMConfig(ForeignConfig, RegistryABC):
+    @abstractmethod
+    def to_decoder_config(
+        self,
+        context_length: int | None,
+        activation_precision: DTypeLike,
+        accumulation_precision: DTypeLike,
+        metadata_dict: Mapping[str, str],
+    ) -> DecoderConfig: ...
+
+    @property
+    @abstractmethod
+    def eos_token_ids(self) -> list[int]: ...
+
+    def to_lalamo_config(
+        self,
+        context_length: int | None,
+        activation_precision: DTypeLike,
+        accumulation_precision: DTypeLike,
+        metadata_dict: Mapping[str, str],
+    ) -> DecoderConfig:
+        return self.to_decoder_config(context_length, activation_precision, accumulation_precision, metadata_dict)
+
+
+@dataclass(frozen=True)
+class ForeignClassifierConfig(ForeignConfig, RegistryABC):
+    @abstractmethod
+    def to_classifier_config(
+        self,
+        context_length: int | None,
+        activation_precision: DTypeLike,
+        accumulation_precision: DTypeLike,
+    ) -> ClassifierConfig: ...
+
+    def to_lalamo_config(
+        self,
+        context_length: int | None,
+        activation_precision: DTypeLike,
+        accumulation_precision: DTypeLike,
+        metadata_dict: Mapping[str, str],  # noqa: ARG002
+    ) -> ClassifierConfig:
+        return self.to_classifier_config(context_length, activation_precision, accumulation_precision)
```
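This new module is the extension point for foreign checkpoint formats: a concrete subclass maps a foreign JSON config onto a lalamo config (`to_decoder_config` / `to_classifier_config`) and loads foreign weights into the empty module (`_load_weights`). A hedged skeleton of a hypothetical classifier-side subclass; the class name, fields, and bodies are illustrative only, not part of this release:

```python
# Hedged skeleton of a hypothetical ForeignClassifierConfig subclass.
from collections.abc import Mapping
from dataclasses import dataclass

import jax.numpy as jnp
from jaxtyping import Array, DTypeLike

from lalamo.model_import.decoder_configs import ForeignClassifierConfig
from lalamo.modules import ClassifierConfig
from lalamo.modules.common import LalamoModule


@dataclass(frozen=True)
class MyClassifierConfig(ForeignClassifierConfig):  # hypothetical
    hidden_size: int
    num_labels: int

    @property
    def default_precision(self) -> DTypeLike:
        return jnp.float32

    def to_classifier_config(
        self,
        context_length: int | None,
        activation_precision: DTypeLike,
        accumulation_precision: DTypeLike,
    ) -> ClassifierConfig:
        # Would translate the foreign fields into a lalamo ClassifierConfig.
        raise NotImplementedError

    def _load_weights(
        self,
        model: LalamoModule,
        weights_dict: Mapping[str, Array],
    ) -> LalamoModule:
        # Would rename/copy the foreign weights into the freshly built module.
        raise NotImplementedError
```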