lalamo 0.2.6__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lalamo-0.2.6 → lalamo-0.3.0}/PKG-INFO +6 -6
- {lalamo-0.2.6 → lalamo-0.3.0}/README.md +4 -6
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/__init__.py +1 -1
- lalamo-0.3.0/lalamo/common.py +110 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/language_model.py +106 -83
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/main.py +91 -18
- lalamo-0.3.0/lalamo/message_processor.py +170 -0
- lalamo-0.3.0/lalamo/model_import/common.py +227 -0
- {lalamo-0.2.6/lalamo/model_import/configs → lalamo-0.3.0/lalamo/model_import/decoder_configs}/__init__.py +0 -1
- {lalamo-0.2.6/lalamo/model_import/configs → lalamo-0.3.0/lalamo/model_import/decoder_configs}/common.py +11 -10
- {lalamo-0.2.6/lalamo/model_import/configs → lalamo-0.3.0/lalamo/model_import/decoder_configs}/huggingface/common.py +9 -4
- {lalamo-0.2.6/lalamo/model_import/configs → lalamo-0.3.0/lalamo/model_import/decoder_configs}/huggingface/gemma3.py +2 -2
- {lalamo-0.2.6/lalamo/model_import/configs → lalamo-0.3.0/lalamo/model_import/decoder_configs}/huggingface/llama.py +2 -2
- {lalamo-0.2.6/lalamo/model_import/configs → lalamo-0.3.0/lalamo/model_import/decoder_configs}/huggingface/mistral.py +1 -1
- {lalamo-0.2.6/lalamo/model_import/configs → lalamo-0.3.0/lalamo/model_import/decoder_configs}/huggingface/qwen2.py +1 -1
- {lalamo-0.2.6/lalamo/model_import/configs → lalamo-0.3.0/lalamo/model_import/decoder_configs}/huggingface/qwen3.py +1 -1
- lalamo-0.3.0/lalamo/model_import/huggingface_generation_config.py +44 -0
- lalamo-0.3.0/lalamo/model_import/huggingface_tokenizer_config.py +85 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/model_import/loaders/common.py +2 -1
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/model_import/loaders/huggingface.py +12 -10
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/model_import/model_specs/__init__.py +3 -2
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/model_import/model_specs/common.py +32 -33
- lalamo-0.3.0/lalamo/model_import/model_specs/deepseek.py +19 -0
- lalamo-0.3.0/lalamo/model_import/model_specs/gemma.py +53 -0
- lalamo-0.3.0/lalamo/model_import/model_specs/huggingface.py +18 -0
- lalamo-0.3.0/lalamo/model_import/model_specs/llama.py +44 -0
- lalamo-0.3.0/lalamo/model_import/model_specs/mistral.py +49 -0
- lalamo-0.3.0/lalamo/model_import/model_specs/pleias.py +18 -0
- lalamo-0.3.0/lalamo/model_import/model_specs/polaris.py +20 -0
- lalamo-0.3.0/lalamo/model_import/model_specs/qwen.py +237 -0
- lalamo-0.3.0/lalamo/model_import/model_specs/reka.py +19 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/modules/__init__.py +2 -1
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/modules/attention.py +90 -10
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/modules/common.py +51 -4
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/modules/decoder.py +90 -8
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/modules/decoder_layer.py +85 -8
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/modules/embedding.py +95 -29
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/modules/kv_cache.py +3 -3
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/modules/linear.py +170 -130
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/modules/mlp.py +40 -7
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/modules/normalization.py +24 -6
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/modules/rope.py +24 -6
- lalamo-0.3.0/lalamo/sampling.py +99 -0
- lalamo-0.3.0/lalamo/utils.py +112 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo.egg-info/PKG-INFO +6 -6
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo.egg-info/SOURCES.txt +17 -12
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo.egg-info/requires.txt +2 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/pyproject.toml +6 -1
- {lalamo-0.2.6 → lalamo-0.3.0}/tests/test_generation.py +15 -23
- {lalamo-0.2.6 → lalamo-0.3.0}/tests/test_huggingface_models.py +3 -3
- lalamo-0.3.0/tests/test_parameter_tree.py +103 -0
- lalamo-0.2.6/lalamo/common.py +0 -60
- lalamo-0.2.6/lalamo/model_import/common.py +0 -111
- lalamo-0.2.6/lalamo/model_import/model_specs/deepseek.py +0 -28
- lalamo-0.2.6/lalamo/model_import/model_specs/gemma.py +0 -76
- lalamo-0.2.6/lalamo/model_import/model_specs/huggingface.py +0 -28
- lalamo-0.2.6/lalamo/model_import/model_specs/llama.py +0 -100
- lalamo-0.2.6/lalamo/model_import/model_specs/mistral.py +0 -59
- lalamo-0.2.6/lalamo/model_import/model_specs/pleias.py +0 -28
- lalamo-0.2.6/lalamo/model_import/model_specs/polaris.py +0 -22
- lalamo-0.2.6/lalamo/model_import/model_specs/qwen.py +0 -336
- lalamo-0.2.6/lalamo/model_import/model_specs/reka.py +0 -28
- lalamo-0.2.6/lalamo/utils.py +0 -27
- {lalamo-0.2.6 → lalamo-0.3.0}/LICENSE +0 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/model_import/__init__.py +0 -0
- {lalamo-0.2.6/lalamo/model_import/configs → lalamo-0.3.0/lalamo/model_import/decoder_configs}/executorch.py +0 -0
- {lalamo-0.2.6/lalamo/model_import/configs → lalamo-0.3.0/lalamo/model_import/decoder_configs}/huggingface/__init__.py +0 -0
- {lalamo-0.2.6/lalamo/model_import/configs → lalamo-0.3.0/lalamo/model_import/decoder_configs}/huggingface/gemma2.py +0 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/model_import/loaders/__init__.py +0 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/model_import/loaders/executorch.py +0 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/modules/activations.py +0 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/modules/torch_interop.py +0 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/modules/utils.py +0 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo/quantization.py +0 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo.egg-info/dependency_links.txt +0 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo.egg-info/entry_points.txt +0 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/lalamo.egg-info/top_level.txt +0 -0
- {lalamo-0.2.6 → lalamo-0.3.0}/setup.cfg +0 -0

{lalamo-0.2.6 → lalamo-0.3.0}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lalamo
-Version: 0.2.6
+Version: 0.3.0
 Summary: JAX library for optimization and export of models for use with the UZU inference engine.
 Requires-Python: <4,>=3.12
 Description-Content-Type: text/markdown
@@ -13,10 +13,12 @@ Requires-Dist: huggingface-hub[hf-transfer]>=0.27.1
 Requires-Dist: jax>=0.4.38; sys_platform == "darwin"
 Requires-Dist: jax[cuda]>=0.4.38; sys_platform == "linux"
 Requires-Dist: jaxtyping>=0.2.36
+Requires-Dist: jinja2>=3.1.6
 Requires-Dist: ml-dtypes>=0.5.1
 Requires-Dist: optax>=0.2.4
 Requires-Dist: rich>=14.0.0
 Requires-Dist: thefuzz>=0.22.1
+Requires-Dist: tokenizers>=0.21.2
 Requires-Dist: typer>=0.15.1
 Requires-Dist: safetensors>=0.6.2
 Dynamic: license-file
@@ -48,9 +50,11 @@ uv run lalamo list-models
 To convert a model, run:

 ```bash
-uv run lalamo convert MODEL_REPO
+uv run lalamo convert MODEL_REPO
 ```

+Note: on some CPU platforms you may get an error saying `The precision 'F16_F16_F32' is not supported by dot_general on CPU`. This is due to a bug in XLA that causes matmuls inside `jax.jit` to work incorrectly on CPUs. The workaround is to set the environment variable `JAX_DISABLE_JIT=1` when running the conversion.
+
 After that, you can find the converted model in the `models` folder. For more options see `uv run lalamo convert --help`.

 ## Model Support
@@ -66,10 +70,6 @@ ModelSpec(
     quantization=None,
     repo="google/gemma-3-1b-it",
     config_type=HFGemma3TextConfig,
-    config_file_name="config.json",
-    weights_file_names=huggingface_weight_files(1),
     weights_type=WeightsType.SAFETENSORS,
-    tokenizer_files=HUGGINGFACE_TOKENIZER_FILES,
-    use_cases=tuple(),
 )
 ```
````
{lalamo-0.2.6 → lalamo-0.3.0}/README.md

````diff
@@ -25,9 +25,11 @@ uv run lalamo list-models
 To convert a model, run:

 ```bash
-uv run lalamo convert MODEL_REPO
+uv run lalamo convert MODEL_REPO
 ```

+Note: on some CPU platforms you may get an error saying `The precision 'F16_F16_F32' is not supported by dot_general on CPU`. This is due to a bug in XLA that causes matmuls inside `jax.jit` to work incorrectly on CPUs. The workaround is to set the environment variable `JAX_DISABLE_JIT=1` when running the conversion.
+
 After that, you can find the converted model in the `models` folder. For more options see `uv run lalamo convert --help`.

 ## Model Support
@@ -43,10 +45,6 @@ ModelSpec(
     quantization=None,
     repo="google/gemma-3-1b-it",
     config_type=HFGemma3TextConfig,
-    config_file_name="config.json",
-    weights_file_names=huggingface_weight_files(1),
     weights_type=WeightsType.SAFETENSORS,
-    tokenizer_files=HUGGINGFACE_TOKENIZER_FILES,
-    use_cases=tuple(),
 )
-```
+```
````
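A minimal sketch of the workaround described in the note above. Setting the variable from Python before JAX is first imported is an assumption on my part; the documented route is simply to export `JAX_DISABLE_JIT=1` in the shell before running `lalamo convert`.

```python
# Hedged sketch: disable jax.jit via the JAX_DISABLE_JIT environment variable,
# mirroring the CPU workaround mentioned in the README note.
import os

os.environ["JAX_DISABLE_JIT"] = "1"  # must be set before the first `import jax`

import jax
import jax.numpy as jnp

x = jnp.ones((8, 8), dtype=jnp.bfloat16)
# With JIT disabled this runs op-by-op, sidestepping the CPU dot_general precision error.
print(jax.jit(lambda a: a @ a)(x).dtype)
```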
lalamo-0.3.0/lalamo/common.py (new file)

````diff
@@ -0,0 +1,110 @@
+from collections import defaultdict
+from collections.abc import Mapping, Sequence
+from typing import cast
+
+import jax.numpy as jnp
+from jax._src.api import ShapeDtypeStruct
+from jaxtyping import Array, DTypeLike
+
+from lalamo.utils import MapDictValues, MapSequence
+
+__all__ = [
+    "DEFAULT_PRECISION",
+    "ArrayLike",
+    "ParameterPath",
+    "ParameterTree",
+    "dummy_array",
+    "flatten_parameters",
+    "unflatten_parameters",
+]
+
+DEFAULT_PRECISION: DTypeLike = jnp.bfloat16
+
+
+type ArrayLike = Array | ShapeDtypeStruct
+
+
+type ParameterTree[ArrayType: ArrayLike] = (
+    Mapping[str, ArrayType | ParameterTree[ArrayType]] | Sequence[ArrayType | ParameterTree[ArrayType]]
+)
+
+
+def dummy_array(shape: int | tuple[int, ...], dtype: DTypeLike) -> Array:
+    if isinstance(shape, int):
+        shape = (shape,)
+    return cast("Array", ShapeDtypeStruct(shape=shape, dtype=dtype))
+
+
+def flatten_parameters[ArrayType: ArrayLike](nested_parameters: ParameterTree[ArrayType]) -> dict[str, ArrayType]:
+    result: dict[str, ArrayType] = {}
+    if not isinstance(nested_parameters, Mapping):
+        nested_parameters = {str(i): value for i, value in enumerate(nested_parameters)}
+    for key, value in nested_parameters.items():
+        key_path = ParameterPath(key)
+        if isinstance(value, (Array, ShapeDtypeStruct)):
+            result[key_path] = value
+        else:
+            update: dict[str, ArrayType] = {
+                str(key_path / subkey): subvalue for subkey, subvalue in flatten_parameters(value).items()
+            }
+            result.update(update)
+    return result
+
+
+type KeyTree = Mapping[str, str | KeyTree] | Sequence[str | KeyTree]
+
+
+def _unflatten_keys(flat_keys: Mapping[str, str]) -> KeyTree:
+    groups: dict[str, dict[str, str] | str] = defaultdict(dict)
+    for subkey, full_key in flat_keys.items():
+        match subkey.split(".", maxsplit=1):
+            case [head]:
+                groups[head] = full_key
+            case [head, tail]:
+                group = groups[head]
+                assert isinstance(group, dict)
+                group[tail] = full_key
+
+    unflattened_groups: dict[str, KeyTree] = {}
+    for subkey, group in groups.items():
+        if isinstance(group, str):
+            unflattened_groups[subkey] = group
+        else:
+            unflattened_groups[subkey] = _unflatten_keys(group)
+
+    if any(key.isnumeric() for key in unflattened_groups):
+        assert set(unflattened_groups.keys()) == set(map(str, range(len(unflattened_groups))))
+        return [v for k, v in sorted(unflattened_groups.items(), key=lambda item: int(item[0]))]
+    return unflattened_groups
+
+
+def _recursive_map_dict[ArrayType: ArrayLike](
+    key_tree: KeyTree | str,
+    root_collection: Mapping[str, ArrayType],
+) -> ParameterTree[ArrayType] | ArrayType:
+    if isinstance(key_tree, str):
+        return root_collection[key_tree]
+    if isinstance(key_tree, Mapping):
+        return MapDictValues(lambda subtree: _recursive_map_dict(subtree, root_collection), key_tree)
+    if isinstance(key_tree, Sequence):
+        return MapSequence(lambda subtree: _recursive_map_dict(subtree, root_collection), key_tree)
+
+
+def unflatten_parameters[ArrayType: ArrayLike](flat_parameters: Mapping[str, ArrayType]) -> ParameterTree[ArrayType]:
+    unflattened_keys = _unflatten_keys({k: k for k in flat_parameters})
+    result = _recursive_map_dict(unflattened_keys, flat_parameters)
+    assert not isinstance(result, (Array, ShapeDtypeStruct))
+    return result
+
+
+class ParameterPath(str):
+    __slots__ = ()
+
+    @property
+    def components(self) -> tuple[str, ...]:
+        return tuple(self.split("."))
+
+    def __truediv__(self, other: str | int) -> "ParameterPath":
+        if not self:
+            return ParameterPath(str(other))
+        return ParameterPath(self + "." + str(other))
````
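The new `lalamo/common.py` centralizes parameter-tree handling. Below is a small illustrative sketch of the round trip between nested and flat parameter dictionaries, assuming the `MapDictValues`/`MapSequence` wrappers from `lalamo.utils` behave as ordinary read-only mappings and sequences; the nested dictionary is invented for the example.

```python
# Illustrative sketch of the parameter-tree helpers added in lalamo/common.py.
import jax.numpy as jnp

from lalamo.common import ParameterPath, flatten_parameters, unflatten_parameters

nested = {
    "embedding": {"weight": jnp.zeros((8, 4))},
    "layers": [{"mlp": {"weight": jnp.zeros((4, 4))}}],
}

# Keys become dot-separated paths, e.g. "embedding.weight" and "layers.0.mlp.weight".
flat = flatten_parameters(nested)
print(sorted(flat))

# unflatten_parameters rebuilds the nested structure (lists are recovered from numeric keys).
roundtrip = unflatten_parameters(flat)
print(roundtrip["layers"][0]["mlp"]["weight"].shape)  # (4, 4)

# ParameterPath is a str subclass with a "/" join operator.
path = ParameterPath("layers") / 0 / "mlp"
print(path.components)  # ('layers', '0', 'mlp')
```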
{lalamo-0.2.6 → lalamo-0.3.0}/lalamo/language_model.py

````diff
@@ -1,89 +1,28 @@
-
+import json
 from collections.abc import Iterable
-from dataclasses import dataclass
-from
+from dataclasses import dataclass, replace
+from pathlib import Path
+from typing import NamedTuple, Self

 import equinox as eqx
 import jax
 import jax.numpy as jnp
 from jaxtyping import Array, Bool, Float, Int, PRNGKeyArray
+from safetensors.flax import load_file
+from tokenizers import Tokenizer

-from lalamo.
+from lalamo.common import DTypeLike, ParameterTree, unflatten_parameters
+from lalamo.message_processor import AssistantMessage, Message, MessageProcessor, MessageProcessorConfig
+from lalamo.modules import Decoder, DecoderConfig, KVCache, LalamoModule, WeightLayout, config_converter
+from lalamo.sampling import SamplingPolicy, make_policy

 __all__ = [
-    "
-    "CompositePolicy",
-    "GreedyPolicy",
+    "GenerationConfig",
     "LanguageModel",
-    "
-    "TemperaturePolicy",
-    "TopKPolicy",
-    "TopPPolicy",
+    "LanguageModelConfig",
 ]


-class SamplingPolicy(eqx.Module):
-    @abstractmethod
-    def process_logits(self, logits: Float[Array, " vocabulary"]) -> Float[Array, " vocabulary"]: ...
-
-    def __call__(self, logits: Float[Array, " vocabulary"], *, key: PRNGKeyArray) -> Int[Array, ""]:
-        return jax.random.categorical(key, self.process_logits(logits))
-
-
-class GreedyPolicy(SamplingPolicy):
-    def process_logits(self, logits: Float[Array, " vocabulary"]) -> Float[Array, " vocabulary"]:
-        max_logit_value = jnp.max(logits)
-        return jnp.where(logits == max_logit_value, 1.0, -jnp.inf)
-
-
-class TemperaturePolicy(SamplingPolicy):
-    temperature: float = eqx.field(static=True)
-
-    def process_logits(self, logits: Float[Array, " vocabulary"]) -> Float[Array, " vocabulary"]:
-        return logits / self.temperature
-
-
-class TopKPolicy(SamplingPolicy):
-    k: int = eqx.field(static=True)
-
-    def process_logits(self, logits: Float[Array, " vocabulary"]) -> Float[Array, " vocabulary"]:
-        top_k_logits, _ = jax.lax.top_k(logits, self.k)
-        min_logit_val = jnp.min(top_k_logits)
-        return jnp.where(logits >= min_logit_val, logits, -jnp.inf)
-
-
-class TopPPolicy(SamplingPolicy):
-    p: float = eqx.field(static=True)
-
-    def process_logits(self, logits: Float[Array, " vocabulary"]) -> Float[Array, " vocabulary"]:
-        sorted_indices = jnp.argsort(logits, descending=True)
-        sorted_logits = logits[sorted_indices]
-        cumulative_probs = jnp.cumsum(jax.nn.softmax(sorted_logits))
-
-        to_remove = cumulative_probs > self.p
-        to_remove = jnp.roll(to_remove, 1)
-        to_remove = to_remove.at[0].set(False)
-
-        return jnp.where(to_remove, -jnp.inf, logits)
-
-
-class BanTokensPolicy(SamplingPolicy):
-    banned_tokens: list[int] = eqx.field(static=True)
-
-    def process_logits(self, logits: Float[Array, " vocabulary"]) -> Float[Array, " vocabulary"]:
-        banned_tokens_indices = jnp.asarray(self.banned_tokens, dtype=jnp.int32)
-        return logits.at[banned_tokens_indices].set(-jnp.inf)
-
-
-class CompositePolicy(SamplingPolicy):
-    policies: list[SamplingPolicy] = eqx.field(static=True)
-
-    def process_logits(self, logits: Float[Array, " vocabulary"]) -> Float[Array, " vocabulary"]:
-        for policy in self.policies:
-            logits = policy.process_logits(logits)
-        return logits
-
-
 class PrefillResults(NamedTuple):
     last_token_logits: Float[Array, " vocabulary"]
     last_token_position: Int[Array, ""]
@@ -98,9 +37,66 @@ class DecodingState(NamedTuple):


 @dataclass(frozen=True)
-class
+class GenerationConfig:
+    stop_token_ids: tuple[int, ...]
+    temperature: float | None
+    top_k: int | None
+    top_p: float | None
+    banned_tokens: tuple[int, ...] | None
+
+    def default_policy(self) -> SamplingPolicy:
+        return make_policy(self.temperature, self.top_k, self.top_p, self.banned_tokens)
+
+
+@dataclass(frozen=True)
+class LanguageModelConfig:
+    decoder_config: DecoderConfig
+    message_processor_config: MessageProcessorConfig
+    generation_config: GenerationConfig
+
+
+class LanguageModel(LalamoModule[LanguageModelConfig]):
     decoder: Decoder
+    message_processor: MessageProcessor = eqx.field(static=True)
+
+    @classmethod
+    def load(cls, path: Path | str, weight_layout: WeightLayout = WeightLayout.AUTO) -> Self:
+        if isinstance(path, str):
+            path = Path(path)
+        with open(path / "config.json") as config_file:
+            config_json = json.load(config_file)
+        config = config_converter.structure(config_json["model_config"], LanguageModelConfig)
+        weights = unflatten_parameters(load_file(path / "model.safetensors"))
+        decoder = config.decoder_config.empty().import_weights(weights, weight_layout)
+        tokenizer = Tokenizer.from_file(str(path / "tokenizer.json"))
+        message_processor = MessageProcessor(config.message_processor_config, tokenizer)
+        return cls(config, decoder, message_processor)
+
+    @property
+    def activation_precision(self) -> DTypeLike:
+        return self.decoder.activation_precision
+
+    def export_weights(self, weight_layout: WeightLayout = WeightLayout.AUTO) -> ParameterTree:
+        return self.decoder.export_weights(weight_layout)
+
+    def import_weights(
+        self,
+        weights: ParameterTree[Array],
+        weight_layout: WeightLayout = WeightLayout.AUTO,
+    ) -> Self:
+        return replace(
+            self,
+            decoder=self.decoder.import_weights(weights, weight_layout),
+        )
+
+    @property
+    def stop_token_ids(self) -> tuple[int, ...]:
+        return self.config.generation_config.stop_token_ids

+    def default_sampling_policy(self) -> SamplingPolicy:
+        return self.config.generation_config.default_policy()
+
+    @eqx.filter_jit
     def _prefill(
         self,
         token_ids: Int[Array, " tokens"],
@@ -137,7 +133,8 @@ class LanguageModel:
             kv_cache=decoder_outputs.updated_kv_cache,
         )

-
+    @eqx.filter_jit
+    def generate_tokens(
         self,
         prompt_token_ids: Int[Array, " prompt_tokens"],
         sampling_policy: SamplingPolicy | None = None,
@@ -148,7 +145,9 @@ class LanguageModel:
         key: PRNGKeyArray | None = None,
     ) -> Int[Array, " response_tokens"]:
         if sampling_policy is None:
-            sampling_policy =
+            sampling_policy = self.default_sampling_policy()
+        if eos_token_ids is None:
+            eos_token_ids = jnp.array(self.stop_token_ids, dtype=jnp.int32)

         (input_length,) = prompt_token_ids.shape
         prefill_results = self._prefill(
@@ -177,10 +176,7 @@ class LanguageModel:
             next_token_id = jax.random.categorical(key, processed_logits)
             next_token_position = state.last_token_position + 1

-
-                stop_flag = state.stop_flag | jnp.any(next_token_id == eos_token_ids)
-            else:
-                stop_flag = state.stop_flag
+            stop_flag = state.stop_flag | jnp.any(next_token_id == eos_token_ids)

             decoder_outputs = self.decoder(
                 next_token_id.reshape(1),
@@ -207,7 +203,32 @@ class LanguageModel:

         return tokens

-    def
+    def reply(
+        self,
+        messages: Iterable[Message],
+        sampling_policy: SamplingPolicy | None = None,
+        *,
+        key: PRNGKeyArray | None = None,
+    ) -> AssistantMessage:
+        formatted_messages = self.message_processor.render_request(messages)
+        token_ids = jnp.array(self.message_processor.tokenize(formatted_messages), dtype=jnp.int32)
+        response_ids = self.generate_tokens(token_ids, sampling_policy, key=key)
+        response_text = self.message_processor.detokenize(response_ids.tolist())
+        return self.message_processor.parse_response(response_text)
+
+    def stream_reply_text(
+        self,
+        messages: Iterable[Message],
+        sampling_policy: SamplingPolicy | None = None,
+        *,
+        key: PRNGKeyArray | None = None,
+    ) -> Iterable[str]:
+        formatted_messages = self.message_processor.render_request(messages)
+        token_ids = jnp.array(self.message_processor.tokenize(formatted_messages), dtype=jnp.int32)
+        for token_id in self.stream_tokens(token_ids, sampling_policy, key=key):
+            yield self.message_processor.detokenize([token_id.item()])
+
+    def stream_tokens(
         self,
         prompt_token_ids: Int[Array, " prompt_tokens"],
         sampling_policy: SamplingPolicy | None = None,
@@ -218,7 +239,9 @@ class LanguageModel:
         key: PRNGKeyArray | None = None,
     ) -> Iterable[Int[Array, ""]]:
         if sampling_policy is None:
-            sampling_policy =
+            sampling_policy = self.default_sampling_policy()
+        if eos_token_ids is None:
+            eos_token_ids = jnp.array(self.stop_token_ids, dtype=jnp.int32)

         (input_length,) = prompt_token_ids.shape
         prefill_results = self._prefill(
@@ -244,7 +267,7 @@ class LanguageModel:

             yield next_token_id

-            if
+            if jnp.any(next_token_id == eos_token_ids):
                 return

             next_token_position = state.last_token_position + 1
````
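The diff above replaces the old standalone sampling classes with a config-driven `LanguageModel` that bundles the decoder, tokenizer, and chat formatting. A minimal usage sketch of the new high-level API, assuming a model directory produced by `lalamo convert`; the path below is a placeholder.

```python
# Hedged sketch of the reworked LanguageModel API introduced in 0.3.0.
# The model directory is a placeholder; it should contain the config.json,
# model.safetensors, and tokenizer.json written by `lalamo convert`.
import jax

from lalamo.language_model import LanguageModel
from lalamo.message_processor import UserMessage

model = LanguageModel.load("models/gemma-3-1b-it")
messages = [UserMessage("Explain what a KV cache is in one sentence.")]

# One-shot reply using the default sampling policy from the model's GenerationConfig.
reply = model.reply(messages, key=jax.random.PRNGKey(0))
print(reply)

# Or stream the reply chunk by chunk, as the new `lalamo chat` command does.
for chunk in model.stream_reply_text(messages, key=jax.random.PRNGKey(1)):
    print(chunk, end="")
```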
{lalamo-0.2.6 → lalamo-0.3.0}/lalamo/main.py

````diff
@@ -20,7 +20,17 @@ from rich.table import Table
 from safetensors.flax import save_file
 from typer import Argument, Exit, Option, Typer

+from lalamo.common import flatten_parameters
+from lalamo.language_model import LanguageModel
+from lalamo.message_processor import UserMessage
 from lalamo.model_import import REPO_TO_MODEL, ModelMetadata, ModelSpec, import_model
+from lalamo.model_import.common import (
+    DownloadingFileEvent,
+    FinishedDownloadingFileEvent,
+    FinishedInitializingModelEvent,
+    InitializingModelEvent,
+    StatusEvent,
+)
 from lalamo.modules import WeightLayout, config_converter
 from lalamo.utils import jax_uint4_to_packed_uint8

@@ -91,6 +101,52 @@ def _pack_uint4_weights(weights: dict[str, jnp.ndarray]) -> dict[str, jnp.ndarray]:
     return packed_weights


+@app.command(help="Chat with a converted model.")
+def chat(
+    model_path: Annotated[
+        Path,
+        Argument(
+            help="Path to the model directory.",
+            metavar="MODEL_PATH",
+        ),
+    ],
+    weight_layout: Annotated[
+        WeightLayout | None,
+        Option(
+            help=(
+                "(EXPERIMENTAL) Order of dimensions in the weights of linear layers."
+                "\n\n\n\n"
+                "If set to AUTO, the layout will depend on the model."
+            ),
+            show_default="auto",
+        ),
+    ] = None,
+) -> None:
+    if weight_layout is None:
+        weight_layout = WeightLayout.AUTO
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        transient=True,
+    ) as progress:
+        progress.add_task("🚀 [cyan]Loading model...[/cyan]")
+        model = LanguageModel.load(model_path, weight_layout)
+    messages = []
+    while True:
+        user_text = console.input("[cyan]user> [/cyan]")
+        user_message = UserMessage(user_text)
+        messages.append(user_message)
+
+        console.print("[red]assistant> [/red]", end="")
+        model_response_tokens = []
+        for token in model.stream_reply_text(messages):
+            console.print(token, end="")
+            model_response_tokens.append(token)
+        console.print()
+        model_response_text = "".join(model_response_tokens)
+        messages.append(model.message_processor.parse_response(model_response_text))
+
+
 @app.command(help="Convert the model for use with the Uzu inference engine.")
 def convert(
     model_repo: Annotated[
@@ -118,7 +174,7 @@ def convert(
         WeightLayout | None,
         Option(
             help=(
-                "Order of dimensions in the weights of linear layers."
+                "(EXPERIMENTAL) Order of dimensions in the weights of linear layers."
                 "\n\n\n\n"
                 "If set to AUTO, the layout will depend on the model."
             ),
@@ -194,41 +250,58 @@ def convert(
         TextColumn("[progress.description]{task.description}"),
         transient=True,
     ) as progress:
-
-
+        event_to_task = {}
+
+        def progress_callback(event: StatusEvent) -> None:
+            match event:
+                case DownloadingFileEvent(file_spec):
+                    event_to_task[event] = progress.add_task(f"Retrieving {file_spec.filename}...")
+                case FinishedDownloadingFileEvent(file_spec):
+                    progress.remove_task(event_to_task[event])
+                case InitializingModelEvent():
+                    event_to_task[event] = progress.add_task("Initializing model...")
+                case FinishedInitializingModelEvent():
+                    progress.remove_task(event_to_task[event])
+
+        main_task = progress.add_task("👨‍🍳 Cooking...")
+        model, metadata = import_model(
             model_repo,
             precision=precision_dtype,
             context_length=context_length,
+            progress_callback=progress_callback,
         )
-        progress.add_task(f"💾 Saving the model to {output_dir}")
+        save_task = progress.add_task(f"💾 Saving the model to {output_dir}")
         output_dir.mkdir(parents=True, exist_ok=True)

-        weights = dict(model.export_weights(weight_layout))
-        packed_weights = _pack_uint4_weights(weights)
-        save_file(packed_weights, output_dir / "model.safetensors")
-
-        config_json = config_converter.unstructure(metadata, ModelMetadata)
-        with open(output_dir / "config.json", "w") as file:
-            json.dump(config_json, file, indent=4)
-
-        for path in tokenizer_file_paths:
-            shutil.copy(path, output_dir / path.name)
-
         if include_traces:
-            progress.add_task("🚁 Generating traces...")
+            trace_task = progress.add_task("🚁 Generating traces...")

             num_tokens = 512
             token_stride = 8
             token_ids = jnp.arange(0, num_tokens, dtype=jnp.int32)
             token_positions = jnp.arange(0, num_tokens * token_stride, token_stride, dtype=jnp.int32)
-            result = model(
+            result = model.decoder(
                 token_ids,
                 token_positions,
                 return_updated_kv_cache=True,
                 return_activation_trace=True,
             )
-            traces =
+            traces = flatten_parameters(result.export())
             save_file(traces, output_dir / "traces.safetensors")
+            progress.remove_task(trace_task)
+        progress.remove_task(main_task)
+
+        model.message_processor.tokenizer.save(str(output_dir / "tokenizer.json"))
+        weights = flatten_parameters(model.export_weights(weight_layout))
+        del model
+
+        packed_weights = _pack_uint4_weights(weights)
+        save_file(packed_weights, output_dir / "model.safetensors")
+
+        config_json = config_converter.unstructure(metadata, ModelMetadata)
+        with open(output_dir / "config.json", "w") as file:
+            json.dump(config_json, file, indent=4)
+        progress.remove_task(save_task)

     console.print(f"🧑‍🍳 Model successfully cooked and saved to [cyan]`{output_dir}`[/cyan]!")

````
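Both `chat` and `convert` fall back to the model's default sampling policy. Below is a hypothetical sketch of supplying a custom policy from Python instead, assuming `make_policy` accepts the same positional arguments that `GenerationConfig.default_policy()` passes above (temperature, top_k, top_p, banned_tokens); the model path is a placeholder.

```python
# Hypothetical override of the default sampling policy; the argument order for
# make_policy mirrors GenerationConfig.default_policy() shown in the diff above.
import jax

from lalamo.language_model import LanguageModel
from lalamo.message_processor import UserMessage
from lalamo.sampling import make_policy

model = LanguageModel.load("models/gemma-3-1b-it")  # placeholder path to a converted model
policy = make_policy(0.7, 50, 0.9, None)  # temperature, top_k, top_p, banned_tokens

reply = model.reply([UserMessage("Summarize the 0.3.0 changes in one line.")], policy, key=jax.random.PRNGKey(0))
print(reply)
```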