lalamo 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lalamo/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from lalamo.model_import import REPO_TO_MODEL, ModelSpec, import_model
+ from lalamo.modules import Decoder
+
+ __version__ = "0.2.1"
+
+ __all__ = [
+     "REPO_TO_MODEL",
+     "Decoder",
+     "ModelSpec",
+     "import_model",
+ ]
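
Note: a minimal usage sketch of the re-exported API. The `import_model` argument names and return tuple below are inferred from the call site in `lalamo/main.py` further down, not from its definition; treat them as assumptions.

```python
from lalamo import REPO_TO_MODEL, import_model

# Look up the spec for a supported Hugging Face repo.
spec = REPO_TO_MODEL["google/gemma-3-1b-it"]

# Assumed signature, mirroring the call in lalamo/main.py.
model, metadata, tokenizer_file_paths = import_model(
    spec,
    precision=None,  # keep the model's native precision
    context_length=None,  # keep the model's native context length
)
```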
lalamo/common.py ADDED
@@ -0,0 +1,60 @@
+ from collections.abc import Iterable, Mapping
+
+ import jax.numpy as jnp
+ from jaxtyping import Array, DTypeLike
+
+ __all__ = [
+     "DEFAULT_PRECISION",
+     "ParameterDict",
+     "ParameterPath",
+ ]
+
+ DEFAULT_PRECISION: DTypeLike = jnp.bfloat16
+
+
+ type NestedParameters = Mapping[str, Array | NestedParameters] | Iterable[Array | NestedParameters]
+
+
+ class ParameterDict(dict[str, Array]):
+     def __init__(self, **kwargs: Array | NestedParameters | Iterable[Array | NestedParameters]) -> None:
+         super().__init__(self._flatten(kwargs))
+
+     def __setitem__(
+         self,
+         key: str,
+         value: Array | NestedParameters | Iterable[Array | NestedParameters],
+     ) -> None:
+         key = ParameterPath(key)
+
+         if isinstance(value, Array):
+             super().__setitem__(key, value)
+             return
+
+         for subkey, subvalue in self._flatten(value).items():
+             super().__setitem__(key / subkey, subvalue)
+
+     @classmethod
+     def _flatten(cls, nested_parameters: NestedParameters) -> dict[str, Array]:
+         result: dict[str, Array] = {}
+         if not isinstance(nested_parameters, Mapping):
+             nested_parameters = {str(i): value for i, value in enumerate(nested_parameters)}
+         for key, value in nested_parameters.items():
+             key_path = ParameterPath(key)
+             if isinstance(value, Array):
+                 result[key_path] = value
+             else:
+                 result.update({key_path / subkey: subvalue for subkey, subvalue in cls._flatten(value).items()})
+         return result
+
+
+ class ParameterPath(str):
+     __slots__ = ()
+
+     @property
+     def components(self) -> tuple[str, ...]:
+         return tuple(self.split("."))
+
+     def __truediv__(self, other: str | int) -> "ParameterPath":
+         if not self:
+             return ParameterPath(str(other))
+         return ParameterPath(self + "." + str(other))
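
Note: a minimal sketch of how `ParameterDict` flattening behaves, based on the `_flatten` logic above; the array shapes are illustrative.

```python
import jax.numpy as jnp

from lalamo.common import ParameterDict, ParameterPath

# Nested mappings and iterables flatten into dot-separated keys.
params = ParameterDict(
    embedding=jnp.zeros((4, 2)),
    layers=[{"weight": jnp.ones((2, 2))}, {"weight": jnp.ones((2, 2))}],
)
assert set(params) == {"embedding", "layers.0.weight", "layers.1.weight"}

# ParameterPath composes components with the `/` operator.
assert ParameterPath("layers") / 0 / "weight" == "layers.0.weight"
```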
lalamo/language_model.py ADDED
@@ -0,0 +1,263 @@
+ from abc import abstractmethod
+ from collections.abc import Iterable
+ from dataclasses import dataclass
+ from typing import NamedTuple
+
+ import equinox as eqx
+ import jax
+ import jax.numpy as jnp
+ from jaxtyping import Array, Bool, Float, Int, PRNGKeyArray
+
+ from lalamo.modules import Decoder, KVCache
+
+ __all__ = [
+     "BanTokensPolicy",
+     "CompositePolicy",
+     "GreedyPolicy",
+     "LanguageModel",
+     "SamplingPolicy",
+     "TemperaturePolicy",
+     "TopKPolicy",
+     "TopPPolicy",
+ ]
+
+
+ class SamplingPolicy(eqx.Module):
+     @abstractmethod
+     def process_logits(self, logits: Float[Array, " vocabulary"]) -> Float[Array, " vocabulary"]: ...
+
+     def __call__(self, logits: Float[Array, " vocabulary"], *, key: PRNGKeyArray) -> Int[Array, ""]:
+         return jax.random.categorical(key, self.process_logits(logits))
+
+
+ class GreedyPolicy(SamplingPolicy):
+     def process_logits(self, logits: Float[Array, " vocabulary"]) -> Float[Array, " vocabulary"]:
+         max_logit_value = jnp.max(logits)
+         return jnp.where(logits == max_logit_value, 1.0, -jnp.inf)
+
+
+ class TemperaturePolicy(SamplingPolicy):
+     temperature: float = eqx.field(static=True)
+
+     def process_logits(self, logits: Float[Array, " vocabulary"]) -> Float[Array, " vocabulary"]:
+         return logits / self.temperature
+
+
+ class TopKPolicy(SamplingPolicy):
+     k: int = eqx.field(static=True)
+
+     def process_logits(self, logits: Float[Array, " vocabulary"]) -> Float[Array, " vocabulary"]:
+         top_k_logits, _ = jax.lax.top_k(logits, self.k)
+         min_logit_val = jnp.min(top_k_logits)
+         return jnp.where(logits >= min_logit_val, logits, -jnp.inf)
+
+
+ class TopPPolicy(SamplingPolicy):
+     p: float = eqx.field(static=True)
+
+     def process_logits(self, logits: Float[Array, " vocabulary"]) -> Float[Array, " vocabulary"]:
+         sorted_indices = jnp.argsort(logits, descending=True)
+         sorted_logits = logits[sorted_indices]
+         cumulative_probs = jnp.cumsum(jax.nn.softmax(sorted_logits))
+
+         to_remove_sorted = jnp.roll(cumulative_probs > self.p, 1).at[0].set(False)
+         # Scatter the mask back from sorted order to the original token order.
+         to_remove = jnp.zeros_like(to_remove_sorted).at[sorted_indices].set(to_remove_sorted)
+
+         return jnp.where(to_remove, -jnp.inf, logits)
+
+
+ class BanTokensPolicy(SamplingPolicy):
+     banned_tokens: list[int] = eqx.field(static=True)
+
+     def process_logits(self, logits: Float[Array, " vocabulary"]) -> Float[Array, " vocabulary"]:
+         banned_tokens_indices = jnp.asarray(self.banned_tokens, dtype=jnp.int32)
+         return logits.at[banned_tokens_indices].set(-jnp.inf)
+
+
+ class CompositePolicy(SamplingPolicy):
+     policies: list[SamplingPolicy] = eqx.field(static=True)
+
+     def process_logits(self, logits: Float[Array, " vocabulary"]) -> Float[Array, " vocabulary"]:
+         for policy in self.policies:
+             logits = policy.process_logits(logits)
+         return logits
+
+
+ class PrefillResults(NamedTuple):
+     last_token_logits: Float[Array, " vocabulary"]
+     last_token_position: Int[Array, ""]
+     kv_cache: KVCache
+
+
+ class DecodingState(NamedTuple):
+     last_token_logits: Float[Array, " vocabulary"]
+     last_token_position: Int[Array, ""]
+     kv_cache: KVCache
+     stop_flag: Bool[Array, ""]
+
+
+ @dataclass(frozen=True)
+ class LanguageModel:
+     decoder: Decoder
+
+     def _prefill(
+         self,
+         token_ids: Int[Array, " tokens"],
+         length_without_padding: Int[Array, ""] | int | None = None,
+         kv_cache_capacity: int | None = None,
+     ) -> PrefillResults:
+         (num_tokens,) = token_ids.shape
+         token_positions = jnp.arange(num_tokens, dtype=jnp.int32)
+         if kv_cache_capacity is not None:
+             kv_cache = self.decoder.init_static_kv_cache(kv_cache_capacity)
+         else:
+             kv_cache = None
+
+         decoder_outputs = self.decoder(
+             token_ids,
+             token_positions,
+             kv_cache,
+             return_updated_kv_cache=True,
+             length_without_padding=length_without_padding,
+         )
+
+         if length_without_padding is not None:
+             last_logits_index = length_without_padding - 1
+         else:
+             last_logits_index = num_tokens - 1
+
+         last_token_logits = decoder_outputs.logits[last_logits_index, :]
+         last_token_position = jnp.array(last_logits_index, dtype=jnp.int32)
+
+         assert decoder_outputs.updated_kv_cache is not None
+         return PrefillResults(
+             last_token_logits=last_token_logits,
+             last_token_position=last_token_position,
+             kv_cache=decoder_outputs.updated_kv_cache,
+         )
+
+     def generate(
+         self,
+         prompt_token_ids: Int[Array, " prompt_tokens"],
+         sampling_policy: SamplingPolicy | None = None,
+         prompt_length_without_padding: Int[Array, ""] | int | None = None,
+         max_output_length: int = 8192,
+         eos_token_ids: Int[Array, " eos_tokens"] | None = None,
+         *,
+         key: PRNGKeyArray | None = None,
+     ) -> Int[Array, " response_tokens"]:
+         if sampling_policy is None:
+             sampling_policy = TemperaturePolicy(temperature=1.0)
+
+         (input_length,) = prompt_token_ids.shape
+         prefill_results = self._prefill(
+             prompt_token_ids,
+             prompt_length_without_padding,
+             input_length + max_output_length,
+         )
+
+         initial_state = DecodingState(
+             prefill_results.last_token_logits,
+             prefill_results.last_token_position,
+             prefill_results.kv_cache,
+             jnp.array(0, dtype=jnp.bool),
+         )
+
+         if key is None:
+             key = jax.random.PRNGKey(0)
+         keys = jax.random.split(key, num=max_output_length)
+
+         def loop_iteration(
+             state: DecodingState,
+             key: PRNGKeyArray,
+         ) -> tuple[DecodingState, Int[Array, ""]]:
+             def sample_and_update() -> tuple[DecodingState, Int[Array, ""]]:
+                 processed_logits = sampling_policy.process_logits(state.last_token_logits)
+                 next_token_id = jax.random.categorical(key, processed_logits)
+                 next_token_position = state.last_token_position + 1
+
+                 if eos_token_ids is not None:
+                     stop_flag = state.stop_flag | jnp.any(next_token_id == eos_token_ids)
+                 else:
+                     stop_flag = state.stop_flag
+
+                 decoder_outputs = self.decoder(
+                     next_token_id.reshape(1),
+                     next_token_position.reshape(1),
+                     state.kv_cache,
+                     return_updated_kv_cache=True,
+                 )
+                 assert decoder_outputs.updated_kv_cache is not None, "updated_kv_cache should not be None"
+                 new_state = DecodingState(
+                     decoder_outputs.logits.squeeze(),
+                     next_token_position,
+                     decoder_outputs.updated_kv_cache,
+                     stop_flag,
+                 )
+                 return new_state, next_token_id
+
+             def pad_and_repeat_state() -> tuple[DecodingState, Int[Array, ""]]:
+                 pad_token = jnp.array(0, dtype=jnp.int32)
+                 return state, pad_token
+
+             return jax.lax.cond(state.stop_flag, pad_and_repeat_state, sample_and_update)
+
+         _, tokens = jax.lax.scan(loop_iteration, initial_state, keys)
+
+         return tokens
+
+     def stream(
+         self,
+         prompt_token_ids: Int[Array, " prompt_tokens"],
+         sampling_policy: SamplingPolicy | None = None,
+         prompt_length_without_padding: Int[Array, ""] | int | None = None,
+         max_output_length: int = 8192,
+         eos_token_ids: Int[Array, " eos_tokens"] | None = None,
+         *,
+         key: PRNGKeyArray | None = None,
+     ) -> Iterable[Int[Array, ""]]:
+         if sampling_policy is None:
+             sampling_policy = TemperaturePolicy(temperature=1.0)
+
+         (input_length,) = prompt_token_ids.shape
+         prefill_results = self._prefill(
+             prompt_token_ids,
+             prompt_length_without_padding,
+             input_length + max_output_length,
+         )
+
+         if key is None:
+             key = jax.random.PRNGKey(0)
+         keys = jax.random.split(key, num=max_output_length)
+
+         state = DecodingState(
+             prefill_results.last_token_logits,
+             prefill_results.last_token_position,
+             prefill_results.kv_cache,
+             jnp.array(0, dtype=jnp.bool),
+         )
+
+         for iter_key in keys:
+             processed_logits = sampling_policy.process_logits(state.last_token_logits)
+             next_token_id = jax.random.categorical(iter_key, processed_logits)
+
+             yield next_token_id
+
+             if eos_token_ids is not None and jnp.any(next_token_id == eos_token_ids):
+                 return
+
+             next_token_position = state.last_token_position + 1
+             decoder_outputs = self.decoder(
+                 next_token_id.reshape(1),
+                 next_token_position.reshape(1),
+                 state.kv_cache,
+                 return_updated_kv_cache=True,
+             )
+             assert decoder_outputs.updated_kv_cache is not None, "updated_kv_cache should not be None"
+             state = DecodingState(
+                 decoder_outputs.logits.squeeze(),
+                 next_token_position,
+                 decoder_outputs.updated_kv_cache,
+                 state.stop_flag,
+             )
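
Note: a minimal sketch of composing the sampling policies defined above; the policy order and the logits values are illustrative, not a recommendation.

```python
import jax
import jax.numpy as jnp

from lalamo.language_model import CompositePolicy, TemperaturePolicy, TopKPolicy, TopPPolicy

# Policies apply left to right: temperature scaling, then top-k, then top-p.
policy = CompositePolicy(
    policies=[
        TemperaturePolicy(temperature=0.7),
        TopKPolicy(k=40),
        TopPPolicy(p=0.9),
    ],
)

logits = jnp.array([2.0, 1.0, 0.5, -1.0])  # stand-in for decoder output
token_id = policy(logits, key=jax.random.PRNGKey(0))
```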
lalamo/main.py ADDED
@@ -0,0 +1,299 @@
+ import json
+ import re
+ import shutil
+ import sys
+ from enum import Enum
+ from pathlib import Path
+ from typing import Annotated
+
+ import jax.numpy as jnp
+ import thefuzz.process
+ from click import Context as ClickContext
+ from click import Parameter as ClickParameter
+ from click import ParamType
+ from jaxtyping import DTypeLike
+ from rich import box
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.progress import Progress, SpinnerColumn, TextColumn
+ from rich.table import Table
+ from safetensors.flax import save_file
+ from typer import Argument, Exit, Option, Typer
+
+ from lalamo.model_import import REPO_TO_MODEL, ModelMetadata, ModelSpec, import_model
+ from lalamo.modules import WeightLayout, config_converter
+ from lalamo.utils import jax_uint4_to_packed_uint8
+
+ SCRIPT_NAME = Path(sys.argv[0]).name
+
+ DEFAULT_OUTPUT_DIR = Path("models")
+
+
+ class Precision(Enum):
+     FLOAT32 = "float32"
+     FLOAT16 = "float16"
+     BFLOAT16 = "bfloat16"
+
+
+ console = Console()
+ err_console = Console(stderr=True)
+ app = Typer(
+     rich_markup_mode="rich",
+     add_completion=False,
+     pretty_exceptions_show_locals=False,
+ )
+
+
+ class ModelParser(ParamType):
+     name: str = "Huggingface Model Repo"
+
+     def convert(self, value: str, param: ClickParameter | None, ctx: ClickContext | None) -> ModelSpec:
+         result = REPO_TO_MODEL.get(value)
+         if result is None:
+             closest_repo = _closest_repo(value)
+             error_message_parts = [
+                 f'"{value}".',
+             ]
+             if closest_repo:
+                 error_message_parts.append(
+                     f' Perhaps you meant "{closest_repo}"?',
+                 )
+             error_message_parts.append(
+                 f"\n\nUse the `{SCRIPT_NAME} list-models` command to see the list of currently supported models.",
+             )
+             error_message = "".join(error_message_parts)
+             self.fail(error_message, param, ctx)
+         return result
+
+
+ def _closest_repo(query: str, min_score: float = 80) -> str | None:
+     if not REPO_TO_MODEL:
+         return None
+     (closest_match, score), *_ = thefuzz.process.extract(query, list(REPO_TO_MODEL))
+     if closest_match and score >= min_score:
+         return closest_match
+     return None
+
+
+ def _error(message: str) -> None:
+     panel = Panel(message, box=box.ROUNDED, title="Error", title_align="left", border_style="red")
+     err_console.print(panel)
+     raise Exit(1)
+
+
+ def _pack_uint4_weights(weights: dict[str, jnp.ndarray]) -> dict[str, jnp.ndarray]:
+     packed_weights = {}
+     for key, value in weights.items():
+         if value.dtype == jnp.uint4:
+             packed_weights[key] = jax_uint4_to_packed_uint8(value)
+         else:
+             packed_weights[key] = value
+     return packed_weights
+
+
+ @app.command(help="Convert the model for use with the Uzu inference engine.")
+ def convert(
+     model_repo: Annotated[
+         ModelSpec,
+         Argument(
+             help=(
+                 "Huggingface model repo. Example: [cyan]'meta-llama/Llama-3.2-1B-Instruct'[/cyan]."
+                 "\n\n\n\n"
+                 f"You can use the [cyan]`{SCRIPT_NAME} list-models`[/cyan] command to get a list of supported models."
+             ),
+             click_type=ModelParser(),
+             show_default=False,
+             metavar="MODEL_REPO",
+             autocompletion=lambda: list(REPO_TO_MODEL),
+         ),
+     ],
+     precision: Annotated[
+         Precision | None,
+         Option(
+             help="Precision to use for activations and non-quantized weights.",
+             show_default="Native precision of the model",
+         ),
+     ] = None,
+     weight_layout: Annotated[
+         WeightLayout | None,
+         Option(
+             help=(
+                 "Order of dimensions in the weights of linear layers."
+                 "\n\n\n\n"
+                 "If set to AUTO, the layout will depend on the model."
+             ),
+             show_default="auto",
+         ),
+     ] = None,
+     output_dir: Annotated[
+         Path | None,
+         Option(
+             help="Directory to save the converted model to.",
+             show_default="Saves the converted model in the `models/<model_name>` directory",
+         ),
+     ] = None,
+     context_length: Annotated[
+         int | None,
+         Option(
+             help="Maximum supported context length. Used to precompute positional embeddings.",
+             show_default="Model's native maximum context length.",
+         ),
+     ] = None,
+     include_traces: Annotated[
+         bool,
+         Option(
+             help="Export activation traces for debugging purposes.",
+         ),
+     ] = False,
+     overwrite: Annotated[
+         bool,
+         Option(
+             help="Overwrite existing model files.",
+         ),
+     ] = False,
+ ) -> None:
+     if precision is not None:
+         precision_dtype = config_converter.structure(precision.value, DTypeLike)  # type: ignore
+     else:
+         precision_dtype = None
+
+     if weight_layout is not None:
+         weight_layout = WeightLayout(weight_layout)
+     else:
+         weight_layout = WeightLayout.AUTO
+
+     if output_dir is None:
+         output_dir = DEFAULT_OUTPUT_DIR / model_repo.name
+
+     console.print(f"🚀 Converting [cyan]{model_repo.name}[/cyan] by [cyan]{model_repo.vendor}[/cyan].")
+     conversion_strs = [
+         f"⚙️ Using weight layout [cyan]{weight_layout}[/cyan]",
+     ]
+     if precision is not None:
+         conversion_strs.append(
+             f" and converting floating-point weights into [cyan]{precision.name.lower()}[/cyan] precision",
+         )
+     conversion_strs.append(".")
+     console.print("".join(conversion_strs))
+
+     if output_dir.exists() and not overwrite:
+         answer = console.input(
+             rf"⚠️ Output directory [cyan]{output_dir}[/cyan] already exists."
+             r" Do you want to overwrite it? [cyan]\[y/n][/cyan]: ",
+         )
+         while answer.lower() not in ["y", "n", "yes", "no"]:
+             answer = console.input("Please enter 'y' or 'n': ")
+         if answer.lower() in ["y", "yes"]:
+             shutil.rmtree(output_dir)
+         else:
+             console.print("Exiting...")
+             raise Exit
+
+     with Progress(
+         SpinnerColumn(),
+         TextColumn("[progress.description]{task.description}"),
+         transient=True,
+     ) as progress:
+         progress.add_task("👨‍🍳 Cooking...")
+         model, metadata, tokenizer_file_paths = import_model(
+             model_repo,
+             precision=precision_dtype,
+             context_length=context_length,
+         )
+         progress.add_task(f"💾 Saving the model to {output_dir}")
+         output_dir.mkdir(parents=True, exist_ok=True)
+
+         weights = dict(model.export_weights(weight_layout))
+         packed_weights = _pack_uint4_weights(weights)
+         save_file(packed_weights, output_dir / "model.safetensors")
+
+         config_json = config_converter.unstructure(metadata, ModelMetadata)
+         with open(output_dir / "config.json", "w") as file:
+             json.dump(config_json, file, indent=4)
+
+         for path in tokenizer_file_paths:
+             shutil.copy(path, output_dir / path.name)
+
+         if include_traces:
+             progress.add_task("🚁 Generating traces...")
+
+             num_tokens = 512
+             token_stride = 8
+             token_ids = jnp.arange(0, num_tokens, dtype=jnp.int32)
+             token_positions = jnp.arange(0, num_tokens * token_stride, token_stride, dtype=jnp.int32)
+             result = model(
+                 token_ids,
+                 token_positions,
+                 return_updated_kv_cache=True,
+                 return_activation_trace=True,
+             )
+             traces = dict(result.export())
+             save_file(traces, output_dir / "traces.safetensors")
+
+     console.print(f"🧑‍🍳 Model successfully cooked and saved to [cyan]`{output_dir}`[/cyan]!")
+
+
+ def _model_size_string_to_int(
+     size_str: str,
+     _regex: re.Pattern = re.compile(r"(?P<number>(\d+)(\.\d*)?)(?P<suffix>[KMBT])"),
+ ) -> float:
+     match = _regex.match(size_str)
+     factors = {
+         "K": 1024**1,
+         "M": 1024**2,
+         "B": 1024**3,
+         "T": 1024**4,
+     }
+     if match:
+         return float(match.group("number")) * factors[match.group("suffix")]
+     raise ValueError(f"Invalid size string: {size_str}")
+
+
+ @app.command(help="List the supported models.")
+ def list_models(
+     plain: Annotated[
+         bool,
+         Option(
+             help="Only list repo names without fancy formatting.",
+         ),
+     ] = False,
+ ) -> None:
+     sorted_specs = sorted(
+         REPO_TO_MODEL.values(),
+         key=lambda spec: (
+             spec.vendor.lower(),
+             spec.family.lower(),
+             _model_size_string_to_int(spec.size),
+             spec.name.lower(),
+         ),
+     )
+
+     if plain:
+         for spec in sorted_specs:
+             console.print(spec.repo)
+         return
+
+     table = Table(
+         show_header=True,
+         header_style="bold",
+         show_lines=True,
+         box=box.ROUNDED,
+     )
+     table.add_column("Vendor", justify="left", style="magenta")
+     table.add_column("Family", justify="left", style="magenta", no_wrap=True)
+     table.add_column("Size", justify="right", style="magenta")
+     table.add_column("Quant", justify="left", style="magenta")
+     table.add_column("Repo", justify="left", style="cyan", no_wrap=True)
+     for spec in sorted_specs:
+         table.add_row(
+             spec.vendor,
+             spec.family,
+             spec.size,
+             str(spec.quantization),
+             spec.repo,
+         )
+     console.print(table)
+
+
+ if __name__ == "__main__":
+     app()
lalamo/quantization.py ADDED
@@ -0,0 +1,92 @@
+ from enum import Enum
+
+ from jax import numpy as jnp
+ from jaxtyping import Array, DTypeLike, Float
+
+ __all__ = ["QuantizationMode", "quantize_weights"]
+
+
+ class QuantizationMode(Enum):
+     UINT4 = "uint4"
+     INT8 = "int8"
+     UINT8 = "uint8"
+
+     @classmethod
+     def from_num_bits(cls, num_bits: int) -> "QuantizationMode":
+         bit_to_mode = {
+             4: cls.UINT4,
+             8: cls.UINT8,
+         }
+         if num_bits not in bit_to_mode:
+             raise ValueError(f"No quantization mode defined for {num_bits} bits")
+         return bit_to_mode[num_bits]
+
+     @property
+     def range(self) -> tuple[int, int]:
+         return MODE_TO_RANGE[self]
+
+     @property
+     def dtype(self) -> DTypeLike:
+         value_to_dtype = {
+             QuantizationMode.UINT4: jnp.uint4,
+             QuantizationMode.INT8: jnp.int8,
+             QuantizationMode.UINT8: jnp.uint8,
+         }
+         return value_to_dtype[self]
+
+     @property
+     def bits(self) -> int:
+         value_to_bits = {
+             QuantizationMode.UINT4: 4,
+             QuantizationMode.INT8: 8,
+             QuantizationMode.UINT8: 8,
+         }
+         return value_to_bits[self]
+
+     def __str__(self) -> str:
+         return self.value
+
+
+ MODE_TO_RANGE = {
+     QuantizationMode.UINT4: (0, 15),
+     QuantizationMode.INT8: (-128, 127),
+     QuantizationMode.UINT8: (0, 255),
+ }
+
+
+ def quantize_weights(x: Float[Array, "..."], mode: QuantizationMode) -> Float[Array, "..."]:
+     range_min, range_max = MODE_TO_RANGE[mode]
+     return jnp.clip(jnp.round(x), range_min, range_max)
+
+
+ def dynamically_quantize_activations(
+     x: Float[Array, " channels"],
+     mode: QuantizationMode,
+ ) -> Float[Array, " channels"]:
+     # Reference implementation: https://github.com/pytorch/pytorch/blob/2ccbacfa24cae724ec1ea3bc7de189e5bf948d46/torch/ao/quantization/fx/_decomposed.py#L790
+     range_min, range_max = mode.range
+     min_val = jnp.min(x)
+     max_val = jnp.max(x)
+     min_val_neg = jnp.minimum(min_val, 0)
+     max_val_pos = jnp.maximum(max_val, 0)
+
+     # Scale: spread the observed dynamic range across the quantized range.
+     scale = (max_val_pos - min_val_neg) / (range_max - range_min)
+     scale = jnp.maximum(scale, jnp.finfo(x.dtype).eps)
+
+     # Zero point: pick the endpoint with the smaller rounding error.
+     descaled_min = min_val_neg / scale
+     descaled_max = max_val_pos / scale
+     zero_point_from_min_error = range_min + descaled_min
+     zero_point_from_max_error = range_max + descaled_max
+     zero_point = jnp.where(
+         zero_point_from_min_error + zero_point_from_max_error > 0,
+         range_min - descaled_min,
+         range_max - descaled_max,
+     )
+     zero_point = jnp.round(jnp.clip(zero_point, range_min, range_max))
+     # Quantize, then immediately dequantize (fake quantization).
+     x_normalized = x / scale + zero_point
+     x_quantized = jnp.clip(jnp.round(x_normalized), range_min, range_max)
+
+     return (x_quantized - zero_point) * scale
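
Note: a minimal sketch of the fake-quantization round trip implemented by `dynamically_quantize_activations`; the input values are illustrative.

```python
import jax.numpy as jnp

from lalamo.quantization import QuantizationMode, dynamically_quantize_activations

x = jnp.array([-1.5, -0.2, 0.0, 0.7, 3.1], dtype=jnp.float32)
x_fake_quant = dynamically_quantize_activations(x, QuantizationMode.INT8)

# The output stays in float32, but every value is now expressible as
# (q - zero_point) * scale for an integer q in [-128, 127].
max_abs_error = jnp.max(jnp.abs(x - x_fake_quant))
```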
lalamo/utils.py ADDED
@@ -0,0 +1,55 @@
+ import einops
+ import jax.numpy as jnp
+ import torch.utils.dlpack
+ from jaxtyping import Array
+
+ __all__ = [
+     "jax_to_torch",
+     "jax_uint4_to_packed_uint8",
+     "torch_to_jax",
+ ]
+
+
+ @torch.no_grad()
+ def _torch_to_jax_bfloat16(tensor: torch.Tensor) -> Array:
+     # Credit: https://github.com/jax-ml/ml_dtypes/issues/81#issuecomment-2399636232
+     if tensor.dtype != torch.bfloat16:
+         raise ValueError("Trying to convert a non-bfloat16 tensor to bfloat16")
+     intermediate_tensor = tensor.view(torch.uint16)
+     return jnp.array(intermediate_tensor).view("bfloat16")
+
+
+ def torch_to_jax(array: torch.Tensor) -> Array:
+     array = array.detach().cpu()
+     if array.dtype == torch.bfloat16:
+         return _torch_to_jax_bfloat16(array)
+     return jnp.array(array.numpy())
+
+
+ def jax_to_torch(array: Array) -> torch.Tensor:
+     if array.dtype == jnp.bfloat16:
+         intermediate_array = array.view(jnp.uint16)
+         return torch.utils.dlpack.from_dlpack(intermediate_array).view(torch.bfloat16)
+     return torch.utils.dlpack.from_dlpack(array)
+
+
+ def jax_uint4_to_packed_uint8(array: Array) -> Array:
+     if array.dtype != jnp.uint4:
+         raise ValueError(f"Input array must have dtype jnp.uint4, but got {array.dtype}")
+
+     if not array.shape:
+         raise ValueError("Input array cannot be a scalar and must have at least one dimension.")
+
+     *_, last_dim = array.shape
+     if last_dim % 2 != 0:
+         raise ValueError(f"The last dimension of the input array must be even, but got shape {array.shape}")
+     # Element 0 of each pair becomes the low nibble, element 1 the high nibble.
+     low_nibbles, high_nibbles = einops.rearrange(
+         array.astype(jnp.uint8),
+         "... (dim_half two) -> two ... dim_half",
+         two=2,
+     )
+
+     packed = (high_nibbles << 4) | low_nibbles
+
+     return packed.astype(jnp.uint8)
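
Note: a minimal sketch of the nibble-packing convention used by `jax_uint4_to_packed_uint8`; pairs pack little-nibble-first.

```python
import jax.numpy as jnp

from lalamo.utils import jax_uint4_to_packed_uint8

nibbles = jnp.array([1, 2, 3, 4], dtype=jnp.uint4)
packed = jax_uint4_to_packed_uint8(nibbles)

# (1, 2) -> 0x21 and (3, 4) -> 0x43: the first element of each pair
# becomes the low nibble, the second the high nibble.
assert packed.tolist() == [0x21, 0x43]
```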
lalamo-0.2.1.dist-info/METADATA ADDED
@@ -0,0 +1,74 @@
+ Metadata-Version: 2.4
+ Name: lalamo
+ Version: 0.2.1
+ Summary: JAX library for optimization and export of models for use with the UZU inference engine.
+ Requires-Python: <4,>=3.12
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: cattrs>=24.1.2
+ Requires-Dist: click>=8.1.8
+ Requires-Dist: einops>=0.8.0
+ Requires-Dist: equinox>=0.11.11
+ Requires-Dist: huggingface-hub[hf-transfer]>=0.27.1
+ Requires-Dist: jax>=0.4.38; sys_platform == "darwin"
+ Requires-Dist: jax[cuda]>=0.4.38; sys_platform == "linux"
+ Requires-Dist: jaxtyping>=0.2.36
+ Requires-Dist: ml-dtypes>=0.5.1
+ Requires-Dist: optax>=0.2.4
+ Requires-Dist: rich>=14.0.0
+ Requires-Dist: thefuzz>=0.22.1
+ Requires-Dist: typer>=0.15.1
+ Dynamic: license-file
+
+ <p align="center">
+     <picture>
+         <img alt="Mirai" src="https://artifacts.trymirai.com/social/github/lalamo-header.jpg" style="max-width: 100%;">
+     </picture>
+ </p>
+
+ <a href="https://artifacts.trymirai.com/social/about_us.mp3"><img src="https://img.shields.io/badge/Listen-Podcast-red" alt="Listen to our podcast"></a>
+ <a href="https://docsend.com/v/76bpr/mirai2025"><img src="https://img.shields.io/badge/View-Deck-red" alt="View our deck"></a>
+ <a href="mailto:alexey@getmirai.co,dima@getmirai.co,aleksei@getmirai.co?subject=Interested%20in%20Mirai"><img src="https://img.shields.io/badge/Send-Email-green" alt="Contact us"></a>
+ <a href="https://docs.trymirai.com/components/models"><img src="https://img.shields.io/badge/Read-Docs-blue" alt="Read docs"></a>
+ [![License](https://img.shields.io/badge/License-MIT-blue)](LICENSE)
+
+ # lalamo
+
+ A set of tools for adapting Large Language Models to on-device inference using the [uzu](https://github.com/trymirai/uzu) inference engine.
+
+ ## Quick Start
+
+ To get the list of [supported models](https://trymirai.com/models), run:
+
+ ```bash
+ uv run lalamo list-models
+ ```
+
+ To convert a model, run:
+
+ ```bash
+ uv run lalamo convert MODEL_REPO --precision float16
+ ```
+
+ After that, you can find the converted model in the `models` folder. For more options, see `uv run lalamo convert --help`.
+
+ ## Model Support
+
+ To add support for a new model, write the corresponding [ModelSpec](lalamo/model_import/model_specs), as shown in the example below:
+
+ ```python
+ ModelSpec(
+     vendor="Google",
+     family="Gemma-3",
+     name="Gemma-3-1B-Instruct",
+     size="1B",
+     quantization=None,
+     repo="google/gemma-3-1b-it",
+     config_type=HFGemma3TextConfig,
+     config_file_name="config.json",
+     weights_file_names=huggingface_weight_files(1),
+     weights_type=WeightsType.SAFETENSORS,
+     tokenizer_files=HUGGINGFACE_TOKENIZER_FILES,
+     use_cases=tuple(),
+ )
+ ```
lalamo-0.2.1.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+ lalamo/__init__.py,sha256=uKBR6vAH2AmdpPqz1q2zVVwQyCpWRWUHAfm-uQg8DAM,217
+ lalamo/common.py,sha256=uYLw68V4AF3zlENG3KAIKRpOFXVHv8xX_n0cc3qJnj4,1877
+ lalamo/language_model.py,sha256=GiA_BDQuYCgVBFHljb_ltW_M7g3I1Siwm111M3Jc8MM,9286
+ lalamo/main.py,sha256=K2RLyTcxvBCP0teSsminssj_oUkuQAQ5y9ixa1uOqas,9546
+ lalamo/quantization.py,sha256=8o6ryIZLzzDYQuvBTboPfaVVdfijAKGpTxOcg3GKVD8,2752
+ lalamo/utils.py,sha256=QzkT0_82nd9pS5p0e7yOOdL_ZeKQr_Ftj4kFrWF35R8,1754
+ lalamo-0.2.1.dist-info/licenses/LICENSE,sha256=diHRfjSEJHD1nnEeMIfMRCjR3UERf8bT3eseD6b1ayA,1072
+ lalamo-0.2.1.dist-info/METADATA,sha256=1qDWPQiCYK_EIeff-oiaF7VeIksGNdZ4nCFikHXGJR4,2611
+ lalamo-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ lalamo-0.2.1.dist-info/entry_points.txt,sha256=qli7qTfnBk5WP10rOGXXEckHMtt-atJMDWd8jN89Uks,43
+ lalamo-0.2.1.dist-info/top_level.txt,sha256=VHvWL5JN5XRG36NsN_MieJ7EwRihEOrEjyDaTdFJ-aI,7
+ lalamo-0.2.1.dist-info/RECORD,,
lalamo-0.2.1.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
lalamo-0.2.1.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ lalamo = lalamo.main:app
lalamo-0.2.1.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Mirai Tech Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
lalamo-0.2.1.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ lalamo