lalamo 0.6.5__tar.gz → 0.6.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lalamo-0.6.5 → lalamo-0.6.6}/PKG-INFO +1 -1
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/__init__.py +1 -1
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/commands.py +247 -14
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/common.py +27 -48
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/data/__init__.py +3 -2
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/data/huggingface_message.py +4 -5
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/main.py +274 -9
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/message_processor.py +19 -1
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/common.py +17 -1
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/mistral.py +5 -0
- lalamo-0.6.6/lalamo/model_import/remote_registry.py +44 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/models/__init__.py +3 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/models/common.py +22 -0
- lalamo-0.6.6/lalamo/models/compile_helpers.py +58 -0
- lalamo-0.6.6/lalamo/models/language_model.py +638 -0
- lalamo-0.6.6/lalamo/models/lm_helpers.py +198 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/decoder.py +4 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/token_mixers/mamba.py +345 -105
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/speculator/__init__.py +0 -2
- lalamo-0.6.6/lalamo/speculator/inference.py +75 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo.egg-info/PKG-INFO +1 -1
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo.egg-info/SOURCES.txt +3 -1
- {lalamo-0.6.5 → lalamo-0.6.6}/pyproject.toml +3 -3
- lalamo-0.6.5/lalamo/models/language_model.py +0 -352
- lalamo-0.6.5/lalamo/speculator/estimator.py +0 -127
- lalamo-0.6.5/lalamo/speculator/inference.py +0 -112
- {lalamo-0.6.5 → lalamo-0.6.6}/LICENSE +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/README.md +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/data/lalamo_completions.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/data/utils.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/__init__.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/__init__.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/common.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/executorch.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/huggingface/__init__.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/huggingface/common.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/huggingface/gemma2.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/huggingface/gemma3.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/huggingface/lfm2.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/huggingface/llama.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/huggingface/llamba.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/huggingface/mistral.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/huggingface/modern_bert.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/huggingface/qwen2.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/decoder_configs/huggingface/qwen3.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/huggingface_generation_config.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/huggingface_tokenizer_config.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/loaders/__init__.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/loaders/common.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/loaders/executorch.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/loaders/huggingface.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/loaders/utils.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/__init__.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/common.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/deepseek.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/essential_ai.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/gemma.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/gpt_oss.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/huggingface.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/lfm2.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/llama.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/llamba.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/mirai.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/pleias.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/polaris.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/qwen.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/model_import/model_specs/reka.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/models/classifier.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/__init__.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/activations.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/classifier.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/common.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/embedding.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/linear.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/mlp.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/mlx_interop.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/normalization.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/rope.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/token_mixers/__init__.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/token_mixers/attention.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/token_mixers/common.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/token_mixers/short_conv.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/token_mixers/state/__init__.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/token_mixers/state/common.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/token_mixers/state/kv_cache.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/token_mixers/state/mamba_state.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/token_mixers/state/short_conv_state.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/torch_interop.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/transformer.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/transformer_layer.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/modules/utils.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/quantization.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/registry_abc.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/safetensors.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/sampling.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/speculator/common.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/speculator/ngram.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/speculator/utils.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo/utils.py +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo.egg-info/dependency_links.txt +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo.egg-info/entry_points.txt +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo.egg-info/requires.txt +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/lalamo.egg-info/top_level.txt +0 -0
- {lalamo-0.6.5 → lalamo-0.6.6}/setup.cfg +0 -0
{lalamo-0.6.5 → lalamo-0.6.6}/lalamo/commands.py

@@ -1,16 +1,22 @@
 import json
+import shutil
+import tempfile
 from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from enum import Enum
 from itertools import chain
 from pathlib import Path

+import polars as pl
+import requests
+import thefuzz.process
 from jaxtyping import DTypeLike

-from lalamo.common import flatten_parameters
-from lalamo.data import
+from lalamo.common import flatten_parameters, get_default_device_bytes
+from lalamo.data import load_hf_parquet, shuffle_dataset
+from lalamo.data.huggingface_message import HFMessage
 from lalamo.data.lalamo_completions import LalamoCompletion
-from lalamo.message_processor import Message
+from lalamo.message_processor import AssistantMessage, Message
 from lalamo.model_import import ModelMetadata, ModelSpec, import_model
 from lalamo.model_import.common import (
     DownloadingFileEvent,
@@ -20,15 +26,107 @@ from lalamo.model_import.common import (
     InitializingModelEvent,
     StatusEvent,
 )
+from lalamo.model_import.remote_registry import RegistryModel, RegistryModelFile
 from lalamo.models import LanguageModelConfig
+from lalamo.models.common import BatchSizesComputedEvent, InferenceConfig
+from lalamo.models.lm_helpers import estimate_batchsize_from_bytes
 from lalamo.modules import config_converter
 from lalamo.safetensors import safe_write
-from lalamo.speculator.estimator import EstimateBatchsizeFromMemoryEvent, estimate_batchsize_from_memory
 from lalamo.speculator.inference import CollectTracesEvent, inference_collect_traces
 from lalamo.speculator.ngram import NGramSpeculator
 from lalamo.speculator.utils import SpeculatorTrainingEvent, train_speculator


+@dataclass
+class PullCallbacks:
+    model_spec: RegistryModel
+    output_dir: Path
+    overwrite: bool
+
+    def started(self) -> None:
+        pass
+
+    def output_dir_exists(self) -> None:
+        raise RuntimeError(f"{self.output_dir=} already exists, refusing to overwrite!")
+
+    def downloading(self, file_spec: RegistryModelFile) -> None:
+        pass
+
+    def finished_downloading(self, file_spec: RegistryModelFile) -> None:
+        pass
+
+    def finished(self) -> None:
+        pass
+
+
+def _download_file(url: str, dest_path: Path) -> None:
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()
+
+    with open(dest_path, "wb") as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                f.write(chunk)
+
+
+def _suggest_similar_models(query: str, available_models: list[RegistryModel], limit: int = 3) -> list[str]:
+    repo_ids = [m.repo_id for m in available_models]
+    matches = thefuzz.process.extract(query, repo_ids, limit=limit)
+    return [match[0] for match in matches if match[1] >= 50]
+
+
+def pull(
+    model_spec: RegistryModel,
+    output_dir: Path,
+    callbacks_type: Callable[
+        [
+            RegistryModel,
+            Path,
+            bool,
+        ],
+        PullCallbacks,
+    ] = PullCallbacks,
+    overwrite: bool = False,
+) -> None:
+    callbacks = callbacks_type(model_spec, output_dir, overwrite)
+
+    if output_dir.exists():
+        callbacks.output_dir_exists()
+
+    callbacks.started()
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_path = Path(temp_dir)
+
+        for file_spec in model_spec.files:
+            callbacks.downloading(file_spec)
+
+            # Security: validate filename to prevent path traversal attacks
+            safe_name = Path(file_spec.name).name
+            if not safe_name or safe_name != file_spec.name:
+                raise RuntimeError(
+                    f"Invalid filename from registry: {file_spec.name!r}. "
+                    f"Filenames must not contain path separators or traversal sequences.",
+                )
+
+            file_path = temp_path / safe_name
+            try:
+                _download_file(file_spec.url, file_path)
+            except requests.RequestException as e:
+                raise RuntimeError(f"Failed to download {safe_name}: {e}") from e
+
+            callbacks.finished_downloading(file_spec)
+
+        output_dir.mkdir(parents=True, exist_ok=True)
+        for file_spec in model_spec.files:
+            safe_name = Path(file_spec.name).name
+            src = temp_path / safe_name
+            dst = output_dir / safe_name
+            shutil.move(str(src), str(dst))
+
+    callbacks.finished()
+
+
 class Precision(Enum):
     FLOAT32 = "float32"
     FLOAT16 = "float16"
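The new pull command routes every stage through an overridable PullCallbacks dataclass, so front-ends can render progress however they like. A minimal sketch of wiring in custom hooks; PrintingPullCallbacks and the output path are illustrative, and the RegistryModel value is assumed to come from the remote registry added in this release:

from pathlib import Path

from lalamo.commands import PullCallbacks, pull
from lalamo.model_import.remote_registry import RegistryModel, RegistryModelFile


class PrintingPullCallbacks(PullCallbacks):
    def downloading(self, file_spec: RegistryModelFile) -> None:
        print(f"downloading {file_spec.name} from {file_spec.url}")

    def finished_downloading(self, file_spec: RegistryModelFile) -> None:
        print(f"done: {file_spec.name}")


model_spec: RegistryModel = ...  # obtained from the remote registry, e.g. matched by repo_id
pull(model_spec, Path("models/my-model"), callbacks_type=PrintingPullCallbacks)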
@@ -244,16 +342,19 @@ def estimate_batchsize(
     model = LanguageModelConfig.load_model(model_path)
     callbacks.finished_loading_model()

-    def
-
-
-
-
-
-
-
+    def memory_per_batchsize(batch_size: int) -> int:
+        inference_config = InferenceConfig(
+            max_output_length=max_output_length,
+            padded_length=max_input_length,
+            num_top_logits_to_return=num_logits_per_token,
+            batch_size=batch_size,
+        )
+        return model.estimate_memory_consumption(inference_config=inference_config)
+
+    bs = estimate_batchsize_from_bytes(
+        memory_per_batchsize,
         mem,
-
+        lambda event: callbacks.estimating_batchsize(event.lo, event.hi),
     )

     callbacks.finished_estimating_batchsize(bs)
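estimate_batchsize_from_bytes lives in the new lm_helpers.py, which this diff does not include, so its exact algorithm is not visible here. From the call site (a per-batch-size memory model, a byte budget, and a callback receiving lo/hi bounds) it plausibly brackets the largest batch size whose predicted memory fits; the sketch below shows that shape, with SearchEvent as a stand-in for the real event type:

from collections.abc import Callable
from dataclasses import dataclass


@dataclass
class SearchEvent:  # stand-in for whatever event carries .lo/.hi to the callback
    lo: int
    hi: int | None


def estimate_batchsize_from_bytes_sketch(
    memory_per_batchsize: Callable[[int], int],
    budget_bytes: int,
    callback: Callable[[SearchEvent], None],
) -> int:
    # Phase 1: exponential growth until the memory model exceeds the budget.
    lo, candidate = 1, 2
    if memory_per_batchsize(lo) > budget_bytes:
        return 1  # even a single sequence does not fit; caller decides what to do
    while memory_per_batchsize(candidate) <= budget_bytes:
        lo, candidate = candidate, candidate * 2
        callback(SearchEvent(lo=lo, hi=None))
    hi = candidate
    # Phase 2: binary search; invariant: lo fits, hi does not.
    while hi - lo > 1:
        mid = (lo + hi) // 2
        lo, hi = (mid, hi) if memory_per_batchsize(mid) <= budget_bytes else (lo, mid)
        callback(SearchEvent(lo=lo, hi=hi))
    return lo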
@@ -329,7 +430,11 @@ def collect_traces(
     callbacks.finished_loading_model()

     callbacks.loading_dataset()
-
+    dataframe = shuffle_dataset(load_hf_parquet(dataset_path))
+    conversations = dataframe.get_column("conversation")
+    dataset = iter(
+        [HFMessage.from_dict(message).as_message() for message in conversation] for conversation in conversations
+    )
     dataset = chain([next(dataset)], dataset)  # iterator is lazy, force it to actually open the file
     callbacks.finished_loading_dataset()

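The chain([next(dataset)], dataset) idiom deserves a note: a generator does no work until its first element is pulled, so forcing one element up front surfaces file and schema errors during the loading_dataset phase instead of mid-run. A self-contained demonstration:

from itertools import chain


def rows():
    print("opening file")  # side effect runs only on the first next()
    yield from [1, 2, 3]


lazy = rows()
forced = chain([next(lazy)], lazy)  # "opening file" prints here; no element is lost
assert list(forced) == [1, 2, 3]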
@@ -427,3 +532,131 @@ def train(
     with open(output_path, "wb") as fd:
         fd.write(speculator.serialize())
     callbacks.finished_saving_speculator()
+
+
+@dataclass
+class GenerateRepliesCallbacks:
+    model_path: Path
+    dataset_path: Path
+    output_path: Path
+    max_vram: int | None
+    batch_size: int | None
+    total_rows: int
+
+    def loading_model(self) -> None:
+        pass
+
+    def finished_loading_model(self) -> None:
+        pass
+
+    def loading_dataset(self) -> None:
+        pass
+
+    def finished_loading_dataset(self) -> None:
+        pass
+
+    def estimating_batchsize(self, sequence_length: int, lo: int, hi: int | None) -> None:
+        pass
+
+    def batch_sizes_estimated(self) -> None:
+        pass
+
+    def batch_sizes_computed(self, event: BatchSizesComputedEvent) -> None:
+        pass
+
+    def generation_progress(self, rows_processed: int) -> None:
+        pass
+
+    def finished_generation(self) -> None:
+        pass
+
+
+def generate_replies(
+    model_path: Path,
+    dataset_path: Path,
+    output_path: Path,
+    max_vram: int | None,
+    max_output_length: int = 8192,
+    batch_size: int | None = None,
+    callbacks_type: Callable[
+        [
+            Path,
+            Path,
+            Path,
+            int | None,
+            int | None,
+            int,
+        ],
+        GenerateRepliesCallbacks,
+    ] = GenerateRepliesCallbacks,
+) -> None:
+    # figure out max_vram if neither batch_size nor max_vram is set
+    if max_vram is None and batch_size is None:
+        max_vram = get_default_device_bytes()
+        if max_vram is None:
+            raise ValueError(
+                "Unable to determine default device memory capacity; please specify either --vram-gb or --batch-size",
+            )
+
+    # Count rows without loading full dataset
+    total_rows = pl.scan_parquet(dataset_path).select(pl.len()).collect().item()
+
+    callbacks = callbacks_type(
+        model_path,
+        dataset_path,
+        output_path,
+        max_vram,
+        batch_size,
+        total_rows,
+    )
+
+    callbacks.loading_model()
+    model = LanguageModelConfig.load_model(model_path)
+    callbacks.finished_loading_model()
+
+    callbacks.loading_dataset()
+    dataframe = load_hf_parquet(dataset_path).collect()
+    conversations = dataframe.get_column("conversation")
+    dataset = iter(
+        [HFMessage.from_dict(message).as_message() for message in conversation] for conversation in conversations
+    )
+    try:
+        first_row = next(dataset)
+    except StopIteration:
+        callbacks.finished_loading_dataset()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        pl.DataFrame({"response": [], "chain_of_thought": []}).write_parquet(output_path)
+        return
+    dataset = chain([first_row], dataset)  # iterator is lazy, force it to actually open the file
+    callbacks.finished_loading_dataset()
+
+    inference_config = InferenceConfig(max_output_length=max_output_length, batch_size=batch_size)
+
+    callbacks.batch_sizes_estimated()
+
+    replies: list[tuple[int, AssistantMessage]] = []
+    for rows_processed, (idx, reply) in enumerate(
+        model.reply_many(
+            dataset,
+            inference_config=inference_config,
+            vram_bytes=max_vram,
+            batch_sizes_callback=callbacks.batch_sizes_computed,
+        ),
+    ):
+        replies.append((idx, reply))
+        callbacks.generation_progress(rows_processed)
+
+    # Sort by original index to restore input order
+    replies.sort(key=lambda x: x[0])
+
+    df = pl.DataFrame(
+        {
+            "response": [reply.response for _, reply in replies],
+            "chain_of_thought": [reply.chain_of_thought for _, reply in replies],
+        },
+    )
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    df.write_parquet(output_path)
+
+    callbacks.finished_generation()
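A hypothetical invocation of the new command; the paths are illustrative, and the input parquet is assumed to carry a "conversation" column of HuggingFace-style message dicts, matching the loader above:

from pathlib import Path

from lalamo.commands import generate_replies

generate_replies(
    model_path=Path("models/my-model"),        # directory produced by lalamo import/pull
    dataset_path=Path("data/prompts.parquet"),
    output_path=Path("out/replies.parquet"),
    max_vram=None,     # with batch_size also None, falls back to get_default_device_bytes()
    batch_size=None,
)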
{lalamo-0.6.5 → lalamo-0.6.6}/lalamo/common.py

@@ -1,11 +1,11 @@
-import
+import os
 from collections import defaultdict
-from collections.abc import
+from collections.abc import Mapping, Sequence
 from typing import cast

+import jax
 import jax.numpy as jnp
 from jax._src.api import ShapeDtypeStruct
-from jax.errors import JaxRuntimeError
 from jaxtyping import Array, DTypeLike

 from lalamo.utils import MapDictValues, MapSequence
@@ -16,7 +16,6 @@ __all__ = [
     "LalamoWarning",
     "ParameterPath",
     "ParameterTree",
-    "decrease_batchsize_on_oom",
     "dummy_array",
     "flatten_parameters",
     "require_array",
@@ -131,47 +130,27 @@ class ParameterPath(str):
         return ParameterPath(self + "." + str(other))


-def
-
-
-
-    """
-
-
-
-
-
-
-
-
-
-
-
-
-    """
-
-
-
-
-
-
-            yield result
-
-            # as soon as we yielded we are not allowed to retry anymore
-            # to make sure we don't ever miss/duplicate outputs
-            first_batch_completed = True
-            break
-        except JaxRuntimeError:
-            if first_batch_completed:
-                raise
-            # because OOM's sometimes generate stuff that won't be garbage collected,
-            # we need to be very aggressive with decreasing batchsize here
-            new_bs = max(int(0.7 * effective_batch_size - 1), 1)
-            if new_bs == 1 and effective_batch_size == 1:
-                raise
-            warnings.warn(
-                f"OOM detected. Reducing batch size {effective_batch_size} -> {new_bs}.",
-                LalamoWarning,
-                stacklevel=3,
-            )
-            effective_batch_size = new_bs
+def get_default_device_bytes() -> int | None:
+    dynamic_allocate = False
+
+    preallocate = os.getenv("XLA_PYTHON_CLIENT_PREALLOCATE", "")
+    dynamic_allocate |= preallocate.strip().lower() in {"0", "false", "no", "off"}
+
+    allocator = os.getenv("XLA_PYTHON_CLIENT_ALLOCATOR", "")
+    dynamic_allocate |= allocator.strip().lower() in {"platform", "cuda_malloc_async"}
+
+    if dynamic_allocate:
+        return None
+
+    memory_stats = jax.local_devices()[0].memory_stats()
+    if memory_stats is None or "bytes_limit" not in memory_stats:
+        return None
+
+    # 500mb is seemingly the usually observed overhead
+    memory_limit = memory_stats["bytes_limit"] - (500 * 1000 * 1000)
+
+    return memory_limit
+
+
+def get_usable_memory_from_bytes(limit_bytes: int) -> int:
+    return int(limit_bytes * 0.95)
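get_default_device_bytes only inspects the two XLA allocator environment variables and the first local device's memory_stats(), so its behavior is easy to probe. A small sketch; actual byte values depend on the device and backend:

import os

from lalamo.common import get_default_device_bytes

# With a dynamic allocator the budget is unknowable up front, so the helper bails out.
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"
assert get_default_device_bytes() is None

# With default preallocation it reports bytes_limit minus the ~500 MB safety margin,
# or None on backends that expose no memory_stats() (e.g. some CPU builds).
del os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"]
print(get_default_device_bytes())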
{lalamo-0.6.5 → lalamo-0.6.6}/lalamo/data/__init__.py

@@ -1,7 +1,8 @@
-from .huggingface_message import
+from .huggingface_message import load_hf_parquet, shuffle_dataset
 from .utils import get_prefixes_ending_in_user_message

 __all__ = [
     "get_prefixes_ending_in_user_message",
-    "
+    "load_hf_parquet",
+    "shuffle_dataset",
 ]
{lalamo-0.6.5 → lalamo-0.6.6}/lalamo/data/huggingface_message.py

@@ -1,4 +1,3 @@
-from collections.abc import Iterable
 from dataclasses import dataclass
 from pathlib import Path
 from typing import ClassVar, Self
@@ -30,10 +29,10 @@ class HFMessage:
         raise ValueError(f"Cannot convert {other} message")


-def
+def load_hf_parquet(path: Path | str) -> pl.LazyFrame:
     path = Path(path)
+    return pl.scan_parquet(path)

-    dataframe = pl.scan_parquet(path).collect()

-
-
+def shuffle_dataset(frame: pl.LazyFrame, seed: int = 1337) -> pl.DataFrame:
+    return frame.collect().sample(fraction=1.0, shuffle=True, seed=seed)
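A minimal round trip through the two new data helpers; the parquet path is hypothetical, and the "conversation" column name matches the call sites in commands.py:

from lalamo.data import load_hf_parquet, shuffle_dataset

frame = load_hf_parquet("data/conversations.parquet")  # lazy pl.LazyFrame, nothing is read yet
shuffled = shuffle_dataset(frame)  # collects, then shuffles deterministically (seed=1337)
print(shuffled.get_column("conversation")[0])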