PyPI - lalamo - Versions diffs - 0.6.4__py3-none-any.whl → 0.6.5__py3-none-any.whl - Mend

lalamo 0.6.4__py3-none-any.whl → 0.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

lalamo/__init__.py +1 -1
lalamo/common.py +55 -1
lalamo/speculator/estimator.py +1 -1
lalamo/speculator/inference.py +65 -54
{lalamo-0.6.4.dist-info → lalamo-0.6.5.dist-info}/METADATA +1 -1
{lalamo-0.6.4.dist-info → lalamo-0.6.5.dist-info}/RECORD +10 -10
{lalamo-0.6.4.dist-info → lalamo-0.6.5.dist-info}/WHEEL +0 -0
{lalamo-0.6.4.dist-info → lalamo-0.6.5.dist-info}/entry_points.txt +0 -0
{lalamo-0.6.4.dist-info → lalamo-0.6.5.dist-info}/licenses/LICENSE +0 -0
{lalamo-0.6.4.dist-info → lalamo-0.6.5.dist-info}/top_level.txt +0 -0

lalamo/__init__.py CHANGED Viewed

@@ -32,7 +32,7 @@ from lalamo.speculator import (
     SpeculatorTrainingEvent,
 )
-__version__ = "0.6.4"
+__version__ = "0.6.5"
 __all__ = [
     "AssistantMessage",

lalamo/common.py CHANGED Viewed

@@ -1,9 +1,11 @@
+import warnings
 from collections import defaultdict
-from collections.abc import Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from typing import cast
 import jax.numpy as jnp
 from jax._src.api import ShapeDtypeStruct
+from jax.errors import JaxRuntimeError
 from jaxtyping import Array, DTypeLike
 from lalamo.utils import MapDictValues, MapSequence
@@ -11,8 +13,10 @@ from lalamo.utils import MapDictValues, MapSequence
 __all__ = [
     "DEFAULT_PRECISION",
     "ArrayLike",
+    "LalamoWarning",
     "ParameterPath",
     "ParameterTree",
+    "decrease_batchsize_on_oom",
     "dummy_array",
     "flatten_parameters",
     "require_array",
@@ -23,6 +27,10 @@ __all__ = [
 DEFAULT_PRECISION: DTypeLike = jnp.bfloat16
+class LalamoWarning(UserWarning):
+    """Custom warning class for Lalamo-specific warnings."""
 type ArrayLike = Array | ShapeDtypeStruct
@@ -121,3 +129,49 @@ class ParameterPath(str):
         if not self:
             return ParameterPath(str(other))
         return ParameterPath(self + "." + str(other))
+def decrease_batchsize_on_oom[T](
+    fn: Callable[[int], Iterable[T]],
+    starting_batch_size: int,
+) -> Iterable[T]:
+    """
+    Execute fn(batch_size) with automatic batch size reduction on OOM.
+    Only reduces batch size if OOM happened on the first batch.
+    Args:
+        fn: Function that takes batch_size and returns an iterable
+        starting_batch_size: Initial batch size to try
+    Yields:
+        Results from fn(batch_size)
+    Raises:
+        JaxRuntimeError: If OOM occurs after first batch completes or at batch_size=1
+    """
+    first_batch_completed = False
+    effective_batch_size = starting_batch_size
+    while True:
+        try:
+            for result in fn(effective_batch_size):
+                yield result
+                # as soon as we yielded we are not allowed to retry anymore
+                # to make sure we don't ever miss/duplicate outputs
+                first_batch_completed = True
+            break
+        except JaxRuntimeError:
+            if first_batch_completed:
+                raise
+            # because OOM's sometimes generate stuff that won't be garbage collected,
+            # we need to be very aggressive with decreasing batchsize here
+            new_bs = max(int(0.7 * effective_batch_size - 1), 1)
+            if new_bs == 1 and effective_batch_size == 1:
+                raise
+            warnings.warn(
+                f"OOM detected. Reducing batch size {effective_batch_size} -> {new_bs}.",
+                LalamoWarning,
+                stacklevel=3,
+            )
+            effective_batch_size = new_bs

lalamo/speculator/estimator.py CHANGED Viewed

@@ -41,7 +41,7 @@ def get_default_device_bytes() -> int | None:
 def get_usable_memory_from_bytes(limit_bytes: int) -> int:
     # JAX allocates a bit more than it needs, so we discount it by some safety factor
-    return int(limit_bytes * 0.95)
+    return int(limit_bytes * 0.93)
 def estimate_memory_from_batchsize(

lalamo/speculator/inference.py CHANGED Viewed

@@ -1,11 +1,13 @@
 import functools
 from collections.abc import Callable, Iterable
-from itertools import batched, chain
+from itertools import batched, chain, islice
 from typing import NamedTuple
 import jax
 import jax.numpy as jnp
+from jax._src.stages import Compiled
+from lalamo.common import decrease_batchsize_on_oom
 from lalamo.data.lalamo_completions import LalamoCompletion
 from lalamo.data.utils import get_prefixes_ending_in_user_message
 from lalamo.message_processor import Message
@@ -27,75 +29,84 @@ def inference_collect_traces(
     tokens_to_generate: int | None = None,
     progress_callback: Callable[[CollectTracesEvent], None] | None = None,
 ) -> Iterable[LalamoCompletion]:
-    generate_tokens_compiled = (
-        jax.jit(
-            functools.partial(
-                LanguageModel.generate_tokens,
-                max_output_length=max_output_length,
-                num_top_logits_to_return=num_top_logits_to_collect,
-            ),
+    def make_generate_tokens_compiled(batch_size: int) -> Compiled:
+        return (
+            jax.jit(
+                functools.partial(
+                    LanguageModel.generate_tokens,
+                    max_output_length=max_output_length,
+                    num_top_logits_to_return=num_top_logits_to_collect,
+                ),
+            )
+            .lower(
+                model,
+                prompt_token_ids=jax.ShapeDtypeStruct((batch_size, max_input_length), jnp.int32),
+                prompt_lengths_without_padding=jax.ShapeDtypeStruct((batch_size,), jnp.int32),
+            )
+            # the autotune levels are (according to https://guides.lw1.at/all-xla-options/#--xla_gpu_autotune_level)
+            # 0 - no autotune, gpu shouldn't be touched
+            # 1 - basic level, gpu should be touched veeery little
+            # 2,3 - gpu touched more and more
+            # 4 (default) - gpu might allocate more memory than the run would require!
+            .compile(compiler_options={"xla_gpu_autotune_level": "0"})
         )
-        .lower(
-            model,
-            prompt_token_ids=jax.ShapeDtypeStruct((batch_size, max_input_length), jnp.int32),
-            prompt_lengths_without_padding=jax.ShapeDtypeStruct((batch_size,), jnp.int32),
-        )
-        # the autotune levels are (according to https://guides.lw1.at/all-xla-options/#--xla_gpu_autotune_level)
-        # 0 - no autotune, gpu shouldn't be touched
-        # 1 - basic level, gpu should be touched veeery little
-        # 2,3 - gpu touched more and more
-        # 4 (default) - gpu might allocate more memory than the run would require!
-        .compile(compiler_options={"xla_gpu_autotune_level": "2"})
-    )
     prefixes = chain.from_iterable(map(get_prefixes_ending_in_user_message, conversations))
     tokenized_prefixes = map(model.message_processor.tokenize_request, prefixes)
     filtered_prefixes = filter(lambda conv: len(conv) <= max_input_length, tokenized_prefixes)
-    tokens_generated, sequences_processed = 0, 0
+    test_batch = list(islice(filtered_prefixes, batch_size))
-    for real_batch in batched(filtered_prefixes, n=batch_size):
-        batch_padding = batch_size - len(real_batch)
-        batch = (*real_batch, *(([0],) * batch_padding))
+    def collect_traces_body(batch_size: int) -> Iterable[LalamoCompletion]:
+        tokens_generated, sequences_processed = 0, 0
+        generate_tokens_compiled = make_generate_tokens_compiled(batch_size)
+        for real_batch in batched(chain(test_batch, filtered_prefixes), n=batch_size):
+            batch_padding = batch_size - len(real_batch)
+            batch = (*real_batch, *(([0],) * batch_padding))
-        length_without_padding = jnp.array(list(map(len, batch)))
+            length_without_padding = jnp.array(list(map(len, batch)))
-        padded = jnp.array(
-            [jnp.pad(jnp.array(tokens), (0, max_input_length - len(tokens)), constant_values=0) for tokens in batch],
-        )
+            padded = jnp.array(
+                [
+                    jnp.pad(jnp.array(tokens), (0, max_input_length - len(tokens)), constant_values=0)
+                    for tokens in batch
+                ],
+            )
-        generated = generate_tokens_compiled(
-            model,
-            prompt_token_ids=padded,
-            prompt_lengths_without_padding=length_without_padding,
-        )
+            generated = generate_tokens_compiled(
+                model,
+                prompt_token_ids=padded,
+                prompt_lengths_without_padding=length_without_padding,
+            )
-        assert generated.top_k_token_ids is not None and generated.top_k_token_logits is not None
+            assert generated.top_k_token_ids is not None and generated.top_k_token_logits is not None
-        for conv_idx in range(len(real_batch)):
-            token_ids = generated.token_ids[conv_idx].tolist()
-            seqlen = next((i + 1 for i, t in enumerate(token_ids) if t in model.stop_token_ids), len(token_ids))
-            if tokens_to_generate is not None:
-                seqlen = min(seqlen, tokens_to_generate - tokens_generated)
-            tokens_generated += seqlen
-            sequences_processed += 1
+            for conv_idx in range(len(real_batch)):
+                token_ids = generated.token_ids[conv_idx].tolist()
+                seqlen = next((i + 1 for i, t in enumerate(token_ids) if t in model.stop_token_ids), len(token_ids))
+                if tokens_to_generate is not None:
+                    seqlen = min(seqlen, tokens_to_generate - tokens_generated)
+                tokens_generated += seqlen
+                sequences_processed += 1
-            token_ids = token_ids[:seqlen]
-            token_logits_ids = generated.top_k_token_ids[conv_idx, : len(token_ids)].tolist()
-            token_logits_values = generated.top_k_token_logits[conv_idx, : len(token_ids)].tolist()
-            token_logits = [
-                dict(zip(keys, values, strict=True))
-                for keys, values in zip(token_logits_ids, token_logits_values, strict=True)
-            ]
+                token_ids = token_ids[:seqlen]
+                token_logits_ids = generated.top_k_token_ids[conv_idx, : len(token_ids)].tolist()
+                token_logits_values = generated.top_k_token_logits[conv_idx, : len(token_ids)].tolist()
+                token_logits = [
+                    dict(zip(keys, values, strict=True))
+                    for keys, values in zip(token_logits_ids, token_logits_values, strict=True)
+                ]
-            yield LalamoCompletion(batch[conv_idx], token_ids, token_logits)
+                yield LalamoCompletion(batch[conv_idx], token_ids, token_logits)
+                if tokens_to_generate is not None and tokens_generated >= tokens_to_generate:
+                    break
+            if progress_callback is not None:
+                progress_callback(CollectTracesEvent(sequences_processed, tokens_generated))
             if tokens_to_generate is not None and tokens_generated >= tokens_to_generate:
                 break
-        if progress_callback is not None:
-            progress_callback(CollectTracesEvent(sequences_processed, tokens_generated))
-        if tokens_to_generate is not None and tokens_generated >= tokens_to_generate:
-            break
+    yield from decrease_batchsize_on_oom(collect_traces_body, batch_size)

{lalamo-0.6.4.dist-info → lalamo-0.6.5.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lalamo
-Version: 0.6.4
+Version: 0.6.5
 Summary: JAX library for optimization and export of models for use with the UZU inference engine.
 Requires-Python: <4,>=3.12
 Description-Content-Type: text/markdown

{lalamo-0.6.4.dist-info → lalamo-0.6.5.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
-lalamo/__init__.py,sha256=RDkf5Hhglc-fLZ-CmI4R-th6UgJKYmN-1hdbCzTiVx8,1532
+lalamo/__init__.py,sha256=RpKc5sKIQHI8tPVwzH7lIJJWE7tJy6FZauEhabEp2Hg,1532
 lalamo/commands.py,sha256=zXyyrLTHhP9wouwtpX4RUZeEF6No-_9ee-y_GWGhw7k,10972
-lalamo/common.py,sha256=WaNJx20eUX4CBF50aym9lniGAiX-SzBJzDzO5Jh6zXA,4312
+lalamo/common.py,sha256=ddGIPlFCgo6Q683v8uP8G2dh8nsCJe9woZL8A_7_Rt4,6124
 lalamo/main.py,sha256=f1zHYQpX_OndAguOE0wqIOkzjzUolUC7w3_1ndtMC4Y,27655
 lalamo/message_processor.py,sha256=PMKte9YijT3h9N7DjTNp8H4V45A_qlDqJaubqFevLX8,5924
 lalamo/quantization.py,sha256=8o6ryIZLzzDYQuvBTboPfaVVdfijAKGpTxOcg3GKVD8,2752
@@ -83,13 +83,13 @@ lalamo/modules/token_mixers/state/mamba_state.py,sha256=LHzJvNE6MkB7nrsZSNto6pxb
 lalamo/modules/token_mixers/state/short_conv_state.py,sha256=osjcDHoeFWQaUoOROzeJe8F1qC8rvqunimGD4CuIDHo,895
 lalamo/speculator/__init__.py,sha256=9-tmZcbCom_lIGpJYn6xLlnEahFLFidpqmgkafmu--k,456
 lalamo/speculator/common.py,sha256=PudF_gkpe5_nQ-57sAC-foE1xCy_H2Axh5KwRoA86lo,587
-lalamo/speculator/estimator.py,sha256=6T8NdmDdhvP0BPg7vdkB_pxAkfgpu4WktNpUHtFuyiE,3833
-lalamo/speculator/inference.py,sha256=uEv33Qqcpa2xqEKdIzmPzkAzRsZOlb8TPeEG6TP6fjo,4071
+lalamo/speculator/estimator.py,sha256=WPG3rxKq4iLro8QwcePF766ageexHc17ANiF5rKAlKU,3833
+lalamo/speculator/inference.py,sha256=47TUiLV0Dkk3dbf1-IkdlWbHCICFw6IDwKZ73FYQUQo,4802
 lalamo/speculator/ngram.py,sha256=2eqInIieJPaQHCvIfnCIDtwMa8PGEtiND_NkG7plE34,5899
 lalamo/speculator/utils.py,sha256=0wZoMMIzzk0Q-3zq5H5f-JBplePNHxywndkrNtOJOyo,1697
-lalamo-0.6.4.dist-info/licenses/LICENSE,sha256=diHRfjSEJHD1nnEeMIfMRCjR3UERf8bT3eseD6b1ayA,1072
-lalamo-0.6.4.dist-info/METADATA,sha256=oS1EAJBl3jBtvZU0Rd-UcjnL2Trngree7Syn2L16Rx8,3112
-lalamo-0.6.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-lalamo-0.6.4.dist-info/entry_points.txt,sha256=qli7qTfnBk5WP10rOGXXEckHMtt-atJMDWd8jN89Uks,43
-lalamo-0.6.4.dist-info/top_level.txt,sha256=VHvWL5JN5XRG36NsN_MieJ7EwRihEOrEjyDaTdFJ-aI,7
-lalamo-0.6.4.dist-info/RECORD,,
+lalamo-0.6.5.dist-info/licenses/LICENSE,sha256=diHRfjSEJHD1nnEeMIfMRCjR3UERf8bT3eseD6b1ayA,1072
+lalamo-0.6.5.dist-info/METADATA,sha256=EWI8eHaPSj7tXrW7xW9BPNpeDRjboNNvtGbq3hRELzU,3112
+lalamo-0.6.5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+lalamo-0.6.5.dist-info/entry_points.txt,sha256=qli7qTfnBk5WP10rOGXXEckHMtt-atJMDWd8jN89Uks,43
+lalamo-0.6.5.dist-info/top_level.txt,sha256=VHvWL5JN5XRG36NsN_MieJ7EwRihEOrEjyDaTdFJ-aI,7
+lalamo-0.6.5.dist-info/RECORD,,

{lalamo-0.6.4.dist-info → lalamo-0.6.5.dist-info}/WHEEL RENAMED Viewed

File without changes

{lalamo-0.6.4.dist-info → lalamo-0.6.5.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{lalamo-0.6.4.dist-info → lalamo-0.6.5.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{lalamo-0.6.4.dist-info → lalamo-0.6.5.dist-info}/top_level.txt RENAMED Viewed

File without changes

lalamo 0.6.4__py3-none-any.whl → 0.6.5__py3-none-any.whl

lalamo 0.6.4__py3-none-any.whl → 0.6.5__py3-none-any.whl