lalamo 0.6.2__py3-none-any.whl → 0.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lalamo/__init__.py +6 -1
- lalamo/main.py +18 -4
- lalamo/speculator/estimator.py +32 -4
- lalamo/speculator/inference.py +6 -1
- {lalamo-0.6.2.dist-info → lalamo-0.6.4.dist-info}/METADATA +1 -1
- {lalamo-0.6.2.dist-info → lalamo-0.6.4.dist-info}/RECORD +10 -10
- {lalamo-0.6.2.dist-info → lalamo-0.6.4.dist-info}/WHEEL +0 -0
- {lalamo-0.6.2.dist-info → lalamo-0.6.4.dist-info}/entry_points.txt +0 -0
- {lalamo-0.6.2.dist-info → lalamo-0.6.4.dist-info}/licenses/LICENSE +0 -0
- {lalamo-0.6.2.dist-info → lalamo-0.6.4.dist-info}/top_level.txt +0 -0
lalamo/__init__.py
CHANGED
@@ -1,3 +1,8 @@
+import os
+
+# Must run before importing jax / tensorflow, this hides the XLA optimization logs
+os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")
+
 from lalamo.commands import (
     CollectTracesCallbacks,
     ConversionCallbacks,
@@ -27,7 +32,7 @@ from lalamo.speculator import (
     SpeculatorTrainingEvent,
 )
 
-__version__ = "0.6.2"
+__version__ = "0.6.4"
 
 __all__ = [
     "AssistantMessage",
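The new bootstrap only works because it executes before `jax`/`tensorflow` are first imported anywhere: XLA reads `TF_CPP_MIN_LOG_LEVEL` once at initialization, and `setdefault` preserves any value the user already exported. A minimal standalone sketch of the same trick (illustrative, not lalamo's code):

```python
# Sketch: suppress XLA's startup/optimization logs. The variable must be
# set before the first `import jax`, or it has no effect.
import os

os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")  # 3 = errors only

import jax  # noqa: E402 -- deliberately imported after the env var is set

print(jax.devices())  # device list prints without XLA compilation chatter
```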
lalamo/main.py
CHANGED
@@ -49,7 +49,10 @@ from lalamo.message_processor import UserMessage
 from lalamo.model_import import REPO_TO_MODEL, ModelSpec
 from lalamo.model_import.common import FileSpec
 from lalamo.models import ClassifierModelConfig, LanguageModelConfig
-from lalamo.speculator.estimator import
+from lalamo.speculator.estimator import (
+    get_default_device_bytes,
+    get_usable_memory_from_bytes,
+)
 from lalamo.speculator.ngram import NGramSpeculator
 from lalamo.speculator.utils import test_speculator
 
@@ -384,6 +387,7 @@ class CliTraceCallbacks(TraceCallbacks):
         self.stack.close()
         console.print(f"💾 Trace saved to [cyan]{self.output_path}[/cyan]")
 
+
 @app.command(help="Trace a model.")
 def trace(
     model_path: Annotated[
@@ -557,14 +561,24 @@ def estimate_batchsize(
     ] = None,
 ) -> None:
     if vram_gb is not None:
-
-
+        # note that in practice GPUs use GiB in their docs, e.g. H100 actually has 85GB of memory
+        mem_bytes = vram_gb * 1000 * 1000 * 1000
+    elif (mem_bytes := get_default_device_bytes()) is None:
         err_console.print("Cannot get the default device's memory stats, use --vram-gb")
         raise Exit(1)
 
+    usable_mem = get_usable_memory_from_bytes(mem_bytes)
+
     callbacks_type = CliEstimateBatchsizeCallbacks
 
-    _estimate_batchsize(
+    _estimate_batchsize(
+        model_path,
+        usable_mem,
+        max_input_length,
+        max_output_length,
+        num_logits_per_token,
+        callbacks_type,
+    )
 
 
 @dataclass
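In `estimate_batchsize`, an explicit `--vram-gb` now takes precedence, with auto-detection as the fallback and a clean exit when neither is available. A hedged sketch of that resolution order (`resolve_mem_bytes` is an illustrative name, not part of lalamo); the decimal factors matter because GPU specs quote GiB, so an "80 GB" H100 actually holds about 85.9 decimal GB:

```python
from lalamo.speculator.estimator import (
    get_default_device_bytes,
    get_usable_memory_from_bytes,
)


def resolve_mem_bytes(vram_gb: float | None) -> int:
    """Illustrative helper mirroring the CLI logic in the diff above."""
    if vram_gb is not None:
        return int(vram_gb * 1000 * 1000 * 1000)  # decimal GB -> bytes
    detected = get_default_device_bytes()
    if detected is None:
        raise SystemExit("Cannot get the default device's memory stats, use --vram-gb")
    return detected


usable = get_usable_memory_from_bytes(resolve_mem_bytes(24.0))
print(usable)  # ~22_800_000_000: 24 GB minus the 5% safety discount
```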
lalamo/speculator/estimator.py
CHANGED
@@ -1,5 +1,6 @@
 import functools
 import itertools
+import os
 from collections.abc import Callable
 from typing import NamedTuple
 
@@ -9,11 +10,38 @@ import jax.numpy as jnp
 from lalamo.models import LanguageModel
 
 
-def
+def get_default_device_bytes() -> int | None:
+    dynamic_allocate = False
+
+    preallocate = os.getenv("XLA_PYTHON_CLIENT_PREALLOCATE", "")
+    dynamic_allocate |= preallocate.strip().lower() in {"0", "false", "no", "off"}
+
+    allocator = os.getenv("XLA_PYTHON_CLIENT_ALLOCATOR", "")
+    dynamic_allocate |= allocator.strip().lower() in {"platform", "cuda_malloc_async"}
+
+    if dynamic_allocate:
+        return None
+
     memory_stats = jax.local_devices()[0].memory_stats()
     if memory_stats is None or "bytes_limit" not in memory_stats:
         return None
-
+
+    mem_fraction_raw = os.getenv("XLA_PYTHON_CLIENT_MEM_FRACTION", "")
+    try:
+        mem_fraction = float(mem_fraction_raw)
+    except ValueError:
+        mem_fraction = 0.75 # jax default https://docs.jax.dev/en/latest/gpu_memory_allocation.html
+
+    # 500mb is seemingly the usually observed overhead; this tries to match the actual capacity of the gpu
+    # so it should correspond to something you'd see in nvidia-smi
+    memory_limit = memory_stats["bytes_limit"] / min(mem_fraction, 1.0) + (500 * 1000 * 1000)
+
+    return get_usable_memory_from_bytes(memory_limit)
+
+
+def get_usable_memory_from_bytes(limit_bytes: int) -> int:
+    # JAX allocates a bit more than it needs, so we discount it by some safety factor
+    return int(limit_bytes * 0.95)
 
 
 def estimate_memory_from_batchsize(
@@ -30,14 +58,14 @@ def estimate_memory_from_batchsize(
             max_output_length=max_output_length,
             num_top_logits_to_return=num_logits_per_token,
         ),
-        backend="cpu", # cuda backend tries to allocate in .compile() and ooms
     )
     .lower(
         model,
         prompt_token_ids=jax.ShapeDtypeStruct((batch_size, max_input_length), jnp.int32),
         prompt_lengths_without_padding=jax.ShapeDtypeStruct((batch_size,), jnp.int32),
     )
-    .
+    # disables autotune, see https://guides.lw1.at/all-xla-options/#--xla_gpu_autotune_level
+    .compile(compiler_options={"xla_gpu_autotune_level": "0"})
     .memory_analysis()
 )
 
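Worked numbers for the `bytes_limit` inversion above, assuming an 80 GiB card and JAX's default 75% preallocation (illustrative arithmetic, not measured output):

```python
# With XLA_PYTHON_CLIENT_MEM_FRACTION unset, JAX preallocates 75% of the
# device, so the reported bytes_limit on an 80 GiB card is roughly:
bytes_limit = int(80 * 1024**3 * 0.75)  # 64_424_509_440

# Inverting the fraction and adding the ~500 MB runtime overhead recovers
# something close to the nvidia-smi capacity:
mem_fraction = 0.75
physical = bytes_limit / min(mem_fraction, 1.0) + 500 * 1000 * 1000
print(f"{physical / 1e9:.1f} GB")  # 86.4 GB, vs ~85.9 GB physically on the card

# The 5% safety discount then yields the figure fed to the batch-size search:
usable = int(physical * 0.95)
print(f"{usable / 1e9:.1f} GB")  # 82.1 GB
```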
lalamo/speculator/inference.py
CHANGED
@@ -40,7 +40,12 @@ def inference_collect_traces(
         prompt_token_ids=jax.ShapeDtypeStruct((batch_size, max_input_length), jnp.int32),
         prompt_lengths_without_padding=jax.ShapeDtypeStruct((batch_size,), jnp.int32),
     )
-    .
+    # the autotune levels are (according to https://guides.lw1.at/all-xla-options/#--xla_gpu_autotune_level)
+    # 0 - no autotune, gpu shouldn't be touched
+    # 1 - basic level, gpu should be touched veeery little
+    # 2,3 - gpu touched more and more
+    # 4 (default) - gpu might allocate more memory than the run would require!
+    .compile(compiler_options={"xla_gpu_autotune_level": "2"})
 )
 
 prefixes = chain.from_iterable(map(get_prefixes_ending_in_user_message, conversations))
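Both call sites use JAX's ahead-of-time path: `.lower()` traces against `ShapeDtypeStruct` placeholders without allocating real input buffers, and `.compile(compiler_options=...)` pins the autotune level per call site (0 for the memory estimate, 2 for trace collection). A self-contained sketch of the pattern; `decode_step` and the shapes are stand-ins, not lalamo's model:

```python
import jax
import jax.numpy as jnp


def decode_step(token_ids: jax.Array) -> jax.Array:
    return token_ids * 2  # placeholder for the real model forward pass


# Trace against abstract shapes only; no device buffers are created here.
lowered = jax.jit(decode_step).lower(
    jax.ShapeDtypeStruct((8, 128), jnp.int32),  # (batch, max_input_length)
)

# Level 0 keeps autotuning (and its extra GPU allocations) out of the
# picture, which is what the memory estimator wants.
compiled = lowered.compile(compiler_options={"xla_gpu_autotune_level": "0"})
print(compiled.memory_analysis())
```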
{lalamo-0.6.2.dist-info → lalamo-0.6.4.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
-lalamo/__init__.py,sha256=
+lalamo/__init__.py,sha256=RDkf5Hhglc-fLZ-CmI4R-th6UgJKYmN-1hdbCzTiVx8,1532
 lalamo/commands.py,sha256=zXyyrLTHhP9wouwtpX4RUZeEF6No-_9ee-y_GWGhw7k,10972
 lalamo/common.py,sha256=WaNJx20eUX4CBF50aym9lniGAiX-SzBJzDzO5Jh6zXA,4312
-lalamo/main.py,sha256=
+lalamo/main.py,sha256=f1zHYQpX_OndAguOE0wqIOkzjzUolUC7w3_1ndtMC4Y,27655
 lalamo/message_processor.py,sha256=PMKte9YijT3h9N7DjTNp8H4V45A_qlDqJaubqFevLX8,5924
 lalamo/quantization.py,sha256=8o6ryIZLzzDYQuvBTboPfaVVdfijAKGpTxOcg3GKVD8,2752
 lalamo/registry_abc.py,sha256=qTikqviqqeseNzkjqoyQvL4dEWJYWzN0rI05T-JNTmo,2187
@@ -83,13 +83,13 @@ lalamo/modules/token_mixers/state/mamba_state.py,sha256=LHzJvNE6MkB7nrsZSNto6pxb
 lalamo/modules/token_mixers/state/short_conv_state.py,sha256=osjcDHoeFWQaUoOROzeJe8F1qC8rvqunimGD4CuIDHo,895
 lalamo/speculator/__init__.py,sha256=9-tmZcbCom_lIGpJYn6xLlnEahFLFidpqmgkafmu--k,456
 lalamo/speculator/common.py,sha256=PudF_gkpe5_nQ-57sAC-foE1xCy_H2Axh5KwRoA86lo,587
-lalamo/speculator/estimator.py,sha256=
-lalamo/speculator/inference.py,sha256=
+lalamo/speculator/estimator.py,sha256=6T8NdmDdhvP0BPg7vdkB_pxAkfgpu4WktNpUHtFuyiE,3833
+lalamo/speculator/inference.py,sha256=uEv33Qqcpa2xqEKdIzmPzkAzRsZOlb8TPeEG6TP6fjo,4071
 lalamo/speculator/ngram.py,sha256=2eqInIieJPaQHCvIfnCIDtwMa8PGEtiND_NkG7plE34,5899
 lalamo/speculator/utils.py,sha256=0wZoMMIzzk0Q-3zq5H5f-JBplePNHxywndkrNtOJOyo,1697
-lalamo-0.6.
-lalamo-0.6.
-lalamo-0.6.
-lalamo-0.6.
-lalamo-0.6.
-lalamo-0.6.
+lalamo-0.6.4.dist-info/licenses/LICENSE,sha256=diHRfjSEJHD1nnEeMIfMRCjR3UERf8bT3eseD6b1ayA,1072
+lalamo-0.6.4.dist-info/METADATA,sha256=oS1EAJBl3jBtvZU0Rd-UcjnL2Trngree7Syn2L16Rx8,3112
+lalamo-0.6.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+lalamo-0.6.4.dist-info/entry_points.txt,sha256=qli7qTfnBk5WP10rOGXXEckHMtt-atJMDWd8jN89Uks,43
+lalamo-0.6.4.dist-info/top_level.txt,sha256=VHvWL5JN5XRG36NsN_MieJ7EwRihEOrEjyDaTdFJ-aI,7
+lalamo-0.6.4.dist-info/RECORD,,