lalamo 0.6.5__py3-none-any.whl → 0.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,12 @@
- import functools
  from collections.abc import Callable, Iterable
- from itertools import batched, chain, islice
+ from itertools import chain
  from typing import NamedTuple
 
- import jax
- import jax.numpy as jnp
- from jax._src.stages import Compiled
-
- from lalamo.common import decrease_batchsize_on_oom
  from lalamo.data.lalamo_completions import LalamoCompletion
  from lalamo.data.utils import get_prefixes_ending_in_user_message
  from lalamo.message_processor import Message
  from lalamo.models import LanguageModel
+ from lalamo.models.common import InferenceConfig
 
 
  class CollectTracesEvent(NamedTuple):
@@ -29,84 +24,52 @@ def inference_collect_traces(
      tokens_to_generate: int | None = None,
      progress_callback: Callable[[CollectTracesEvent], None] | None = None,
  ) -> Iterable[LalamoCompletion]:
-     def make_generate_tokens_compiled(batch_size: int) -> Compiled:
-         return (
-             jax.jit(
-                 functools.partial(
-                     LanguageModel.generate_tokens,
-                     max_output_length=max_output_length,
-                     num_top_logits_to_return=num_top_logits_to_collect,
-                 ),
-             )
-             .lower(
-                 model,
-                 prompt_token_ids=jax.ShapeDtypeStruct((batch_size, max_input_length), jnp.int32),
-                 prompt_lengths_without_padding=jax.ShapeDtypeStruct((batch_size,), jnp.int32),
-             )
-             # the autotune levels are (according to https://guides.lw1.at/all-xla-options/#--xla_gpu_autotune_level)
-             # 0 - no autotune, gpu shouldn't be touched
-             # 1 - basic level, gpu should be touched veeery little
-             # 2,3 - gpu touched more and more
-             # 4 (default) - gpu might allocate more memory than the run would require!
-             .compile(compiler_options={"xla_gpu_autotune_level": "0"})
-         )
-
      prefixes = chain.from_iterable(map(get_prefixes_ending_in_user_message, conversations))
-
      tokenized_prefixes = map(model.message_processor.tokenize_request, prefixes)
      filtered_prefixes = filter(lambda conv: len(conv) <= max_input_length, tokenized_prefixes)
+     filtered_prefixes = list(filtered_prefixes)  # eagerly materialize the prompts into RAM
+
+     config = InferenceConfig(
+         max_output_length=max_output_length,
+         num_top_logits_to_return=num_top_logits_to_collect,
+         padded_length=max_input_length,
+         batch_size=batch_size,
+     )
+
+     tokens_generated = 0
+
+     for idx, generated in enumerate(
+         model.generate_tokens_many(
+             filtered_prefixes,
+             inference_config=config,
+         ),
+     ):
+         token_ids = generated.token_ids.tolist()
+         seqlen = next(
+             (i + 1 for i, t in enumerate(token_ids) if t in model.stop_token_ids),
+             len(token_ids),
+         )
 
-     test_batch = list(islice(filtered_prefixes, batch_size))
-
-     def collect_traces_body(batch_size: int) -> Iterable[LalamoCompletion]:
-         tokens_generated, sequences_processed = 0, 0
-         generate_tokens_compiled = make_generate_tokens_compiled(batch_size)
-         for real_batch in batched(chain(test_batch, filtered_prefixes), n=batch_size):
-             batch_padding = batch_size - len(real_batch)
-             batch = (*real_batch, *(([0],) * batch_padding))
-
-             length_without_padding = jnp.array(list(map(len, batch)))
-
-             padded = jnp.array(
-                 [
-                     jnp.pad(jnp.array(tokens), (0, max_input_length - len(tokens)), constant_values=0)
-                     for tokens in batch
-                 ],
-             )
-
-             generated = generate_tokens_compiled(
-                 model,
-                 prompt_token_ids=padded,
-                 prompt_lengths_without_padding=length_without_padding,
-             )
-
-             assert generated.top_k_token_ids is not None and generated.top_k_token_logits is not None
-
-             for conv_idx in range(len(real_batch)):
-                 token_ids = generated.token_ids[conv_idx].tolist()
-                 seqlen = next((i + 1 for i, t in enumerate(token_ids) if t in model.stop_token_ids), len(token_ids))
-                 if tokens_to_generate is not None:
-                     seqlen = min(seqlen, tokens_to_generate - tokens_generated)
-                 tokens_generated += seqlen
-                 sequences_processed += 1
+         if tokens_to_generate is not None:
+             seqlen = min(seqlen, tokens_to_generate - tokens_generated)
 
-                 token_ids = token_ids[:seqlen]
-                 token_logits_ids = generated.top_k_token_ids[conv_idx, : len(token_ids)].tolist()
-                 token_logits_values = generated.top_k_token_logits[conv_idx, : len(token_ids)].tolist()
-                 token_logits = [
-                     dict(zip(keys, values, strict=True))
-                     for keys, values in zip(token_logits_ids, token_logits_values, strict=True)
-                 ]
+         tokens_generated += seqlen
 
-                 yield LalamoCompletion(batch[conv_idx], token_ids, token_logits)
+         token_ids = token_ids[:seqlen]
 
-                 if tokens_to_generate is not None and tokens_generated >= tokens_to_generate:
-                     break
+         assert generated.top_k_token_ids is not None and generated.top_k_token_logits is not None
+         token_logits_ids = generated.top_k_token_ids[:seqlen].tolist()
+         token_logits_values = generated.top_k_token_logits[:seqlen].tolist()
+         token_logits = [
+             dict(zip(keys, values, strict=True))
+             for keys, values in zip(token_logits_ids, token_logits_values, strict=True)
+         ]
 
-             if progress_callback is not None:
-                 progress_callback(CollectTracesEvent(sequences_processed, tokens_generated))
+         # We need the original prompt tokens, so look them up in filtered_prefixes
+         yield LalamoCompletion(filtered_prefixes[idx], token_ids, token_logits)
 
-             if tokens_to_generate is not None and tokens_generated >= tokens_to_generate:
-                 break
+         if progress_callback is not None:
+             progress_callback(CollectTracesEvent(idx + 1, tokens_generated))
 
-     yield from decrease_batchsize_on_oom(collect_traces_body, batch_size)
+         if tokens_to_generate is not None and tokens_generated >= tokens_to_generate:
+             break
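The refactor above moves the hand-rolled padding, batching, and XLA compilation out of `inference_collect_traces` and into `model.generate_tokens_many`, which consumes prompts in order and yields one generated sequence per prompt. A minimal sketch of the new call shape, assuming a loaded `LanguageModel` and pre-tokenized prompts; `InferenceConfig`, `generate_tokens_many`, and the config fields appear in this diff, while the driver function itself is illustrative:

```python
from collections.abc import Iterable

from lalamo.models import LanguageModel
from lalamo.models.common import InferenceConfig


def stream_completions(model: LanguageModel, prompts: list[list[int]]) -> Iterable[tuple[list[int], list[int]]]:
    # Static shapes live in the config, so the model can compile once and
    # reuse the executable across internal batches. Values are examples.
    config = InferenceConfig(
        max_output_length=256,        # tokens to generate per sequence
        num_top_logits_to_return=16,  # top-k logits collected per token
        padded_length=1024,           # prompts are padded to this length
        batch_size=8,                 # batching is handled inside the model
    )
    # generate_tokens_many yields one result per prompt, in input order,
    # so the caller can index back into its own inputs.
    for idx, generated in enumerate(model.generate_tokens_many(prompts, inference_config=config)):
        yield prompts[idx], generated.token_ids.tolist()
```

Because results stream back in input order, the caller recovers each original prompt by index, which is exactly how the new code pairs `filtered_prefixes[idx]` with its completion.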
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lalamo
- Version: 0.6.5
+ Version: 0.6.6
  Summary: JAX library for optimization and export of models for use with the UZU inference engine.
  Requires-Python: <4,>=3.12
  Description-Content-Type: text/markdown
@@ -1,21 +1,22 @@
- lalamo/__init__.py,sha256=RpKc5sKIQHI8tPVwzH7lIJJWE7tJy6FZauEhabEp2Hg,1532
- lalamo/commands.py,sha256=zXyyrLTHhP9wouwtpX4RUZeEF6No-_9ee-y_GWGhw7k,10972
- lalamo/common.py,sha256=ddGIPlFCgo6Q683v8uP8G2dh8nsCJe9woZL8A_7_Rt4,6124
- lalamo/main.py,sha256=f1zHYQpX_OndAguOE0wqIOkzjzUolUC7w3_1ndtMC4Y,27655
- lalamo/message_processor.py,sha256=PMKte9YijT3h9N7DjTNp8H4V45A_qlDqJaubqFevLX8,5924
+ lalamo/__init__.py,sha256=FFdoG3pwkVS9Xi1X_aCiByG9QyzSP_NufH2lIhj60EY,1532
+ lalamo/commands.py,sha256=3zUZE2bg39XFFGh5PQ-L2oIs73GPsp0EkFinMZQNHro,18097
+ lalamo/common.py,sha256=odAWGfzRRfsKJPtVekOLTlN3SrHcMerPzMk4BfVjn9I,5262
+ lalamo/main.py,sha256=V6VwWlo7QO5RH5HpsbpxQgDeOGFv2Y4abhJOjHFw2MA,37153
+ lalamo/message_processor.py,sha256=gf-CiidoRp1XmLdy8jkv06Gg0Nqe_DAlpYObOF9JfpA,6490
  lalamo/quantization.py,sha256=8o6ryIZLzzDYQuvBTboPfaVVdfijAKGpTxOcg3GKVD8,2752
  lalamo/registry_abc.py,sha256=qTikqviqqeseNzkjqoyQvL4dEWJYWzN0rI05T-JNTmo,2187
  lalamo/safetensors.py,sha256=kUiTSgx2zhfD1hxV_AA1DOLaKAKzjRd_vOYZCFf0em0,3048
  lalamo/sampling.py,sha256=GE6Av7zS-pr5Bg7FtOivRce7I0JIYuNYqfqsRe-yjQk,3867
  lalamo/utils.py,sha256=c88IP110gHZJ6hYDq7p36A9u-vLRM_YdavFom56gsNQ,4111
- lalamo/data/__init__.py,sha256=exfhBLxHrg7BWutM0tAln5QuIWlNQmOhaG2noFYxfPI,189
- lalamo/data/huggingface_message.py,sha256=-7lN9eIcETQzt1Pnx3d4d8p3_I7WYMNf4mp1P91N7fI,1115
+ lalamo/data/__init__.py,sha256=QH23n37CWLcY69oLVE8gNNr6aJ57G0D_ZsO8eYs7-Jk,225
+ lalamo/data/huggingface_message.py,sha256=8oTCxL_IOHRCVgQneRv52sgJNprsiAIPvps5nBu6LWo,1037
  lalamo/data/lalamo_completions.py,sha256=U_m3UNSJASUFz3rJq_taZOtL_U4B8Oj-ndkTF-JH-v4,1509
  lalamo/data/utils.py,sha256=B96gLaULyStKYuR8wjFdTpFc6YIDC8EEvGh1eiMe_Ec,338
  lalamo/model_import/__init__.py,sha256=Z8pS9rbKKx1QgUy7KZtHxiNWlZhII3mdovT9d37vAxg,168
- lalamo/model_import/common.py,sha256=MIbvK3mxgrDSXea6jujvCOu9Jjyip6MXeTsJjNTBJAU,12325
+ lalamo/model_import/common.py,sha256=evZeeizFev2i7Whd9X7sgUQV8v5apjfmy_BhshnFbyo,13011
  lalamo/model_import/huggingface_generation_config.py,sha256=xicv_kJOfIGlz4gi5fRFIkiAZ9_QRDLRtW8nKMm5tVU,2022
  lalamo/model_import/huggingface_tokenizer_config.py,sha256=xvwdmio7b9nhn2H3uMBVligiYj58JaCFCvHY3-8dBvM,2502
+ lalamo/model_import/remote_registry.py,sha256=4VjZSwlYMqflMfSSPi7-GSb9tmTLMZELzXoJJ3Tsx5s,1045
  lalamo/model_import/decoder_configs/__init__.py,sha256=YvlSsJqNEQPCNKcUzCw0MLjt8H3vcfjc4sz1OK7qdIQ,679
  lalamo/model_import/decoder_configs/common.py,sha256=L8PCgF5fIt3RqPlmLiJpBzDguKk9iTjk4XSItxwVG4c,3260
  lalamo/model_import/decoder_configs/executorch.py,sha256=fTEG_j-7d8riR3Fu_H5tHDjOTrWevfyw7QbWF1mUdOQ,5924
@@ -47,20 +48,22 @@ lalamo/model_import/model_specs/lfm2.py,sha256=wg4Ggt6BbMO4ScJ6h8tjvBc3IVSrMudES
  lalamo/model_import/model_specs/llama.py,sha256=TxhKbIBFmGV2NopOg_k3ltsKlJccbxKyu-GQ7hYWCyw,3140
  lalamo/model_import/model_specs/llamba.py,sha256=Ic3sWTv34FLJ4fG6OR_Mc5goGJQR6fa5b2WbVXbn9FA,1471
  lalamo/model_import/model_specs/mirai.py,sha256=eifYVV5-fABiLH6rr82_DiVFtDyqpW0vbvXCYsQQzto,617
- lalamo/model_import/model_specs/mistral.py,sha256=HAojorjOqsJn2DoMBzYRw8A70qCslhFEsE9AF5xumlg,1278
+ lalamo/model_import/model_specs/mistral.py,sha256=i616AQg876PP9GHqHwaFIc29rUlsuhs0Z8_p0wv9eYg,1479
  lalamo/model_import/model_specs/pleias.py,sha256=5sRpZGYwLdsav6bLiW-459y1Cs9iJKgKkBIuGsOxtsQ,368
  lalamo/model_import/model_specs/polaris.py,sha256=Mw1-6bByjDmPIKlIUIV46CsmV5xUp_laI5Qquo5DmAQ,520
  lalamo/model_import/model_specs/qwen.py,sha256=HvN080ILpOwkqJbRLMqCa8Z8ImlLfTwiEIhWxUdTRfo,7563
  lalamo/model_import/model_specs/reka.py,sha256=dOUYbEMMvovQdzQuBO_DCsjGI39syhoKCvnxLkNEDCw,423
- lalamo/models/__init__.py,sha256=Vn5PcvSqKppIchkSZwQVTn_GpRvOOzZVxo5PUeDl6N8,283
+ lalamo/models/__init__.py,sha256=XMYuKSsiiIQUOq-ZtjIJcaIjTeCMYaO9bKJ9kvvLq98,394
  lalamo/models/classifier.py,sha256=LvL54crCVi4HVSIXuoaSLB_5jtcx74GL7kgdy2Y16Zc,2094
- lalamo/models/common.py,sha256=uU6eCHtIqMeC_aRGVo09NdpAtvQ6RKSbm6pumVvL8pc,2943
- lalamo/models/language_model.py,sha256=HtFS-R4Uqr7SohFstoAZFVrJI293N9cG_LVkXhZxgFI,13546
+ lalamo/models/common.py,sha256=8gMDvu0JXNejRslhzdurrPAS3ZymcR2Grq1RRpddc4M,3402
+ lalamo/models/compile_helpers.py,sha256=t_rGCznSAQC2W4ioGZUg4Oc7lpTL6VfutKtOZ06qfXo,2227
+ lalamo/models/language_model.py,sha256=YL86--CwI-T7h4ymCk3DXZ5Cswq3OCn_7wJGfYI6swk,26113
+ lalamo/models/lm_helpers.py,sha256=rocQ184MCF5gnFwLbzWR7mDrV6b-0VxvOkqbhPxsCKE,6590
  lalamo/modules/__init__.py,sha256=OHIQn08jx2c3L2KIQA-7SJ4yVb2E5m6T6FqTHFJTDdM,4006
  lalamo/modules/activations.py,sha256=25F4XytJMIwPPmUbxiDUrcrdUi4c-O9SUbwv9lnZbuU,992
  lalamo/modules/classifier.py,sha256=Q5eNzJ68to6JGk8IDZiKv6Rmwh15UyT2xC52tP5njoQ,11767
  lalamo/modules/common.py,sha256=Rc9zenrUMntDKZydI1tzt1ZIY8ggfyk3ZDB-xi81ibw,3406
- lalamo/modules/decoder.py,sha256=I30fptNifcdw9OOCU50aZnEqsJ2X4VM9YXdtRkxbqGc,7014
+ lalamo/modules/decoder.py,sha256=zC4IlSzBeEbHiAlGCl8TGCBqGLVtXb_FrJuC9cPwYqo,7103
  lalamo/modules/embedding.py,sha256=PdNy4tGt9F1zve4X73WKNS0DXL-nHUFOlZmGFUAarkQ,27727
  lalamo/modules/linear.py,sha256=4xIhmeouD7R10lt8KJBLxgypVXYhpGmXdHUc-96Upfk,42871
  lalamo/modules/mlp.py,sha256=ogxi9q8J38FnuBkAtC7_KTMc7JZG4BRdsAHYprHZNvM,17690
@@ -74,22 +77,21 @@ lalamo/modules/utils.py,sha256=t_TayWT6g5LtYKhJaod-u_COWaI_VbNd3eYek9Nj0lc,441
  lalamo/modules/token_mixers/__init__.py,sha256=lwxUl0eG5IvuVc_HOsINP2vtbv9F0cUmSNHFHaEmPGk,1109
  lalamo/modules/token_mixers/attention.py,sha256=ielw1-KWBfCPCPmzSHgM0TaSUcmSkWKTxrN3N_FsGm4,16144
  lalamo/modules/token_mixers/common.py,sha256=CcrbXXvGU27uxGLh5L-G8VDtcOiW5Wpm13uBEOd6lVg,1986
- lalamo/modules/token_mixers/mamba.py,sha256=zV5CnhEbAtJ32V32a2VZGsbjZ-sohMqRbR5kW9XH1AI,19087
+ lalamo/modules/token_mixers/mamba.py,sha256=EFyuAEjp6pNwOriesFnulOafyGRHYgqotou6UE55axE,28945
  lalamo/modules/token_mixers/short_conv.py,sha256=k1z9UwcJGag2NHWad7cYiAnhxULtmva9RrdhqVbir18,5085
  lalamo/modules/token_mixers/state/__init__.py,sha256=OKWPmiwszMWgwamewoVHd28owanHAO2j2e30Iivtv-4,384
  lalamo/modules/token_mixers/state/common.py,sha256=dcwBevAdeJpBjf7_YRk7TKrJHsCnpljhfzZy-3h9898,661
  lalamo/modules/token_mixers/state/kv_cache.py,sha256=QfnS3XgSmyDI9MBUbeLI4ABHLxiMcXDbZsqe0fd3KQo,8788
  lalamo/modules/token_mixers/state/mamba_state.py,sha256=LHzJvNE6MkB7nrsZSNto6pxbnMJCl--JOoe9Fkcc9Mg,1642
  lalamo/modules/token_mixers/state/short_conv_state.py,sha256=osjcDHoeFWQaUoOROzeJe8F1qC8rvqunimGD4CuIDHo,895
- lalamo/speculator/__init__.py,sha256=9-tmZcbCom_lIGpJYn6xLlnEahFLFidpqmgkafmu--k,456
+ lalamo/speculator/__init__.py,sha256=Ye3gMhrtNxaWPMzWbXFqKX7Rv32LGlT2k9eX2uvifKg,364
  lalamo/speculator/common.py,sha256=PudF_gkpe5_nQ-57sAC-foE1xCy_H2Axh5KwRoA86lo,587
- lalamo/speculator/estimator.py,sha256=WPG3rxKq4iLro8QwcePF766ageexHc17ANiF5rKAlKU,3833
- lalamo/speculator/inference.py,sha256=47TUiLV0Dkk3dbf1-IkdlWbHCICFw6IDwKZ73FYQUQo,4802
+ lalamo/speculator/inference.py,sha256=-RfgtdwMU4-EGnzc7oT8zJEhiA_Md03rPMyyi_BF26k,2792
  lalamo/speculator/ngram.py,sha256=2eqInIieJPaQHCvIfnCIDtwMa8PGEtiND_NkG7plE34,5899
  lalamo/speculator/utils.py,sha256=0wZoMMIzzk0Q-3zq5H5f-JBplePNHxywndkrNtOJOyo,1697
- lalamo-0.6.5.dist-info/licenses/LICENSE,sha256=diHRfjSEJHD1nnEeMIfMRCjR3UERf8bT3eseD6b1ayA,1072
- lalamo-0.6.5.dist-info/METADATA,sha256=EWI8eHaPSj7tXrW7xW9BPNpeDRjboNNvtGbq3hRELzU,3112
- lalamo-0.6.5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
- lalamo-0.6.5.dist-info/entry_points.txt,sha256=qli7qTfnBk5WP10rOGXXEckHMtt-atJMDWd8jN89Uks,43
- lalamo-0.6.5.dist-info/top_level.txt,sha256=VHvWL5JN5XRG36NsN_MieJ7EwRihEOrEjyDaTdFJ-aI,7
- lalamo-0.6.5.dist-info/RECORD,,
+ lalamo-0.6.6.dist-info/licenses/LICENSE,sha256=diHRfjSEJHD1nnEeMIfMRCjR3UERf8bT3eseD6b1ayA,1072
+ lalamo-0.6.6.dist-info/METADATA,sha256=Yc0I-RS-xekkjmN-Y5LQf-0R3gKuIeuFmZ3hPXJh4tY,3112
+ lalamo-0.6.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ lalamo-0.6.6.dist-info/entry_points.txt,sha256=qli7qTfnBk5WP10rOGXXEckHMtt-atJMDWd8jN89Uks,43
+ lalamo-0.6.6.dist-info/top_level.txt,sha256=VHvWL5JN5XRG36NsN_MieJ7EwRihEOrEjyDaTdFJ-aI,7
+ lalamo-0.6.6.dist-info/RECORD,,
@@ -1,127 +0,0 @@
- import functools
- import itertools
- import os
- from collections.abc import Callable
- from typing import NamedTuple
-
- import jax
- import jax.numpy as jnp
-
- from lalamo.models import LanguageModel
-
-
- def get_default_device_bytes() -> int | None:
-     dynamic_allocate = False
-
-     preallocate = os.getenv("XLA_PYTHON_CLIENT_PREALLOCATE", "")
-     dynamic_allocate |= preallocate.strip().lower() in {"0", "false", "no", "off"}
-
-     allocator = os.getenv("XLA_PYTHON_CLIENT_ALLOCATOR", "")
-     dynamic_allocate |= allocator.strip().lower() in {"platform", "cuda_malloc_async"}
-
-     if dynamic_allocate:
-         return None
-
-     memory_stats = jax.local_devices()[0].memory_stats()
-     if memory_stats is None or "bytes_limit" not in memory_stats:
-         return None
-
-     mem_fraction_raw = os.getenv("XLA_PYTHON_CLIENT_MEM_FRACTION", "")
-     try:
-         mem_fraction = float(mem_fraction_raw)
-     except ValueError:
-         mem_fraction = 0.75  # jax default https://docs.jax.dev/en/latest/gpu_memory_allocation.html
-
-     # 500mb is seemingly the usually observed overhead; this tries to match the actual capacity of the gpu
-     # so it should correspond to something you'd see in nvidia-smi
-     memory_limit = memory_stats["bytes_limit"] / min(mem_fraction, 1.0) + (500 * 1000 * 1000)
-
-     return get_usable_memory_from_bytes(memory_limit)
-
-
- def get_usable_memory_from_bytes(limit_bytes: int) -> int:
-     # JAX allocates a bit more than it needs, so we discount it by some safety factor
-     return int(limit_bytes * 0.93)
-
-
- def estimate_memory_from_batchsize(
-     model: LanguageModel,
-     max_input_length: int,
-     max_output_length: int,
-     num_logits_per_token: int,
-     batch_size: int,
- ) -> int:
-     memory_analysis = (
-         jax.jit(
-             functools.partial(
-                 LanguageModel.generate_tokens,
-                 max_output_length=max_output_length,
-                 num_top_logits_to_return=num_logits_per_token,
-             ),
-         )
-         .lower(
-             model,
-             prompt_token_ids=jax.ShapeDtypeStruct((batch_size, max_input_length), jnp.int32),
-             prompt_lengths_without_padding=jax.ShapeDtypeStruct((batch_size,), jnp.int32),
-         )
-         # disables autotune, see https://guides.lw1.at/all-xla-options/#--xla_gpu_autotune_level
-         .compile(compiler_options={"xla_gpu_autotune_level": "0"})
-         .memory_analysis()
-     )
-
-     assert hasattr(memory_analysis, "argument_size_in_bytes")
-     assert hasattr(memory_analysis, "output_size_in_bytes")
-     assert hasattr(memory_analysis, "temp_size_in_bytes")
-
-     return (
-         memory_analysis.argument_size_in_bytes
-         + memory_analysis.output_size_in_bytes
-         + memory_analysis.temp_size_in_bytes
-     )
-
-
- class EstimateBatchsizeFromMemoryEvent(NamedTuple):
-     lo: int
-     hi: int | None
-
-
- def estimate_batchsize_from_memory(
-     model: LanguageModel,
-     max_input_length: int,
-     max_output_length: int,
-     num_logits_per_token: int,
-     target_mem: int,
-     progress: Callable[[EstimateBatchsizeFromMemoryEvent], None] | None = None,
- ) -> int:
-     mem_for_bs = functools.cache(
-         functools.partial(
-             estimate_memory_from_batchsize,
-             model,
-             max_input_length,
-             max_output_length,
-             num_logits_per_token,
-         ),
-     )
-
-     lo = 0
-     hi = 0
-     for candidate_exp in itertools.count():
-         lo = hi
-         hi = 2**candidate_exp
-
-         if progress is not None:
-             progress(EstimateBatchsizeFromMemoryEvent(lo, None))
-         if target_mem < mem_for_bs(hi):
-             break
-
-     while hi - lo > 1:
-         mid = (lo + hi) // 2
-
-         if progress is not None:
-             progress(EstimateBatchsizeFromMemoryEvent(lo, hi))
-         if target_mem < mem_for_bs(mid):
-             hi = mid
-         else:
-             lo = mid
-
-     return lo
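The deleted `estimator.py` implemented batch-size autotuning in two steps: estimate device memory for a candidate batch size ahead of time via `jax.jit(...).lower(...).compile(...).memory_analysis()`, then grow the candidate by powers of two until the estimate exceeds the budget and bisect between the last size that fit and the first that overflowed. A self-contained sketch of that doubling-then-bisection search, with an illustrative stand-in `mem_for_bs` replacing the removed `estimate_memory_from_batchsize`:

```python
import itertools
from collections.abc import Callable


def largest_batch_under_budget(mem_for_bs: Callable[[int], int], target_mem: int) -> int:
    # Phase 1: exponential growth. Probe batch sizes 1, 2, 4, 8, ... until
    # the (assumed monotone) memory estimate first exceeds the budget.
    lo, hi = 0, 0
    for exp in itertools.count():
        lo, hi = hi, 2**exp
        if target_mem < mem_for_bs(hi):
            break  # hi overflows the budget; lo was the last size that fit

    # Phase 2: binary search in (lo, hi) for the largest size that still fits.
    while hi - lo > 1:
        mid = (lo + hi) // 2
        if target_mem < mem_for_bs(mid):
            hi = mid
        else:
            lo = mid

    return lo


# Toy cost model: 1 MB per batch element against a 10 MB budget.
assert largest_batch_under_budget(lambda bs: bs * 1_000_000, 10_000_000) == 10
```

The doubling phase probes only O(log n) candidates, and because the original wrapped `mem_for_bs` in `functools.cache`, each batch size was lowered, compiled, and measured at most once.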