lalamo 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. lalamo/__init__.py +20 -5
  2. lalamo/data/__init__.py +8 -0
  3. lalamo/data/huggingface_message.py +38 -0
  4. lalamo/data/lalamo_completions.py +43 -0
  5. lalamo/data/utils.py +8 -0
  6. lalamo/language_model.py +152 -69
  7. lalamo/main.py +271 -43
  8. lalamo/message_processor.py +11 -1
  9. lalamo/model_import/common.py +10 -6
  10. lalamo/model_import/decoder_configs/__init__.py +3 -0
  11. lalamo/model_import/decoder_configs/executorch.py +12 -6
  12. lalamo/model_import/decoder_configs/huggingface/__init__.py +2 -0
  13. lalamo/model_import/decoder_configs/huggingface/common.py +1 -3
  14. lalamo/model_import/decoder_configs/huggingface/gemma2.py +11 -5
  15. lalamo/model_import/decoder_configs/huggingface/gemma3.py +14 -5
  16. lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +195 -0
  17. lalamo/model_import/decoder_configs/huggingface/llama.py +38 -8
  18. lalamo/model_import/decoder_configs/huggingface/mistral.py +12 -6
  19. lalamo/model_import/decoder_configs/huggingface/qwen2.py +12 -6
  20. lalamo/model_import/decoder_configs/huggingface/qwen3.py +12 -6
  21. lalamo/model_import/huggingface_tokenizer_config.py +1 -3
  22. lalamo/model_import/loaders/executorch.py +10 -9
  23. lalamo/model_import/loaders/huggingface.py +104 -9
  24. lalamo/model_import/loaders/utils.py +92 -0
  25. lalamo/model_import/model_specs/__init__.py +4 -1
  26. lalamo/model_import/model_specs/common.py +15 -12
  27. lalamo/model_import/model_specs/gpt_oss.py +21 -0
  28. lalamo/modules/__init__.py +35 -7
  29. lalamo/modules/activations.py +24 -14
  30. lalamo/modules/attention.py +73 -20
  31. lalamo/modules/common.py +8 -57
  32. lalamo/modules/decoder.py +48 -34
  33. lalamo/modules/decoder_layer.py +57 -43
  34. lalamo/modules/embedding.py +13 -19
  35. lalamo/modules/kv_cache.py +53 -16
  36. lalamo/modules/linear.py +260 -79
  37. lalamo/modules/mlp.py +395 -23
  38. lalamo/modules/normalization.py +2 -3
  39. lalamo/modules/rope.py +32 -21
  40. lalamo/modules/utils.py +10 -0
  41. lalamo/speculator/__init__.py +11 -0
  42. lalamo/speculator/common.py +22 -0
  43. lalamo/speculator/inference.py +75 -0
  44. lalamo/speculator/ngram.py +154 -0
  45. lalamo/speculator/utils.py +52 -0
  46. lalamo/utils.py +27 -0
  47. {lalamo-0.3.4.dist-info → lalamo-0.4.0.dist-info}/METADATA +11 -4
  48. lalamo-0.4.0.dist-info/RECORD +71 -0
  49. lalamo-0.3.4.dist-info/RECORD +0 -59
  50. {lalamo-0.3.4.dist-info → lalamo-0.4.0.dist-info}/WHEEL +0 -0
  51. {lalamo-0.3.4.dist-info → lalamo-0.4.0.dist-info}/entry_points.txt +0 -0
  52. {lalamo-0.3.4.dist-info → lalamo-0.4.0.dist-info}/licenses/LICENSE +0 -0
  53. {lalamo-0.3.4.dist-info → lalamo-0.4.0.dist-info}/top_level.txt +0 -0
lalamo/__init__.py CHANGED
@@ -1,11 +1,26 @@
-from lalamo.model_import import REPO_TO_MODEL, ModelSpec, import_model
-from lalamo.modules import Decoder
+from lalamo.language_model import LanguageModel
+from lalamo.message_processor import (
+    AssistantMessage,
+    ContentBlock,
+    Image,
+    Message,
+    SystemMessage,
+    ToolSchema,
+    UserMessage,
+)
+from lalamo.model_import import ModelSpec, import_model
 
-__version__ = "0.3.4"
+__version__ = "0.4.0"
 
 __all__ = [
-    "REPO_TO_MODEL",
-    "Decoder",
+    "AssistantMessage",
+    "ContentBlock",
+    "Image",
+    "LanguageModel",
+    "Message",
     "ModelSpec",
+    "SystemMessage",
+    "ToolSchema",
+    "UserMessage",
     "import_model",
 ]
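The top-level package now exposes the chat-oriented API instead of the raw Decoder. A minimal usage sketch, assuming a model directory (hypothetical path) that contains the config.json, model.safetensors, and tokenizer.json read by LanguageModel.load in the language_model.py diff below:

# Minimal sketch; "path/to/converted-model" is a hypothetical directory
# produced by import_model, containing config.json, model.safetensors,
# and tokenizer.json.
from lalamo import LanguageModel, UserMessage

model = LanguageModel.load("path/to/converted-model")
answer = model.reply([UserMessage("What is the capital of France?")])
print(answer)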
lalamo/data/__init__.py ADDED
@@ -0,0 +1,8 @@
+from .huggingface_message import import_hf_parquet
+from .utils import get_prefixes_ending_in_user_message
+
+__all__ = [
+    "get_prefixes_ending_in_user_message",
+    "import_hf_parquet",
+]
+
lalamo/data/huggingface_message.py ADDED
@@ -0,0 +1,38 @@
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+from typing import ClassVar, Self
+
+import cattrs
+import polars as pl
+
+from lalamo.message_processor import AssistantMessage, Message, UserMessage
+
+
+@dataclass(frozen=True)
+class HFMessage:
+    _converter: ClassVar[cattrs.Converter] = cattrs.Converter()
+
+    role: str
+    content: str
+
+    @classmethod
+    def from_dict(cls, obj: dict) -> Self:
+        return cls._converter.structure(obj, cls)
+
+    def as_message(self) -> Message:
+        match self.role:
+            case "user":
+                return UserMessage(self.content)
+            case "assistant":
+                return AssistantMessage(None, self.content)
+            case other:
+                raise ValueError(f"Cannot convert {other} message")
+
+def import_hf_parquet(path: Path | str) -> Iterable[list[Message]]:
+    path = Path(path)
+
+    dataframe = pl.scan_parquet(path).collect()
+
+    for conversation in dataframe.get_column("conversation").shuffle(1337):
+        yield [HFMessage.from_dict(message).as_message() for message in conversation]
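A hedged sketch of consuming the new loader; the parquet filename is hypothetical, and the file must hold a "conversation" column of {role, content} records as the code above expects. Note that only "user" and "assistant" roles are converted; any other role raises a ValueError.

# Hypothetical dataset file; conversations are yielded in a fixed
# shuffled order (seed 1337 above).
from lalamo.data import import_hf_parquet

for conversation in import_hf_parquet("conversations.parquet"):
    for message in conversation:  # UserMessage / AssistantMessage instances
        print(type(message).__name__)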
lalamo/data/lalamo_completions.py ADDED
@@ -0,0 +1,43 @@
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import IO, Any, ClassVar, Self
+
+import msgpack
+from cattrs.preconf.msgpack import MsgpackConverter
+from cattrs.preconf.msgpack import make_converter as make_msgpack_converter
+
+
+@dataclass(frozen=True)
+class LalamoCompletion:
+    _converter: ClassVar[MsgpackConverter] = make_msgpack_converter()
+
+    prefix_token_ids: list[int]
+    completion_token_ids: list[int]
+    completion_token_logits: list[dict[int, float]]
+
+    def __post_init__(self) -> None:
+        if len(self.completion_token_ids) != len(self.completion_token_logits):
+            raise ValueError(f"({len(self.completion_token_ids)=}) != ({len(self.completion_token_logits)=})")
+
+    def serialize(self) -> bytes:
+        return self._converter.dumps(self)
+
+    @classmethod
+    def deserialize(cls, data: bytes | IO[bytes]) -> Self:
+        if isinstance(data, bytes):
+            obj: Any = msgpack.unpackb(data, strict_map_key=False)
+        else:
+            obj = msgpack.unpack(data, strict_map_key=False)
+
+        return cls._converter.structure(obj, cls)
+
+    @classmethod
+    def deserialize_many(cls, data: bytes | IO[bytes]) -> Iterable[Self]:
+        if isinstance(data, bytes):
+            unpacker = msgpack.Unpacker(strict_map_key=False)
+            unpacker.feed(data)
+        else:
+            unpacker = msgpack.Unpacker(file_like=data, strict_map_key=False)
+
+        for obj in unpacker:
+            yield cls._converter.structure(obj, cls)
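A round-trip sketch of the serialization above, with made-up token ids and logits. strict_map_key=False is what lets the integer keys in completion_token_logits survive msgpack decoding, and __post_init__ enforces one logits dict per completion token.

# Round-trip with made-up values; the two completion fields must have
# equal length, per the __post_init__ check above.
from lalamo.data.lalamo_completions import LalamoCompletion

completion = LalamoCompletion(
    prefix_token_ids=[1, 15, 42],
    completion_token_ids=[7, 9],
    completion_token_logits=[{7: -0.1, 3: -2.5}, {9: -0.3, 2: -1.9}],
)
assert LalamoCompletion.deserialize(completion.serialize()) == completion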
lalamo/data/utils.py ADDED
@@ -0,0 +1,8 @@
+from collections.abc import Iterable
+
+from lalamo.message_processor import Message, UserMessage
+
+
+def get_prefixes_ending_in_user_message(conversation: Iterable[Message]) -> list[list[Message]]:
+    conversation = list(conversation)
+    return [conversation[: i + 1] for i, msg in enumerate(conversation) if isinstance(msg, UserMessage)]
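A worked example of the helper above, with made-up messages: every returned prefix ends at a UserMessage, so each one is a valid generation prompt.

# Made-up conversation; the helper yields one prefix per user turn.
from lalamo.data import get_prefixes_ending_in_user_message
from lalamo.message_processor import AssistantMessage, UserMessage

conversation = [
    UserMessage("Hi"),
    AssistantMessage(None, "Hello!"),
    UserMessage("How are you?"),
    AssistantMessage(None, "Great."),
]
prefixes = get_prefixes_ending_in_user_message(conversation)
assert len(prefixes) == 2
assert len(prefixes[0]) == 1  # [user_1]
assert len(prefixes[1]) == 3  # [user_1, assistant_1, user_2]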
lalamo/language_model.py CHANGED
@@ -7,33 +7,56 @@ from typing import NamedTuple, Self
 import equinox as eqx
 import jax
 import jax.numpy as jnp
+from einops import rearrange
+from jax import vmap
 from jaxtyping import Array, Bool, Float, Int, PRNGKeyArray
-from safetensors.flax import load_file
 from tokenizers import Tokenizer
 
 from lalamo.common import DTypeLike, ParameterTree, unflatten_parameters
 from lalamo.message_processor import AssistantMessage, Message, MessageProcessor, MessageProcessorConfig
-from lalamo.modules import Decoder, DecoderConfig, KVCache, LalamoModule, WeightLayout, config_converter
+from lalamo.modules import Decoder, DecoderConfig, KVCache, LalamoModule, config_converter
+from lalamo.modules.common import ForwardPassMode
+from lalamo.modules.decoder import DecoderForwardPassConfig
 from lalamo.sampling import SamplingPolicy, make_policy
+from lalamo.utils import open_safetensors
 
 __all__ = [
+    "ForwardPassConfig",
     "GenerationConfig",
     "LanguageModel",
     "LanguageModelConfig",
 ]
 
 
+_COMPILED_PROMPT_LENGTHS = [512 * 2**i for i in range(10)]
+
+
+type ForwardPassConfig = DecoderForwardPassConfig
+
+
 class PrefillResults(NamedTuple):
-    last_token_logits: Float[Array, " vocabulary"]
-    last_token_position: Int[Array, ""]
+    last_token_logits: Float[Array, "batch vocabulary"]
+    last_token_indices: Int[Array, " batch"]
     kv_cache: KVCache
 
 
 class DecodingState(NamedTuple):
-    last_token_logits: Float[Array, " vocabulary"]
-    last_token_position: Int[Array, ""]
+    last_token_logits: Float[Array, "batch vocabulary"]
+    last_token_indices: Int[Array, " batch"]
     kv_cache: KVCache
-    stop_flag: Bool[Array, ""]
+    stop_flags: Bool[Array, " batch"]
+
+
+class GenerationStepResults(NamedTuple):
+    token_ids: Int[Array, " batch"]
+    top_k_token_ids: Int[Array, " batch k"] | None
+    top_k_token_logits: Float[Array, " batch k"] | None
+
+
+class GenerationResults(NamedTuple):
+    token_ids: Int[Array, "batch response_tokens"]
+    top_k_token_ids: Int[Array, "batch response_tokens k"] | None
+    top_k_token_logits: Float[Array, "batch response_tokens k"] | None
 
 
 @dataclass(frozen=True)
@@ -60,14 +83,15 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
     message_processor: MessageProcessor = eqx.field(static=True)
 
     @classmethod
-    def load(cls, path: Path | str, weight_layout: WeightLayout = WeightLayout.AUTO) -> Self:
+    def load(cls, path: Path | str) -> Self:
         if isinstance(path, str):
             path = Path(path)
         with open(path / "config.json") as config_file:
             config_json = json.load(config_file)
         config = config_converter.structure(config_json["model_config"], LanguageModelConfig)
-        weights = unflatten_parameters(load_file(path / "model.safetensors"))
-        decoder = config.decoder_config.empty().import_weights(weights, weight_layout)
+        with open_safetensors(path / "model.safetensors") as weights_dict:
+            weights = unflatten_parameters(weights_dict)
+        decoder = config.decoder_config.empty().import_weights(weights)
         tokenizer = Tokenizer.from_file(str(path / "tokenizer.json"))
         message_processor = MessageProcessor(config.message_processor_config, tokenizer)
         return cls(config, decoder, message_processor)
@@ -76,17 +100,16 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
     def activation_precision(self) -> DTypeLike:
         return self.decoder.activation_precision
 
-    def export_weights(self, weight_layout: WeightLayout = WeightLayout.AUTO) -> ParameterTree:
-        return self.decoder.export_weights(weight_layout)
+    def export_weights(self) -> ParameterTree:
+        return self.decoder.export_weights()
 
     def import_weights(
         self,
         weights: ParameterTree[Array],
-        weight_layout: WeightLayout = WeightLayout.AUTO,
     ) -> Self:
         return replace(
             self,
-            decoder=self.decoder.import_weights(weights, weight_layout),
+            decoder=self.decoder.import_weights(weights),
         )
 
     @property
@@ -99,14 +122,15 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
     @eqx.filter_jit
     def _prefill(
         self,
-        token_ids: Int[Array, " tokens"],
-        length_without_padding: Int[Array, ""] | int | None = None,
+        token_ids: Int[Array, "batch tokens"],
+        lengths_without_padding: Int[Array, " batch"] | None = None,
         kv_cache_capacity: int | None = None,
+        forward_pass_config: ForwardPassConfig | None = None,
     ) -> PrefillResults:
-        (num_tokens,) = token_ids.shape
-        token_positions = jnp.arange(num_tokens, dtype=jnp.int32)
+        batch_size, sequence_length = token_ids.shape
+        token_positions = jnp.repeat(jnp.arange(sequence_length, dtype=jnp.int32)[None, ...], batch_size, axis=0)
         if kv_cache_capacity is not None:
-            kv_cache = self.decoder.init_static_kv_cache(kv_cache_capacity)
+            kv_cache = self.decoder.init_static_kv_cache(batch_size, kv_cache_capacity)
         else:
             kv_cache = None
 
@@ -115,52 +139,56 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
             token_positions,
             kv_cache,
             return_updated_kv_cache=True,
-            length_without_padding=length_without_padding,
+            lengths_without_padding=lengths_without_padding,
+            forward_pass_mode=ForwardPassMode.MULTI_TOKEN,
+            forward_pass_config=forward_pass_config,
         )
 
-        if length_without_padding is not None:
-            last_logits_index = length_without_padding - 1
+        if lengths_without_padding is not None:
+            last_logits_indices = lengths_without_padding - 1
         else:
-            last_logits_index = num_tokens - 1
+            last_logits_indices = jnp.array([sequence_length - 1] * batch_size, dtype=jnp.int32)
 
-        last_token_logits = decoder_outputs.logits[last_logits_index, :]
-        last_token_position = jnp.array(last_logits_index, dtype=jnp.int32)
+        last_token_logits = vmap(lambda logits, index: logits[index])(decoder_outputs.logits, last_logits_indices)
 
         assert decoder_outputs.updated_kv_cache is not None
         return PrefillResults(
             last_token_logits=last_token_logits,
-            last_token_position=last_token_position,
+            last_token_indices=last_logits_indices,
             kv_cache=decoder_outputs.updated_kv_cache,
         )
 
     @eqx.filter_jit
     def generate_tokens(
         self,
-        prompt_token_ids: Int[Array, " prompt_tokens"],
+        prompt_token_ids: Int[Array, "batch prompt_tokens"],
         sampling_policy: SamplingPolicy | None = None,
-        prompt_length_without_padding: Int[Array, ""] | int | None = None,
+        prompt_lengths_without_padding: Int[Array, " batch"] | None = None,
         max_output_length: int = 8192,
         eos_token_ids: Int[Array, " eos_tokens"] | None = None,
+        forward_pass_config: ForwardPassConfig | None = None,
+        num_top_logits_to_return: int | None = None,
         *,
         key: PRNGKeyArray | None = None,
-    ) -> Int[Array, " response_tokens"]:
+    ) -> GenerationResults:
         if sampling_policy is None:
             sampling_policy = self.default_sampling_policy()
         if eos_token_ids is None:
             eos_token_ids = jnp.array(self.stop_token_ids, dtype=jnp.int32)
 
-        (input_length,) = prompt_token_ids.shape
+        batch_size, sequence_length = prompt_token_ids.shape
         prefill_results = self._prefill(
             prompt_token_ids,
-            prompt_length_without_padding,
-            input_length + max_output_length,
+            prompt_lengths_without_padding,
+            sequence_length + max_output_length,
+            forward_pass_config=forward_pass_config,
        )
 
         initial_state = DecodingState(
             prefill_results.last_token_logits,
-            prefill_results.last_token_position,
+            prefill_results.last_token_indices,
             prefill_results.kv_cache,
-            jnp.array(0, dtype=jnp.bool),
+            jnp.zeros(batch_size, dtype=jnp.bool),
         )
 
         if key is None:
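The batched _prefill now gathers each row's final logits with a per-row index via vmap, replacing the old scalar indexing. A standalone sketch of that gather, with illustrative shapes:

# Per-row gather, as in _prefill above: row b contributes
# logits[b, last_indices[b]], giving one logits vector per batch row.
import jax.numpy as jnp
from jax import vmap

logits = jnp.arange(2 * 4 * 3, dtype=jnp.float32).reshape(2, 4, 3)  # (batch, tokens, vocabulary)
last_indices = jnp.array([1, 3], dtype=jnp.int32)  # last unpadded position per row
last_logits = vmap(lambda row, index: row[index])(logits, last_indices)
assert last_logits.shape == (2, 3)  # (batch, vocabulary)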
@@ -170,49 +198,88 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
         def loop_iteration(
             state: DecodingState,
             key: PRNGKeyArray,
-        ) -> tuple[DecodingState, Int[Array, ""]]:
-            def sample_and_update() -> tuple[DecodingState, Int[Array, ""]]:
-                processed_logits = sampling_policy.process_logits(state.last_token_logits)
-                next_token_id = jax.random.categorical(key, processed_logits)
-                next_token_position = state.last_token_position + 1
-
-                stop_flag = state.stop_flag | jnp.any(next_token_id == eos_token_ids)
+        ) -> tuple[DecodingState, GenerationStepResults]:
+            def sample_and_update() -> tuple[DecodingState, GenerationStepResults]:
+                upcasted_logits = state.last_token_logits.astype(jnp.float32)
+                processed_logits = vmap(sampling_policy.process_logits)(upcasted_logits)
+                next_token_ids = jax.random.categorical(key, processed_logits)
+                next_token_ids = jnp.where(state.stop_flags, jnp.zeros(batch_size, dtype=jnp.int32), next_token_ids)
+                if num_top_logits_to_return is not None:
+                    next_top_k_token_logits, next_top_k_token_ids = jax.lax.top_k(
+                        processed_logits,
+                        num_top_logits_to_return,
+                    )
+                else:
+                    next_top_k_token_ids = None
+                    next_top_k_token_logits = None
+                next_token_indices = state.last_token_indices + 1
+
+                stop_flags = state.stop_flags | jnp.any(next_token_ids[:, None] == eos_token_ids[None, :], axis=-1)
+
+                if batch_size == 1:
+                    forward_pass_mode = ForwardPassMode.SINGLE_TOKEN
+                else:
+                    forward_pass_mode = ForwardPassMode.MULTI_TOKEN
 
                 decoder_outputs = self.decoder(
-                    next_token_id.reshape(1),
-                    next_token_position.reshape(1),
+                    next_token_ids[:, None],
+                    next_token_indices[:, None],
                     state.kv_cache,
                     return_updated_kv_cache=True,
+                    forward_pass_mode=forward_pass_mode,
+                    forward_pass_config=forward_pass_config,
                 )
                 assert decoder_outputs.updated_kv_cache is not None, "updated_kv_cache should not be None"
                 new_state = DecodingState(
-                    decoder_outputs.logits.squeeze(),
-                    next_token_position,
+                    decoder_outputs.logits.squeeze(1),
+                    next_token_indices,
                     decoder_outputs.updated_kv_cache,
-                    stop_flag,
+                    stop_flags,
                 )
-                return new_state, next_token_id
+                return new_state, GenerationStepResults(next_token_ids, next_top_k_token_ids, next_top_k_token_logits)
 
-            def pad_and_repeat_state() -> tuple[DecodingState, Int[Array, ""]]:
-                pad_token = jnp.array(0, dtype=jnp.int32)
-                return state, pad_token
+            def pad_and_repeat_state() -> tuple[DecodingState, GenerationStepResults]:
+                (batch_size,) = state.stop_flags.shape
+                pad_token = jnp.zeros(batch_size, dtype=jnp.int32)
+                if num_top_logits_to_return is not None:
+                    top_k_token_ids = jnp.zeros((batch_size, num_top_logits_to_return), dtype=jnp.int32)
+                    top_k_token_logits = jnp.zeros((batch_size, num_top_logits_to_return), dtype=jnp.float32)
+                else:
+                    top_k_token_ids = None
+                    top_k_token_logits = None
+                return state, GenerationStepResults(pad_token, top_k_token_ids, top_k_token_logits)
 
-            return jax.lax.cond(state.stop_flag, pad_and_repeat_state, sample_and_update)
+            return jax.lax.cond(jnp.all(state.stop_flags), pad_and_repeat_state, sample_and_update)
 
-        _, tokens = jax.lax.scan(loop_iteration, initial_state, keys)
+        _, generated = jax.lax.scan(loop_iteration, initial_state, keys)
 
-        return tokens
+        token_ids = rearrange(generated.token_ids, "iteration batch -> batch iteration")
+
+        if num_top_logits_to_return is not None:
+            top_k_token_ids = rearrange(generated.top_k_token_ids, "iteration batch k -> batch iteration k")
+            top_k_token_logits = rearrange(generated.top_k_token_logits, "iteration batch k -> batch iteration k")
+        else:
+            top_k_token_ids = None
+            top_k_token_logits = None
+
+        return GenerationResults(token_ids, top_k_token_ids, top_k_token_logits)
 
     def reply(
         self,
         messages: Iterable[Message],
         sampling_policy: SamplingPolicy | None = None,
+        forward_pass_config: ForwardPassConfig | None = None,
         *,
         key: PRNGKeyArray | None = None,
     ) -> AssistantMessage:
         formatted_messages = self.message_processor.render_request(messages)
-        token_ids = jnp.array(self.message_processor.tokenize(formatted_messages), dtype=jnp.int32)
-        response_ids = self.generate_tokens(token_ids, sampling_policy, key=key)
+        token_ids = jnp.array(self.message_processor.tokenize(formatted_messages), dtype=jnp.int32)[None, :]
+        response_ids = self.generate_tokens(
+            token_ids,
+            sampling_policy,
+            forward_pass_config=forward_pass_config,
+            key=key,
+        ).token_ids.squeeze(0)
         response_text = self.message_processor.detokenize(response_ids.tolist())
         return self.message_processor.parse_response(response_text)
 
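generate_tokens now runs whole batches: prompts are right-padded to a shared length, true lengths go in prompt_lengths_without_padding, and the optional num_top_logits_to_return requests per-step top-k logits. A hedged sketch, assuming `model` is a loaded LanguageModel and using made-up token ids:

# Hedged sketch; `model` is a loaded LanguageModel, token ids are made up.
# Row 0 is one token shorter than row 1, so it is right-padded with 0.
import jax
import jax.numpy as jnp

prompts = jnp.array([[5, 8, 13, 0], [3, 1, 4, 1]], dtype=jnp.int32)
results = model.generate_tokens(
    prompts,
    prompt_lengths_without_padding=jnp.array([3, 4], dtype=jnp.int32),
    max_output_length=64,
    num_top_logits_to_return=5,
    key=jax.random.PRNGKey(0),
)
# results.token_ids has shape (batch, response_tokens); rows that hit an
# EOS token early are padded with token id 0, matching pad_and_repeat_state.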
@@ -220,21 +287,29 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
         self,
         messages: Iterable[Message],
         sampling_policy: SamplingPolicy | None = None,
+        max_output_length: int = 8192,
+        forward_pass_config: ForwardPassConfig | None = None,
         *,
         key: PRNGKeyArray | None = None,
     ) -> Iterable[str]:
         formatted_messages = self.message_processor.render_request(messages)
         token_ids = jnp.array(self.message_processor.tokenize(formatted_messages), dtype=jnp.int32)
-        for token_id in self.stream_tokens(token_ids, sampling_policy, key=key):
+        for token_id in self.stream_tokens(
+            token_ids,
+            sampling_policy,
+            max_output_length,
+            forward_pass_config=forward_pass_config,
+            key=key,
+        ):
             yield self.message_processor.detokenize([token_id.item()])
 
     def stream_tokens(
         self,
         prompt_token_ids: Int[Array, " prompt_tokens"],
         sampling_policy: SamplingPolicy | None = None,
-        prompt_length_without_padding: Int[Array, ""] | int | None = None,
         max_output_length: int = 8192,
         eos_token_ids: Int[Array, " eos_tokens"] | None = None,
+        forward_pass_config: ForwardPassConfig | None = None,
         *,
         key: PRNGKeyArray | None = None,
     ) -> Iterable[Int[Array, ""]]:
@@ -244,10 +319,16 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
             eos_token_ids = jnp.array(self.stop_token_ids, dtype=jnp.int32)
 
         (input_length,) = prompt_token_ids.shape
+
+        padded_input_length = min(length for length in _COMPILED_PROMPT_LENGTHS if length >= input_length)
+        padded_token_ids = jnp.zeros((padded_input_length,), dtype=jnp.int32)
+        padded_token_ids = padded_token_ids.at[:input_length].set(prompt_token_ids)
+
         prefill_results = self._prefill(
-            prompt_token_ids,
-            prompt_length_without_padding,
-            input_length + max_output_length,
+            padded_token_ids[None, :],
+            jnp.array([input_length], dtype=jnp.int32),
+            padded_input_length + max_output_length,
+            forward_pass_config=forward_pass_config,
         )
 
         if key is None:
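The streaming path now pads every prompt up to the nearest bucket in _COMPILED_PROMPT_LENGTHS (512, 1024, ..., 262144), so the JIT-compiled _prefill only ever sees ten distinct prompt shapes instead of one per prompt length. The selection logic, isolated:

# Bucket selection as used above: the smallest compiled length that fits.
_COMPILED_PROMPT_LENGTHS = [512 * 2**i for i in range(10)]  # 512, 1024, ..., 262144

input_length = 700
padded_input_length = min(length for length in _COMPILED_PROMPT_LENGTHS if length >= input_length)
assert padded_input_length == 1024

Note that a prompt longer than the largest bucket would leave the generator empty, and min() would raise a ValueError.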
@@ -256,13 +337,14 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
 
         state = DecodingState(
             prefill_results.last_token_logits,
-            prefill_results.last_token_position,
+            prefill_results.last_token_indices,
             prefill_results.kv_cache,
-            jnp.array(0, dtype=jnp.bool),
+            jnp.array([0], dtype=jnp.bool),
         )
 
         for iter_key in keys:
-            processed_logits = sampling_policy.process_logits(state.last_token_logits)
+            upcasted_logits = state.last_token_logits.astype(jnp.float32)
+            processed_logits = sampling_policy.process_logits(upcasted_logits.squeeze(0))
             next_token_id = jax.random.categorical(iter_key, processed_logits)
 
             yield next_token_id
@@ -270,17 +352,18 @@ class LanguageModel(LalamoModule[LanguageModelConfig]):
             if jnp.any(next_token_id == eos_token_ids):
                 return
 
-            next_token_position = state.last_token_position + 1
+            next_token_indices = state.last_token_indices + 1
             decoder_outputs = self.decoder(
-                next_token_id.reshape(1),
-                next_token_position.reshape(1),
+                next_token_id.reshape(1, 1),
+                next_token_indices.reshape(1, 1),
                 state.kv_cache,
                 return_updated_kv_cache=True,
+                forward_pass_config=forward_pass_config,
             )
             assert decoder_outputs.updated_kv_cache is not None, "updated_kv_cache should not be None"
             state = DecodingState(
-                decoder_outputs.logits.squeeze(),
-                next_token_position,
+                decoder_outputs.logits.squeeze(1),
+                next_token_indices,
                 decoder_outputs.updated_kv_cache,
-                state.stop_flag,
+                state.stop_flags,
             )