lalamo-0.3.3-py3-none-any.whl → lalamo-0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lalamo/__init__.py +20 -5
- lalamo/data/__init__.py +8 -0
- lalamo/data/huggingface_message.py +38 -0
- lalamo/data/lalamo_completions.py +43 -0
- lalamo/data/utils.py +8 -0
- lalamo/language_model.py +152 -69
- lalamo/main.py +271 -43
- lalamo/message_processor.py +11 -1
- lalamo/model_import/common.py +17 -7
- lalamo/model_import/decoder_configs/__init__.py +3 -0
- lalamo/model_import/decoder_configs/executorch.py +12 -6
- lalamo/model_import/decoder_configs/huggingface/__init__.py +2 -0
- lalamo/model_import/decoder_configs/huggingface/common.py +1 -3
- lalamo/model_import/decoder_configs/huggingface/gemma2.py +11 -5
- lalamo/model_import/decoder_configs/huggingface/gemma3.py +14 -5
- lalamo/model_import/decoder_configs/huggingface/gpt_oss.py +195 -0
- lalamo/model_import/decoder_configs/huggingface/llama.py +38 -8
- lalamo/model_import/decoder_configs/huggingface/mistral.py +12 -6
- lalamo/model_import/decoder_configs/huggingface/qwen2.py +12 -6
- lalamo/model_import/decoder_configs/huggingface/qwen3.py +12 -6
- lalamo/model_import/huggingface_tokenizer_config.py +1 -4
- lalamo/model_import/loaders/executorch.py +10 -9
- lalamo/model_import/loaders/huggingface.py +104 -9
- lalamo/model_import/loaders/utils.py +92 -0
- lalamo/model_import/model_specs/__init__.py +4 -1
- lalamo/model_import/model_specs/common.py +15 -12
- lalamo/model_import/model_specs/gpt_oss.py +21 -0
- lalamo/modules/__init__.py +35 -7
- lalamo/modules/activations.py +24 -14
- lalamo/modules/attention.py +73 -20
- lalamo/modules/common.py +8 -57
- lalamo/modules/decoder.py +48 -34
- lalamo/modules/decoder_layer.py +57 -43
- lalamo/modules/embedding.py +13 -19
- lalamo/modules/kv_cache.py +53 -16
- lalamo/modules/linear.py +260 -79
- lalamo/modules/mlp.py +395 -23
- lalamo/modules/normalization.py +2 -3
- lalamo/modules/rope.py +32 -21
- lalamo/modules/utils.py +10 -0
- lalamo/speculator/__init__.py +11 -0
- lalamo/speculator/common.py +22 -0
- lalamo/speculator/inference.py +75 -0
- lalamo/speculator/ngram.py +154 -0
- lalamo/speculator/utils.py +52 -0
- lalamo/utils.py +27 -0
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/METADATA +11 -4
- lalamo-0.4.0.dist-info/RECORD +71 -0
- lalamo-0.3.3.dist-info/RECORD +0 -59
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/WHEEL +0 -0
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/entry_points.txt +0 -0
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {lalamo-0.3.3.dist-info → lalamo-0.4.0.dist-info}/top_level.txt +0 -0
lalamo/modules/decoder_layer.py
CHANGED
@@ -1,41 +1,48 @@
 from collections.abc import Mapping
 from dataclasses import dataclass, replace
+from functools import partial
 from typing import Self
 
 import equinox as eqx
 import jax
+import jax.numpy as jnp
 from jax import vmap
 from jaxtyping import Array, DTypeLike, Float, Int, PRNGKeyArray
 
 from lalamo.common import ParameterTree
 
 from .attention import Attention, AttentionConfig
-from .common import AttentionType,
+from .common import AttentionType, ForwardPassMode, LalamoModule
 from .kv_cache import KVCacheLayer, StaticKVCacheLayer
-from .mlp import
+from .mlp import MLPBase, MLPConfig, MLPForwardPassConfig
 from .normalization import RMSNorm, RMSNormConfig
 from .rope import PositionalEmbeddings
+from .utils import vmap_twice
 
 __all__ = [
     "DecoderLayer",
     "DecoderLayerActivationTrace",
     "DecoderLayerConfig",
+    "DecoderLayerForwardPassConfig",
     "DecoderLayerResult",
 ]
 
 
+type DecoderLayerForwardPassConfig = MLPForwardPassConfig
+
+
 class DecoderLayerActivationTrace(eqx.Module):
-    inputs: Float[Array, "suffix_tokens channels"]
+    inputs: Float[Array, "batch suffix_tokens channels"]
     positional_embeddings: PositionalEmbeddings
     kv_cache: KVCacheLayer | None
 
-    mlp_inputs: Float[Array, "suffix_tokens channels"]
-    pre_attention_norm: Float[Array, "suffix_tokens channels"]
-    attention: Float[Array, "suffix_tokens channels"]
-    post_attention_norm: Float[Array, "suffix_tokens channels"] | None
-    pre_mlp_norm: Float[Array, "suffix_tokens channels"]
-    mlp: Float[Array, "suffix_tokens channels"]
-    post_mlp_norm: Float[Array, "suffix_tokens channels"] | None
+    mlp_inputs: Float[Array, "batch suffix_tokens channels"]
+    pre_attention_norm: Float[Array, "batch suffix_tokens channels"]
+    attention: Float[Array, "batch suffix_tokens channels"]
+    post_attention_norm: Float[Array, "batch suffix_tokens channels"] | None
+    pre_mlp_norm: Float[Array, "batch suffix_tokens channels"]
+    mlp: Float[Array, "batch suffix_tokens channels"]
+    post_mlp_norm: Float[Array, "batch suffix_tokens channels"] | None
 
     def export(self) -> ParameterTree:
         result = dict(
@@ -171,7 +178,7 @@ class DecoderLayer(LalamoModule[DecoderLayerConfig]):
     attention: Attention
     post_attention_norm: RMSNorm | None
     pre_mlp_norm: RMSNorm
-    mlp:
+    mlp: MLPBase
     post_mlp_norm: RMSNorm | None
 
     @property
@@ -201,44 +208,50 @@ class DecoderLayer(LalamoModule[DecoderLayerConfig]):
             )
         if self.mlp.model_dim != model_dim:
             raise ValueError(
-                f"MLP up projection dim {self.mlp.
+                f"MLP up projection dim {self.mlp.model_dim} does not match"
                 f" the first normalization layer dim {model_dim}",
             )
-        if self.mlp.hidden_dim != self.mlp.down_projection.input_dim:
-            raise ValueError(
-                f"MLP down projection dim {self.mlp.down_projection.input_dim} does not match"
-                f" the up projection dim {self.mlp.hidden_dim}",
-            )
 
     @eqx.filter_jit
     def __call__(
         self,
-        inputs: Float[Array, "suffix_tokens channels"],
+        inputs: Float[Array, "batch suffix_tokens channels"],
         positional_embeddings: PositionalEmbeddings,
         kv_cache: KVCacheLayer | None = None,
        return_updated_kv_cache: bool = False,
         return_activation_trace: bool = False,
-
+        lengths_without_padding: Int[Array, " batch"] | None = None,
+        forward_pass_mode: ForwardPassMode = ForwardPassMode.MULTI_TOKEN,
+        forward_pass_config: DecoderLayerForwardPassConfig | None = None,
     ) -> DecoderLayerResult:
-
-
+        if inputs.ndim != 3:
+            raise ValueError(
+                f"Inputs to decoder layers must be a 3D arrays of size (batch_size, sequence_length, hidden_dim),"
+                f" got {inputs.shape}",
+            )
+        normalized_attention_inputs = vmap_twice(self.pre_attention_norm)(inputs)
+        batched_attention_fn = vmap(partial(self.attention, return_updated_kv_cache=return_updated_kv_cache))
+        attention_outputs, updated_kv_cache = batched_attention_fn(
            normalized_attention_inputs,
             positional_embeddings,
             kv_cache=kv_cache,
-
-            length_without_padding=length_without_padding,
+            length_without_padding=lengths_without_padding,
         )
         if self.post_attention_norm is not None:
-            normalized_attention_outputs =
+            normalized_attention_outputs = vmap_twice(self.post_attention_norm)(attention_outputs)
             mlp_inputs = inputs + normalized_attention_outputs
         else:
             normalized_attention_outputs = None
             mlp_inputs = inputs + attention_outputs
 
-        normalized_mlp_inputs =
-        mlp_outputs =
+        normalized_mlp_inputs = vmap_twice(self.pre_mlp_norm)(mlp_inputs)
+        mlp_outputs = self.mlp(
+            normalized_mlp_inputs,
+            forward_pass_mode=forward_pass_mode,
+            forward_pass_config=forward_pass_config,
+        )
         if self.post_mlp_norm is not None:
-            normalized_mlp_outputs =
+            normalized_mlp_outputs = vmap_twice(self.post_mlp_norm)(mlp_outputs)
             outputs = mlp_inputs + normalized_mlp_outputs
         else:
             normalized_mlp_outputs = None
@@ -266,26 +279,28 @@ class DecoderLayer(LalamoModule[DecoderLayerConfig]):
             activation_trace=activation_trace,
         )
 
-    def init_static_kv_cache(self, capacity: int) -> StaticKVCacheLayer:
-        return
+    def init_static_kv_cache(self, batch_size: int, capacity: int) -> StaticKVCacheLayer:
+        return jax.tree.map(
+            lambda array: jnp.repeat(array[None, ...], batch_size, axis=0),
+            self.attention.init_static_kv_cache(capacity),
+        )
 
-    def export_weights(self
+    def export_weights(self) -> ParameterTree:
         result = dict(
-            pre_attention_norm=self.pre_attention_norm.export_weights(
-            attention=self.attention.export_weights(
-            pre_mlp_norm=self.pre_mlp_norm.export_weights(
-            mlp=self.mlp.export_weights(
+            pre_attention_norm=self.pre_attention_norm.export_weights(),
+            attention=self.attention.export_weights(),
+            pre_mlp_norm=self.pre_mlp_norm.export_weights(),
+            mlp=self.mlp.export_weights(),
         )
         if self.post_attention_norm is not None:
-            result["post_attention_norm"] = self.post_attention_norm.export_weights(
+            result["post_attention_norm"] = self.post_attention_norm.export_weights()
         if self.post_mlp_norm is not None:
-            result["post_mlp_norm"] = self.post_mlp_norm.export_weights(
+            result["post_mlp_norm"] = self.post_mlp_norm.export_weights()
         return result
 
     def import_weights(
         self,
         weights: ParameterTree[Array],
-        weight_layout: WeightLayout = WeightLayout.AUTO,
     ) -> Self:
         assert isinstance(weights, Mapping)
         assert isinstance(weights["pre_attention_norm"], Mapping)
@@ -297,21 +312,20 @@ class DecoderLayer(LalamoModule[DecoderLayerConfig]):
             assert isinstance(weights["post_attention_norm"], Mapping)
             post_attention_norm = self.post_attention_norm.import_weights(
                 weights["post_attention_norm"],
-                weight_layout,
             )
         else:
             post_attention_norm = None
         if self.post_mlp_norm is not None:
             assert isinstance(weights["post_mlp_norm"], Mapping)
-            post_mlp_norm = self.post_mlp_norm.import_weights(weights["post_mlp_norm"]
+            post_mlp_norm = self.post_mlp_norm.import_weights(weights["post_mlp_norm"])
         else:
             post_mlp_norm = None
         return replace(
             self,
-            pre_attention_norm=self.pre_attention_norm.import_weights(weights["pre_attention_norm"]
-            attention=self.attention.import_weights(weights["attention"]
+            pre_attention_norm=self.pre_attention_norm.import_weights(weights["pre_attention_norm"]),
+            attention=self.attention.import_weights(weights["attention"]),
             post_attention_norm=post_attention_norm,
-            pre_mlp_norm=self.pre_mlp_norm.import_weights(weights["pre_mlp_norm"]
-            mlp=self.mlp.import_weights(weights["mlp"]
+            pre_mlp_norm=self.pre_mlp_norm.import_weights(weights["pre_mlp_norm"]),
+            mlp=self.mlp.import_weights(weights["mlp"]),
             post_mlp_norm=post_mlp_norm,
         )
lalamo/modules/embedding.py
CHANGED
@@ -13,9 +13,6 @@ from lalamo.quantization import QuantizationMode, dynamically_quantize_activatio
 
 from .common import (
     LalamoModule,
-    WeightLayout,
-    from_layout,
-    into_layout,
     register_config_union,
 )
 from .utils import apply_soft_capping
@@ -35,7 +32,7 @@ __all__ = [
 @dataclass(frozen=True)
 class EmbeddingConfigBase:
     input_scale: float | None
-
+    logit_soft_cap: float | None
 
     @abstractmethod
     def random_init(
@@ -79,8 +76,8 @@ class EmbeddingBase[ConfigT: EmbeddingConfigBase](LalamoModule[ConfigT]):
     @eqx.filter_jit
     def readout(self, x: Float[Array, " channels"]) -> Float[Array, " vocabulary"]:
         logits = self._prepare_output_weights() @ x
-        if self.config.
-            logits = apply_soft_capping(logits, self.config.
+        if self.config.logit_soft_cap is not None:
+            logits = apply_soft_capping(logits, self.config.logit_soft_cap)
         return logits
 
 
@@ -136,13 +133,12 @@ class TiedEmbedding(EmbeddingBase[TiedEmbeddingConfig]):
     def _prepare_output_weights(self) -> Float[Array, "vocabulary channels"]:
         return self.weights
 
-    def export_weights(self
+    def export_weights(self) -> ParameterTree:
         return {"weights": self.weights}
 
     def import_weights(
         self,
         weights: ParameterTree[Array],
-        weight_layout: WeightLayout = WeightLayout.AUTO,  # noqa: ARG002
     ) -> Self:
         assert isinstance(weights, Mapping)
         return replace(self, weights=weights["weights"])
@@ -184,7 +180,7 @@ class UntiedEmbeddingConfig(EmbeddingConfigBase):
 
 class UntiedEmbedding(EmbeddingBase[UntiedEmbeddingConfig]):
     input_weights: Float[Array, "vocabulary channels"]
-    output_weights: Float[Array, "vocabulary
+    output_weights: Float[Array, "channels vocabulary"]
 
     @property
     def activation_precision(self) -> DTypeLike:
@@ -228,22 +224,21 @@ class UntiedEmbedding(EmbeddingBase[UntiedEmbeddingConfig]):
     def _prepare_output_weights(self) -> Float[Array, "vocabulary channels"]:
         return self.output_weights
 
-    def export_weights(self
+    def export_weights(self) -> ParameterTree:
         return {
             "input_weights": self.input_weights,
-            "output_weights":
+            "output_weights": self.output_weights,
         }
 
     def import_weights(
         self,
         weights: ParameterTree[Array],
-        weight_layout: WeightLayout = WeightLayout.AUTO,
     ) -> Self:
         assert isinstance(weights, Mapping)
         return replace(
             self,
             input_weights=weights["input_weights"],
-            output_weights=
+            output_weights=weights["output_weights"],
         )
 
 
@@ -339,23 +334,22 @@ class QuantizedTiedEmbedding(EmbeddingBase[QuantizedTiedEmbeddingConfig]):
         x = dynamically_quantize_activations(x, self.config.activation_quantization_mode)
         return super().readout(x)
 
-    def export_weights(self
+    def export_weights(self) -> ParameterTree:
         return {
-            "weights":
-            "scales":
+            "weights": self.int_weights,
+            "scales": self.scales,
         }
 
     def import_weights(
         self,
         weights: ParameterTree[Array],
-        weight_layout: WeightLayout = WeightLayout.AUTO,
     ) -> Self:
         assert isinstance(weights, Mapping)
         assert isinstance(weights["weights"], Array)
         return replace(
             self,
-            weights=
-            scales=
+            weights=weights["weights"].astype(self.weights.dtype),
+            scales=weights["scales"],
        )
 
 
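
The embedding config now exposes a `logit_soft_cap` field, and `readout` routes logits through `apply_soft_capping` when the cap is set. The helper itself lives in `lalamo/modules/utils.py` and is not shown in this diff; the sketch below uses the standard tanh soft-capping formula (as popularized by Gemma-style models) purely for illustration, not as lalamo's exact code.

```python
import jax.numpy as jnp


def apply_soft_capping(logits, cap):
    # Assumed formula: squash logits smoothly into the interval (-cap, cap).
    return cap * jnp.tanh(logits / cap)


logits = jnp.array([-100.0, 0.0, 3.0, 100.0])
print(apply_soft_capping(logits, cap=30.0))  # approximately [-30, 0, 3, 30]
```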
lalamo/modules/kv_cache.py
CHANGED
@@ -13,13 +13,14 @@ __all__ = ["DynamicKVCacheLayer", "KVCache", "KVCacheLayer", "StaticKVCacheLayer
 
 
 class KVCacheLayer(eqx.Module):
-
-
+    has_sinks: bool = eqx.field(static=True)
+    keys: Float[Array, "*batch tokens groups head_channels"]
+    values: Float[Array, "*batch tokens groups head_channels"]
 
     def __post_init__(self) -> None:
-        if self.keys.ndim
+        if self.keys.ndim not in (3, 4):
             raise ValueError(
-                f"Key and value buffers must have 3 dimensions: capacity, groups, head_channels,"
+                f"Key and value buffers must have 3 or 4 dimensions: [batch], capacity, groups, head_channels,"
                 f" got shape {self.keys.shape}",
             )
         if self.keys.shape != self.values.shape:
@@ -27,11 +28,18 @@ class KVCacheLayer(eqx.Module):
         if self.keys.dtype != self.values.dtype:
             raise ValueError("Keys and values buffers must have the same dtype")
 
+    def _raise_if_batched(self) -> None:
+        if self.keys.ndim != 3:
+            raise ValueError(
+                "Attempted to call a method on a batched version of KVCacheLayer. Use vmap instead.",
+            )
+
     @abstractmethod
     def attention_mask(
         self,
         suffix_length: int,
         is_causal: bool,
+        suffix_length_without_padding: Int[Array, ""] | int | None = None,
         sliding_window_size: int | None = None,
     ) -> Bool[Array, "suffix_tokens tokens"]: ...
 
@@ -68,29 +76,42 @@ class DynamicKVCacheLayer(KVCacheLayer):
     @classmethod
     def init(
         cls,
+        has_sinks: bool,
         keys: Float[Array, "tokens groups head_channels"],
         values: Float[Array, "tokens groups head_channels"],
         length: Int[Array, ""] | int | None = None,
     ) -> "DynamicKVCacheLayer":
-        num_tokens,
+        num_tokens, num_groups, head_dim = keys.shape
         if length is None:
             padding_mask = None
         else:
-
-
+            token_indices = jnp.arange(num_tokens, dtype=jnp.int32)
+            padding_mask = token_indices < length
+        if has_sinks:
+            sinks = jnp.zeros((1, num_groups, head_dim), dtype=keys.dtype)
+            keys = jnp.concatenate([sinks, keys], axis=0)
+            values = jnp.concatenate([sinks, values], axis=0)
+            if padding_mask is not None:
+                true = jnp.ones((1,), dtype=jnp.bool)
+                padding_mask = jnp.concatenate([true, padding_mask], axis=0)
+        return cls(has_sinks, keys, values, padding_mask)
 
     def attention_mask(
         self,
         suffix_length: int,
         is_causal: bool,
+        suffix_length_without_padding: Int[Array, ""] | int | None = None,  # noqa: ARG002
         sliding_window_size: int | None = None,
     ) -> Bool[Array, "suffix_tokens tokens"]:
+        self._raise_if_batched()
         total_num_tokens, _, _ = self.keys.shape
         result = jnp.ones((suffix_length, total_num_tokens), dtype=jnp.bool)
         if is_causal:
             result = jnp.tril(result, k=total_num_tokens - suffix_length)
         if sliding_window_size is not None:
             result = jnp.triu(result, k=1 - sliding_window_size)
+        if self.has_sinks:
+            result = result.at[:, 0].set(True)
         if self.padding_mask is not None:
             result = result & self.padding_mask[None, :]
         return result
@@ -101,12 +122,13 @@ class DynamicKVCacheLayer(KVCacheLayer):
         added_values: Float[Array, "new_tokens groups head_channels"],
         added_length: Int[Array, ""] | int | None = None,
     ) -> "DynamicKVCacheLayer":
+        self._raise_if_batched()
         updated_keys = jnp.concatenate([self.keys, added_keys], axis=0)
         updated_values = jnp.concatenate([self.values, added_values], axis=0)
 
         added_padded_length, _, _ = added_keys.shape
         if self.padding_mask is None and added_length is None:
-            return DynamicKVCacheLayer(updated_keys, updated_values)
+            return DynamicKVCacheLayer(self.has_sinks, updated_keys, updated_values)
         if added_length is None:
             added_length = added_padded_length
 
@@ -118,20 +140,24 @@ class DynamicKVCacheLayer(KVCacheLayer):
 
         added_padding_mask = jnp.arange(added_padded_length, dtype=jnp.int32) < added_length
         updated_padding_mask = jnp.concatenate([old_padding_mask, added_padding_mask], axis=0)
-        return DynamicKVCacheLayer(updated_keys, updated_values, updated_padding_mask)
+        return DynamicKVCacheLayer(self.has_sinks, updated_keys, updated_values, updated_padding_mask)
 
 
 class StaticKVCacheLayer(KVCacheLayer):
-    current_length: Int[Array, ""]
+    current_length: Int[Array, "*batch"]
 
     def attention_mask(
         self,
         suffix_length: int,
         is_causal: bool,
+        suffix_length_without_padding: Int[Array, ""] | int | None = None,
         sliding_window_size: int | None = None,
     ) -> Bool[Array, "suffix_tokens tokens"]:
+        self._raise_if_batched()
+        if suffix_length_without_padding is None:
+            suffix_length_without_padding = suffix_length
         if is_causal:
-            query_offsets = jnp.arange(
+            query_offsets = jnp.arange(0, suffix_length, dtype=jnp.int32) - suffix_length_without_padding
         else:
             query_offsets = jnp.zeros(suffix_length, dtype=jnp.int32)
 
@@ -142,15 +168,19 @@ class StaticKVCacheLayer(KVCacheLayer):
         if sliding_window_size is not None:
             swa_mask = query_indices[:, None] < (key_indices[None, :] + sliding_window_size)
             result = result & swa_mask
+        if self.has_sinks:
+            result = result.at[:, 0].set(True)
 
         return result
 
     @property
     def padding_mask(self) -> Bool[Array, " tokens"] | None:
+        self._raise_if_batched()
         return jnp.arange(self.capacity, dtype=jnp.int32) < self.current_length
 
     @property
     def capacity(self) -> int:
+        self._raise_if_batched()
         result, _, _ = self.keys.shape
         return result
 
@@ -160,6 +190,7 @@ class StaticKVCacheLayer(KVCacheLayer):
         added_values: Float[Array, "tokens groups head_channels"],
         added_length: Int[Array, ""] | int | None = None,
     ) -> "StaticKVCacheLayer":
+        self._raise_if_batched()
         if added_keys.shape != added_values.shape:
             raise ValueError("Keys and values must have the same shape")
         num_added_tokens, new_num_groups, new_head_dim = added_keys.shape
@@ -185,12 +216,18 @@ class StaticKVCacheLayer(KVCacheLayer):
             allow_negative_indices=False,
         )
         updated_sequence_length = self.current_length + added_length
-        return StaticKVCacheLayer(
+        return StaticKVCacheLayer(
+            has_sinks=self.has_sinks,
+            keys=updated_keys,
+            values=updated_values,
+            current_length=updated_sequence_length,
+        )
 
     @classmethod
-    def empty(cls, capacity: int, num_groups: int, head_dim: int, dtype: DTypeLike) -> Self:
+    def empty(cls, has_sinks: bool, capacity: int, num_groups: int, head_dim: int, dtype: DTypeLike) -> Self:
         return cls(
-
-
-
+            has_sinks=has_sinks,
+            keys=jnp.zeros((capacity, num_groups, head_dim), dtype=dtype),
+            values=jnp.zeros((capacity, num_groups, head_dim), dtype=dtype),
+            current_length=jnp.array(has_sinks, dtype=jnp.int32),
         )