optimum-rbln 0.7.3a2__py3-none-any.whl → 0.7.3a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,5 +17,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '0.7.3a2'
- __version_tuple__ = version_tuple = (0, 7, 3)
+ __version__ = version = '0.7.3a4'
+ __version_tuple__ = version_tuple = (0, 7, 3, 'a4')
optimum/rbln/modeling.py CHANGED
@@ -134,6 +134,9 @@ class RBLNModel(RBLNBaseModel):
  for preprocessor in preprocessors:
  preprocessor.save_pretrained(save_dir_path / subfolder)

+ # ad-hoc
+ rbln_kwargs["n_model_params"] = sum(p.numel() for p in model.parameters())
+
  # Get compilation arguments (e.g. input_info)
  rbln_config: RBLNConfig = cls.get_rbln_config(
  preprocessors=preprocessors, model_config=config, rbln_kwargs=rbln_kwargs
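
The `n_model_params` value recorded above is simply the total number of parameter elements in the PyTorch model. A minimal sketch of what that expression computes, using a hypothetical toy module (not part of the package):

    import torch

    toy = torch.nn.Linear(4, 3)  # weight is 3x4 (12 elements) plus a bias of 3
    n_model_params = sum(p.numel() for p in toy.parameters())
    print(n_model_params)  # 15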
@@ -282,6 +282,15 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
  **kwargs,
  )

+ @classmethod
+ def _check_compiled_models(
+ cls, compiled_models: Dict[str, rebel.RBLNCompiledModel], rbln_config: RBLNConfig, config: "PretrainedConfig"
+ ):
+ # check compiled model can create runtimes.
+ # this logic currently only works in LLM
+ # fail when LLM model using Paged Attention can't guarantee max sequence length
+ pass
+
  @classmethod
  def _from_compiled_models(
  cls,
@@ -295,6 +304,9 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
  ):
  if isinstance(model_save_dir, str):
  model_save_dir = Path(model_save_dir)
+
+ cls._check_compiled_models(compiled_models=rbln_compiled_models, rbln_config=rbln_config, config=config)
+
  # FIXME:: Should we convert it?
  compiled_model_names = [cfg.compiled_model_name for cfg in rbln_config.compile_cfgs]
  rbln_compiled_models = [rbln_compiled_models[cm_name] for cm_name in compiled_model_names]
optimum/rbln/ops/attn.py CHANGED
@@ -28,7 +28,7 @@ else:
  def register_rbln_custom_paged_attention():
  torch.library.define(
  "rbln_custom_ops::paged_attn_decode",
- "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor[]",
+ "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor",
  )

  @torch.library.impl("rbln_custom_ops::paged_attn_decode", "cpu")
@@ -57,28 +57,17 @@ def register_rbln_custom_paged_attention():
  - block_size: [] - Number of tokens per block

  Returns:
- Tuple[Tensor, Tensor, Tensor]:
- - attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
- - kcache: Same shape as input kcache, batch=1 - Placeholder for compiler
- - vcache: Same shape as input vcache, batch=1 - Placeholder for compiler
+ Tensor: attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
  """
- return (
- q,
- torch.empty(*kcache.shape, device=kcache.device),
- torch.empty(*vcache.shape, device=vcache.device),
- )
+ return q

  @register_fake("rbln_custom_ops::paged_attn_decode")
  def attn_decode_abstract(q, k, v, m, kcache, vcache, seq, scale, block_table, block_size):
- return (
- q,
- torch.empty(*kcache.shape, device=kcache.device),
- torch.empty(*vcache.shape, device=vcache.device),
- )
+ return q

  torch.library.define(
  "rbln_custom_ops::paged_attn_prefill",
- "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor[]",
+ "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor",
  )

  @torch.library.impl("rbln_custom_ops::paged_attn_prefill", "cpu")
@@ -105,23 +94,20 @@ def register_rbln_custom_paged_attention():
  - block_size: [] - Number of tokens per block

  Returns:
- Tuple[Tensor, Tensor, Tensor]:
- - attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
- - empty_kcache: Same shape as input kcache - Placeholder for compiler
- - empty_vcache: Same shape as input vcache - Placeholder for compiler
+ Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
  """
- return q, kcache, vcache
+ return q

  @register_fake("rbln_custom_ops::paged_attn_prefill")
  def attn_prefill_abstract(q, k, v, m, kcache, vcache, seq, scale, block_table, block_size):
- return q, kcache, vcache
+ return q


  @lru_cache
  def register_rbln_custom_paged_causal_attention():
  torch.library.define(
  "rbln_custom_ops::paged_causal_attn_decode",
- "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor[]",
+ "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor",
  )

  @torch.library.impl("rbln_custom_ops::paged_causal_attn_decode", "cpu")
@@ -149,28 +135,17 @@ def register_rbln_custom_paged_causal_attention():
  - block_size: [] - Number of tokens per block

  Returns:
- Tuple[Tensor, Tensor, Tensor]:
- - attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
- - kcache: Same shape as input kcache, batch=1 - Placeholder for compiler
- - vcache: Same shape as input vcache, batch=1 - Placeholder for compiler
+ Tensor: attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
  """
- return (
- q,
- torch.empty(*kcache.shape, device=kcache.device),
- torch.empty(*vcache.shape, device=vcache.device),
- )
+ return q

  @register_fake("rbln_custom_ops::paged_causal_attn_decode")
  def attn_decode_abstract(q, k, v, kcache, vcache, seq, scale, block_table, block_size):
- return (
- q,
- torch.empty(*kcache.shape, device=kcache.device),
- torch.empty(*vcache.shape, device=vcache.device),
- )
+ return q

  torch.library.define(
  "rbln_custom_ops::paged_causal_attn_prefill",
- "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor[]",
+ "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor",
  )

  @torch.library.impl("rbln_custom_ops::paged_causal_attn_prefill", "cpu")
@@ -197,23 +172,20 @@ def register_rbln_custom_paged_causal_attention():
  - block_size: [] - Number of tokens per block

  Returns:
- Tuple[Tensor, Tensor, Tensor]:
- - attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
- - empty_kcache: Same shape as input kcache - Placeholder for compiler
- - empty_vcache: Same shape as input vcache - Placeholder for compiler
+ Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
  """
- return q, kcache, vcache
+ return q

  @register_fake("rbln_custom_ops::paged_causal_attn_prefill")
  def attn_prefill_abstract(q, k, v, kcache, vcache, seq, scale, block_table, block_size):
- return q, kcache, vcache
+ return q


  @lru_cache
  def register_rbln_custom_add_softmax_attention():
  torch.library.define(
  "rbln_custom_ops::add_softmax_attn_decode",
- "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d) -> Tensor[]",
+ "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d) -> Tensor",
  )

  @torch.library.impl("rbln_custom_ops::add_softmax_attn_decode", "cpu")
@@ -240,21 +212,10 @@ def register_rbln_custom_add_softmax_attention():
  - scale: [] - Attention scale factor

  Returns:
- Tuple[Tensor, Tensor, Tensor]:
- - attn_output: [batch=1, n_heads, 1, 1, head_dim] - Attention output
- - kcache: Same shape as input kcache, batch=1 - Placeholder for compiler
- - vcache: Same shape as input vcache, batch=1 - Placeholder for compiler
+ Tensor: attn_output: [batch=1, n_heads, 1, 1, head_dim] - Attention output
  """
- return (
- q,
- torch.empty(*kcache.shape, device=kcache.device),
- torch.empty(*vcache.shape, device=vcache.device),
- )
+ return q

  @register_fake("rbln_custom_ops::add_softmax_attn_decode")
  def add_softmax_attn_decode_abstract(q, k, v, m, kcache, vcache, seq, partition):
- return (
- q,
- torch.empty(*kcache.shape, device=kcache.device),
- torch.empty(*vcache.shape, device=vcache.device),
- )
+ return q
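
All of the attn.py schemas above change from "-> Tensor[]" to "-> Tensor", and the CPU and fake implementations now return only q instead of a (q, kcache, vcache) tuple. A self-contained sketch of that single-output registration pattern, assuming PyTorch 2.4+ for torch.library.register_fake and using a hypothetical demo_ops namespace rather than the real rbln_custom_ops:

    import torch
    from torch.library import register_fake

    torch.library.define(
        "demo_ops::attn_decode",
        "(Tensor q, Tensor k, Tensor v, Tensor kcache, Tensor vcache) -> Tensor",
    )

    @torch.library.impl("demo_ops::attn_decode", "cpu")
    def attn_decode_cpu(q, k, v, kcache, vcache):
        # Placeholder CPU body, mirroring the package's stubs; the real kernel is supplied by the compiler.
        return q.clone()

    @register_fake("demo_ops::attn_decode")
    def attn_decode_fake(q, k, v, kcache, vcache):
        # The fake (meta) implementation only propagates shape/dtype for tracing; no cache placeholders are returned.
        return torch.empty_like(q)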
@@ -28,71 +28,55 @@ else:
  def register_rbln_custom_paged_flash_attention():
  torch.library.define(
  "rbln_custom_ops::paged_flash_attn_decode",
- "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor[]",
+ "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor",
  )

  @torch.library.impl("rbln_custom_ops::paged_flash_attn_decode", "cpu")
  def flash_attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size, partition):
- return (
- q,
- torch.empty(*kcache.shape, device=kcache.device),
- torch.empty(*vcache.shape, device=vcache.device),
- )
+ return q

  @register_fake("rbln_custom_ops::paged_flash_attn_decode")
  def flash_attn_decode_abstract(q, k, v, m, kcache, vcache, seq, scale, block_table, block_size, partition):
- return (
- q,
- torch.empty(*kcache.shape, device=kcache.device),
- torch.empty(*vcache.shape, device=vcache.device),
- )
+ return q

  torch.library.define(
  "rbln_custom_ops::paged_flash_attn_prefill",
- "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor[]",
+ "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor",
  )

  @torch.library.impl("rbln_custom_ops::flash_attn_prefill", "cpu")
  def flash_attn_prefill_cpu(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size, partition):
- return q, kcache, vcache
+ return q

  @register_fake("rbln_custom_ops::paged_flash_attn_prefill")
  def flash_attn_prefill_abstract(q, k, v, m, kcache, vcache, seq, scale, block_table, block_size, partition):
- return q, kcache, vcache
+ return q


  @lru_cache
  def register_rbln_custom_paged_flash_causal_attention():
  torch.library.define(
  "rbln_custom_ops::paged_flash_causal_attn_decode",
- "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor[]",
+ "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor",
  )

  @torch.library.impl("rbln_custom_ops::paged_flash_causal_attn_decode", "cpu")
  def flash_attn_decode_cpu(q, k, v, kcache, vcache, seq, scale, block_table, block_size, partition):
- return (
- q,
- torch.empty(*kcache.shape, device=kcache.device),
- torch.empty(*vcache.shape, device=vcache.device),
- )
+ return q

  @register_fake("rbln_custom_ops::paged_flash_causal_attn_decode")
  def flash_attn_decode_abstract(q, k, v, kcache, vcache, seq, scale, block_table, block_size, partition):
- return (
- q,
- torch.empty(*kcache.shape, device=kcache.device),
- torch.empty(*vcache.shape, device=vcache.device),
- )
+ return q

  torch.library.define(
  "rbln_custom_ops::paged_flash_causal_attn_prefill",
- "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor[]",
+ "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor",
  )

  @torch.library.impl("rbln_custom_ops::paged_flash_causal_attn_prefill", "cpu")
  def flash_attn_prefill_cpu(q, k, v, kcache, vcache, seq, scale, block_table, block_size, partition):
- return q, kcache, vcache
+ return q

  @register_fake("rbln_custom_ops::paged_flash_causal_attn_prefill")
  def flash_attn_prefill_abstract(q, k, v, kcache, vcache, seq, scale, block_table, block_size, partition):
- return q, kcache, vcache
+ return q
@@ -45,10 +45,10 @@ def register_rbln_custom_cache_update():

  # Update the specified portion of the cache tensor with the state tensor, using `slice_scatter`.
  # This operation modifies the cache tensor in-place directly on the device, avoiding any unnecessary transfers between host and device.
- updated_cache = cache.slice_scatter(state, dim=axis, start=s, end=e)
+ cache.slice_scatter(state, dim=axis, start=s, end=e)

- # Return the updated cache tensor.
- return updated_cache
+ # 'rbln_cache_update' is an in-place operation that isn't tracked in JIT trace, so a dummy output was added to the return value.
+ return torch.empty([256])

  # Register a "fake" implementation of the "rbln_cache_update" operation.
  # This serves as an abstract definition for the RBLN compiler to recognize the operation and generate an optimized implementation.
@@ -57,4 +57,4 @@ def register_rbln_custom_cache_update():
  # Return a tensor with the same shape as the input cache tensor.
  # This is a placeholder for the abstract implementation and does not perform any actual computation.
  # Like the actual implementation, the abstraction assumes in-place device-side updates.
- return torch.empty_like(cache)
+ return torch.empty([256])
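
For context, Tensor.slice_scatter in plain PyTorch is out-of-place: it returns a new tensor with the state written into the selected slice and leaves the input unchanged. A small, standalone illustration of that behavior (independent of the RBLN op, which the comments above document as in-place on device):

    import torch

    cache = torch.zeros(2, 8)
    state = torch.ones(2, 3)

    updated = cache.slice_scatter(state, dim=1, start=2, end=5)
    print(updated[0])  # tensor([0., 0., 1., 1., 1., 0., 0., 0.])
    print(cache[0])    # original cache is untouched: all zeros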
@@ -281,7 +281,7 @@ class DecoderOnlyWrapper(nn.Module):
  _past_key_values.append(past_key_value)
  past_key_values = _past_key_values

- logit, present_key_values = self.causal_lm(
+ logit = self.causal_lm(
  input_ids=input_ids,
  inputs_embeds=inputs_embeds,
  attention_mask=attention_mask,
@@ -292,15 +292,7 @@ class DecoderOnlyWrapper(nn.Module):
  block_tables=block_tables,
  )

- # ((key, value)) * n_layer -> [key, value] * n_layer
- _present_key_values = ()
- for i in range(self.num_hidden_layers):
- key_states = present_key_values[i][0]
- value_states = present_key_values[i][1]
- _present_key_values = _present_key_values + (key_states, value_states)
- present_key_values = _present_key_values
-
- return logit, present_key_values
+ return logit


  class DecoderOnlyForCausalLM(nn.Module):
@@ -353,7 +345,7 @@ class DecoderOnlyForCausalLM(nn.Module):
  block_tables: Optional[torch.Tensor] = None,
  ):
  # outputs
- hidden_states, present_key_values = self.model(
+ hidden_states = self.model(
  input_ids=input_ids,
  inputs_embeds=inputs_embeds,
  attention_mask=attention_mask,
@@ -367,8 +359,7 @@ class DecoderOnlyForCausalLM(nn.Module):
  hidden_states = hidden_states[:, query_position.to(torch.int).unsqueeze(0)]

  logits = self._original_mod.lm_head(hidden_states)
- output = (logits, present_key_values)
- return output
+ return logits


  class DecoderOnlyModel(nn.Module):
@@ -484,20 +475,19 @@ class DecoderOnlyModel(nn.Module):
  else:
  seq_positions = cache_position[:, :1]

- present_key_values = past_key_values
  for layer in self.layers:
- hidden_states, present_key_values = layer(
+ hidden_states = layer(
  hidden_states=hidden_states,
  attention_mask=attention_mask,
  seq_positions=seq_positions,
- past_key_values=present_key_values,
+ past_key_values=past_key_values,
  cos=cos,
  sin=sin,
  block_tables=block_tables,
  )

  hidden_states = self.get_last_layernorm()(hidden_states)
- return hidden_states, present_key_values
+ return hidden_states


  class DecoderOnlyLayer(nn.Module):
@@ -559,7 +549,7 @@ class DecoderOnlyLayer(nn.Module):
  residual = hidden_states
  hidden_states = self.get_pre_attention_layernorm()(hidden_states)

- hidden_states, present_key_values = self.self_attn(
+ hidden_states = self.self_attn(
  hidden_states=hidden_states,
  attention_mask=attention_mask,
  seq_positions=seq_positions,
@@ -576,7 +566,7 @@ class DecoderOnlyLayer(nn.Module):
  hidden_states = self._original_mod.mlp(hidden_states)
  hidden_states = residual + hidden_states

- return hidden_states, present_key_values
+ return hidden_states


  class DecoderOnlyAttention(nn.Module):
@@ -678,7 +668,7 @@ class DecoderOnlyAttention(nn.Module):
  if batch_size > 1 and self.phase == "prefill":
  raise NotImplementedError(f"batch size should be 1 if prefill phase, but got {batch_size}.")

- attn_output, key_state, value_state = self.attention(
+ attn_output = self.attention(
  query_states,
  key_states,
  value_states,
@@ -690,12 +680,9 @@ class DecoderOnlyAttention(nn.Module):
  block_tables=block_tables,
  block_size=self.kvcache_block_size,
  )
- key_states = key_state
- value_states = value_state

  attn_outputs = self.o_proj(attn_output)
- past_key_values[self.layer_idx] = key_states, value_states
- return attn_outputs, past_key_values
+ return attn_outputs


  class AttentionOp(nn.Module):
@@ -733,7 +720,7 @@ class AttentionOp(nn.Module):
  scale: Scale applied to attn weights

  Returns:
- Tuple of (attention_output, key_state, value_state)
+ Tensor: attention_output: [batch, num_heads, seq_len, head_dim]
  """
  # reshape for removing repeat_kv (batch=1 , num_head, 1, q_len=1, head_dim)
  key_state = key_state.unsqueeze(2) # 1, 32, 1, 128, 128
@@ -756,7 +743,7 @@ class AttentionOp(nn.Module):

  if self.phase == "decode":
  if self.use_attention_mask:
- attn_output, key_state, value_state = torch.ops.rbln_custom_ops.paged_attn_decode(
+ attn_output = torch.ops.rbln_custom_ops.paged_attn_decode(
  query_state,
  key_state,
  value_state,
@@ -769,7 +756,7 @@ class AttentionOp(nn.Module):
  block_size,
  )
  else:
- attn_output, key_state, value_state = torch.ops.rbln_custom_ops.paged_causal_attn_decode(
+ attn_output = torch.ops.rbln_custom_ops.paged_causal_attn_decode(
  query_state,
  key_state,
  value_state,
@@ -783,7 +770,7 @@ class AttentionOp(nn.Module):

  else:
  if self.use_attention_mask:
- attn_output, key_state, value_state = torch.ops.rbln_custom_ops.paged_attn_prefill(
+ attn_output = torch.ops.rbln_custom_ops.paged_attn_prefill(
  query_state,
  key_state,
  value_state,
@@ -796,7 +783,7 @@ class AttentionOp(nn.Module):
  block_size,
  )
  else:
- attn_output, key_state, value_state = torch.ops.rbln_custom_ops.paged_causal_attn_prefill(
+ attn_output = torch.ops.rbln_custom_ops.paged_causal_attn_prefill(
  query_state,
  key_state,
  value_state,
@@ -812,7 +799,7 @@ class AttentionOp(nn.Module):
  attn_output = attn_output.transpose(1, 2).contiguous()
  attn_output = attn_output.reshape(batch_size, -1, self.num_heads * self.head_dim)

- return attn_output, key_state.squeeze(2), value_state.squeeze(2)
+ return attn_output


  def slice_and_unsqueeze_cos_sin(cos, sin, cache_position, unsqueeze_dim=1):
@@ -947,7 +934,7 @@ class DecoderOnlyFlashAttention(DecoderOnlyAttention):
  if cos is not None and sin is not None:
  query_states, key_states = self.apply_rotary_pos_embed(query_states, key_states, cos, sin)

- attn_output, key_state, value_state = self.attention(
+ attn_output = self.attention(
  query_states,
  key_states,
  value_states,
@@ -959,13 +946,9 @@ class DecoderOnlyFlashAttention(DecoderOnlyAttention):
  block_tables=block_tables,
  kvcache_block_size=self.kvcache_block_size,
  )
- key_states = key_state
- value_states = value_state

  attn_outputs = self.o_proj(attn_output)
- past_key_values[self.layer_idx] = key_states, value_states
-
- return attn_outputs, past_key_values
+ return attn_outputs


  class FlashAttentionOp(AttentionOp):
@@ -1019,7 +1002,7 @@ class FlashAttentionOp(AttentionOp):

  if self.phase == "decode":
  if self.use_attention_mask:
- attn_output, key_state, value_state = torch.ops.rbln_custom_ops.paged_flash_attn_decode(
+ attn_output = torch.ops.rbln_custom_ops.paged_flash_attn_decode(
  query_state,
  key_state,
  value_state,
@@ -1033,7 +1016,7 @@ class FlashAttentionOp(AttentionOp):
  self.kvcache_partition_size,
  )
  else:
- attn_output, key_state, value_state = torch.ops.rbln_custom_ops.paged_flash_causal_attn_decode(
+ attn_output = torch.ops.rbln_custom_ops.paged_flash_causal_attn_decode(
  query_state,
  key_state,
  value_state,
@@ -1047,7 +1030,7 @@ class FlashAttentionOp(AttentionOp):
  )
  else:
  if self.use_attention_mask:
- attn_output, key_state, value_state = torch.ops.rbln_custom_ops.paged_flash_attn_prefill(
+ attn_output = torch.ops.rbln_custom_ops.paged_flash_attn_prefill(
  query_state,
  key_state,
  value_state,
@@ -1061,7 +1044,7 @@ class FlashAttentionOp(AttentionOp):
  self.kvcache_partition_size,
  )
  else:
- attn_output, key_state, value_state = torch.ops.rbln_custom_ops.paged_flash_causal_attn_prefill(
+ attn_output = torch.ops.rbln_custom_ops.paged_flash_causal_attn_prefill(
  query_state,
  key_state,
  value_state,
@@ -1079,4 +1062,4 @@ class FlashAttentionOp(AttentionOp):
  attn_output = attn_output.transpose(1, 2).contiguous()
  attn_output = attn_output.reshape(batch_size, -1, self.num_heads * self.head_dim)

- return attn_output, key_state, value_state
+ return attn_output
@@ -13,6 +13,7 @@
  # limitations under the License.

  import inspect
+ import math
  from collections import deque
  from dataclasses import dataclass
  from pathlib import Path
@@ -54,7 +55,6 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
  block_tables: torch.Tensor,
  free_block_pool: Deque,
  kvcache_block_size: int,
- kvcache_num_blocks: int,
  use_attention_mask: bool,
  attn_impl: str,
  **kwargs: Any,
@@ -72,7 +72,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
  self.free_block_pool = free_block_pool

  self.kvcache_block_size = kvcache_block_size
- self.empty_block = kvcache_num_blocks - 1
+ self.empty_block = -1
  self.attn_impl = attn_impl

  if self.phase == "prefill":
@@ -97,58 +97,61 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
  torch.Tensor: Updated block tables.
  """

- def update_block(batch_idx, block_idx):
+ NO_BLOCKS_ERROR = (
+ "No memory blocks are available for allocation."
+ "The generate() API cannot complete this inference task because Paged Attention is not fully supported by optimum-rbln."
+ "This is supported by vllm-rbln (see: https://docs.rbln.ai/software/model_serving/vllm_support/vllm-rbln.html)."
+ "Using vllm-rbln should fix this issue and enhance inference performance."
+ )
+
+ def update_block(batch_idx: int, block_idx: int):
  """
- Helper function to update the block table for a given batch index and block index.
  If the block is empty (empty_block), allocates a block from the free_block_pool.
-
- Args:
- batch_idx (int): Batch index.
- block_idx (int): Block index.
-
- Raises:
- RuntimeError: Raised if no available blocks are found in the free_block_pool.
  """
  if self.block_tables[batch_idx][block_idx] == self.empty_block:
  if self.free_block_pool:
  block = self.free_block_pool.popleft()
  self.block_tables[batch_idx][block_idx] = block
  else:
- raise RuntimeError("Not available blocks")
+ raise RuntimeError(NO_BLOCKS_ERROR)

- if self.attn_impl == "eager":
- if self.phase == "prefill":
- return self.block_tables[batch_idx]
+ def replace_empty_block(block_tables: torch.Tensor):
+ """
+ Replaces all occurrences of `self.empty_block` in `block_tables` with a dummy block from `self.free_block_pool`.
+ """
+ if not torch.any(block_tables == self.empty_block):
+ return block_tables.clone()
+ elif self.free_block_pool:
+ _free_block = self.free_block_pool[0]
+ return torch.where(block_tables == self.empty_block, _free_block, block_tables)
  else:
- return self.block_tables
- # Case for 'flash_attn' attention implementation
+ raise RuntimeError(NO_BLOCKS_ERROR)
+
+ if self.phase == "prefill":
+ # Track previously used blocks and return them to the free_block_pool and
+ # reset the current batch's block table to empty blocks
+ prev_blocks = self.block_tables[batch_idx][self.block_tables[batch_idx] != self.empty_block].tolist()
+ self.free_block_pool.extend(prev_blocks)
+ self.block_tables[batch_idx].fill_(self.empty_block)
+
+ # Get the start (s) and end (e) positions from cache_position and
+ # iterate over the cache positions to allocate necessary blocks
+ s, e = cache_position[0][0].item(), cache_position[0][-1].item()
+ for position in range(s, e + 1, self.kvcache_block_size):
+ block_idx = position // self.kvcache_block_size
+ if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
+ raise IndexError(f"Invalid index: batch_idx={batch_idx}, block_idx={block_idx}")
+ update_block(batch_idx, block_idx)
+
+ return replace_empty_block(self.block_tables[batch_idx])
+ # Case for 'decoder' phase, iterate over the cache positions to allocate necessary blocks
  else:
- if self.phase == "prefill":
- # Track previously used blocks and return them to the free_block_pool and
- # reset the current batch's block table to empty blocks
- prev_blocks = self.block_tables[batch_idx][self.block_tables[batch_idx] != self.empty_block].tolist()
- self.free_block_pool.extend(prev_blocks)
- self.block_tables[batch_idx].fill_(self.empty_block)
-
- # Get the start (s) and end (e) positions from cache_position and
- # iterate over the cache positions to allocate necessary blocks
- s, e = cache_position[0][0].item(), cache_position[0][-1].item()
- for position in range(s, e + 1, self.kvcache_block_size):
- block_idx = position // self.kvcache_block_size
- if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
- raise IndexError(f"Invalid index: batch_idx={batch_idx}, block_idx={block_idx}")
- update_block(batch_idx, block_idx)
-
- return self.block_tables[batch_idx]
-
- # Case for 'decoder' phase, iterate over the cache positions to allocate necessary blocks
- else:
- for b_idx in range(self.batch_size):
- position = cache_position[b_idx][0].item()
- block_idx = position // self.kvcache_block_size
- update_block(b_idx, block_idx)
+ for b_idx in range(self.batch_size):
+ position = cache_position[b_idx][0].item()
+ block_idx = position // self.kvcache_block_size
+ update_block(b_idx, block_idx)

- return self.block_tables
+ return replace_empty_block(self.block_tables)

  def forward(
  self,
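
The block-table bookkeeping above (empty slots marked with -1, a deque as the free pool, one block consumed per kvcache_block_size positions) reduces to a short pattern. A hypothetical, self-contained sketch of that allocation logic, with made-up sizes:

    from collections import deque

    import torch

    EMPTY = -1
    batch_size, blocks_per_seq, num_blocks, block_size = 2, 4, 6, 128
    block_tables = torch.full((batch_size, blocks_per_seq), EMPTY, dtype=torch.int16)
    free_blocks = deque(range(num_blocks))

    def allocate(batch_idx: int, position: int) -> None:
        # Map a cache position to its block slot and pull from the free pool if the slot is empty.
        block_idx = position // block_size
        if block_tables[batch_idx, block_idx] == EMPTY:
            if not free_blocks:
                raise RuntimeError("No memory blocks are available for allocation.")
            block_tables[batch_idx, block_idx] = free_blocks.popleft()

    allocate(0, 0)    # first block of sequence 0
    allocate(0, 130)  # a second block once the position crosses the 128-token boundary
    print(block_tables[0])  # tensor([ 0,  1, -1, -1], dtype=torch.int16)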
@@ -380,14 +383,10 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):

  # Initialize shared resources to be used across Runtime instances (prefill and decode phases)
  dec_attn_mask = torch.zeros(self.batch_size, 1, 1, self.max_seq_len, dtype=torch.float32)
- if attn_impl == "eager":
- block_tables = torch.arange(0, self.batch_size, dtype=torch.int16).reshape(self.batch_size, 1)
- free_block_pool = None
- else:
- block_tables = torch.zeros(
- self.batch_size, self.max_seq_len // self.kvcache_block_size, dtype=torch.int16
- ).fill_(self.kvcache_num_blocks - 1)
- free_block_pool = deque(x for x in range(self.kvcache_num_blocks - 1))
+ block_tables = torch.zeros(
+ self.batch_size, self.max_seq_len // self.kvcache_block_size, dtype=torch.int16
+ ).fill_(-1)
+ free_block_pool = deque(x for x in range(self.kvcache_num_blocks))

  self.prefill_decoder = RBLNRuntimeModel(
  runtime=self.model[0],
@@ -399,7 +398,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  block_tables=block_tables,
  free_block_pool=free_block_pool,
  kvcache_block_size=self.kvcache_block_size,
- kvcache_num_blocks=self.kvcache_num_blocks,
  vocab_size=self.config.vocab_size,
  prefill_chunk_size=self.prefill_chunk_size,
  max_seq_len=self.max_seq_len,
@@ -416,7 +414,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  block_tables=block_tables,
  free_block_pool=free_block_pool,
  kvcache_block_size=self.kvcache_block_size,
- kvcache_num_blocks=self.kvcache_num_blocks,
  use_attention_mask=self.use_attention_mask,
  attn_impl=attn_impl,
  )
@@ -569,6 +566,72 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):

  return compile_model(quantize_config=quantize_config)

+ @classmethod
+ def get_maximum_num_blocks(
+ cls,
+ config: PretrainedConfig,
+ tensor_parallel_size: int,
+ kvcache_block_size: int,
+ nbits_per_param: int,
+ n_model_params: int,
+ ) -> int:
+ num_attention_heads = getattr(config, "n_head", None) or getattr(config, "num_attention_heads")
+ num_layers = getattr(config, "n_layer", None) or getattr(config, "num_hidden_layers")
+ head_dim = getattr(config, "head_dim", None) or config.hidden_size // num_attention_heads
+ vocab_size = config.vocab_size
+ hidden_size = getattr(config, "n_embd", None) or getattr(config, "hidden_size")
+ num_key_value_heads = getattr(config, "num_key_value_heads", None) or num_attention_heads
+
+ TARGET_DRAM_LIMIT = int(tensor_parallel_size * 15.7 * 2**30) # 16GB # TODO(jongho): need a more accurate value
+
+ def align(x: int, nbytes: int) -> int:
+ return int(math.ceil(x / nbytes) * nbytes)
+
+ def align_2MB(x: int) -> int:
+ return align(x, 2 * 1024 * 1024)
+
+ def get_kernel_size() -> int:
+ # TODO: Implement
+ lm_heads_params = align(vocab_size, 64) * hidden_size
+ lm_heads_nbytes = (
+ align_2MB(lm_heads_params * nbits_per_param // 8 / tensor_parallel_size) * tensor_parallel_size
+ )
+
+ params = n_model_params - lm_heads_params
+ layer_nbytes = (
+ align_2MB(params * nbits_per_param // 8 / num_layers / tensor_parallel_size)
+ * num_layers
+ * tensor_parallel_size
+ )
+
+ return layer_nbytes + lm_heads_nbytes
+
+ available_dram = TARGET_DRAM_LIMIT - get_kernel_size()
+
+ buffer = 2**30 # 1GB
+ if tensor_parallel_size <= 2:
+ buffer /= 4
+
+ available_dram -= buffer
+
+ def get_nbytes_per_block() -> int:
+ return (
+ align_2MB(
+ kvcache_block_size
+ * head_dim
+ * math.ceil(num_key_value_heads / tensor_parallel_size) # Shard
+ * 2 # (fp16)
+ )
+ * num_layers
+ * 2 # (k, v)
+ * tensor_parallel_size
+ )
+
+ nbytes_per_block = get_nbytes_per_block()
+ n_blocks = available_dram // nbytes_per_block
+
+ return n_blocks, nbytes_per_block
+
  @classmethod
  def _get_rbln_config(
  cls,
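
A back-of-the-envelope run of the estimate above, with hypothetical 7B-class numbers (every value below is illustrative, not taken from any real config): subtract the 2 MiB-aligned weight footprint and a 1 GiB buffer from the per-device DRAM target, then divide by the aligned size of one KV-cache block.

    import math

    def align(x, n):
        return math.ceil(x / n) * n

    def align_2mb(x):
        return align(x, 2 * 1024 * 1024)

    tp, block_size, nbits = 4, 128, 16
    layers, kv_heads, head_dim, hidden, vocab = 32, 32, 128, 4096, 32000
    n_params = 6_700_000_000  # illustrative total parameter count

    dram_limit = int(tp * 15.7 * 2**30)
    lm_head_params = align(vocab, 64) * hidden
    kernel = align_2mb(lm_head_params * nbits // 8 / tp) * tp
    kernel += align_2mb((n_params - lm_head_params) * nbits // 8 / layers / tp) * layers * tp

    available = dram_limit - kernel - 2**30  # 1 GiB buffer since tp > 2
    per_block = align_2mb(block_size * head_dim * math.ceil(kv_heads / tp) * 2) * layers * 2 * tp
    print(available // per_block)  # approximate number of allocatable KV-cache blocks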
@@ -622,8 +685,27 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  else:
  rbln_kvcache_block_size = rbln_kvcache_partition_len

- # FIXME temporal num_blocks
- rbln_kvcache_num_blocks = (rbln_max_seq_len // rbln_kvcache_block_size) * rbln_batch_size
+ max_num_blocks, nbytes_per_block = cls.get_maximum_num_blocks(
+ config=model_config,
+ tensor_parallel_size=rbln_kwargs.get("tensor_parallel_size", 1),
+ kvcache_block_size=rbln_kvcache_block_size,
+ nbits_per_param=16 if rbln_quantization is None else 4, # TODO(jongho): FIX Ad-hoc
+ n_model_params=rbln_kwargs["n_model_params"],
+ )
+ model_num_blocks = (rbln_max_seq_len // rbln_kvcache_block_size) * rbln_batch_size
+ rbln_kvcache_num_blocks = min(model_num_blocks, max_num_blocks)
+
+ required_blocks = rbln_max_seq_len // rbln_kvcache_block_size + 1
+ if rbln_kvcache_num_blocks < required_blocks:
+ rbln_kvcache_num_blocks = required_blocks
+
+ logger.info(f"[KVCache] Compiling with num_blocks: {rbln_kvcache_num_blocks}")
+
+ if rbln_kvcache_num_blocks < rbln_batch_size:
+ raise RuntimeError(
+ f"Batch size ({rbln_batch_size}) exceeds available KV cache blocks ({rbln_kvcache_num_blocks}). "
+ "Ensure the number of blocks is at least equal to the batch size."
+ )

  num_attention_heads = getattr(model_config, "n_head", None) or getattr(model_config, "num_attention_heads")
  num_key_value_heads = getattr(model_config, "num_key_value_heads", None) or num_attention_heads
@@ -723,6 +805,9 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  "kvcache_block_size": rbln_kvcache_block_size,
  "attn_impl": rbln_attn_impl,
  "kvcache_num_blocks": rbln_kvcache_num_blocks,
+ "model_num_blocks": model_num_blocks,
+ "max_num_blocks": max_num_blocks,
+ "nbytes_per_block": nbytes_per_block,
  }
  )

@@ -114,11 +114,9 @@ class Seq2SeqEncoderWrapper(nn.Module):

  # 3. update the cross_attention's past_key_value direct to the device-dram for optimization.
  batch_axis = torch.tensor(1, dtype=torch.int16)
- cross_key_values = torch.ops.rbln_custom_ops.rbln_cache_update(
- cross_key_values, cross_kv, b_idx[0], batch_axis
- )
+ enc_out = torch.ops.rbln_custom_ops.rbln_cache_update(cross_key_values, cross_kv, b_idx[0], batch_axis)

- return cross_key_values
+ return enc_out


  class Seq2SeqDecoderWrapper(nn.Module):
@@ -193,7 +191,7 @@ class Seq2SeqDecoderWrapper(nn.Module):
  cross_past_key_values = cross_past_key_values + ((cross_kv_cache[i], cross_kv_cache[i + 1]),)

  # decode
- lm_logits, self_present_key_values = self.conditional_generation(
+ lm_logits = self.conditional_generation(
  input_ids=input_ids,
  attention_mask=attention_mask,
  encoder_attention_mask=encoder_attention_mask,
@@ -203,9 +201,7 @@ class Seq2SeqDecoderWrapper(nn.Module):
  block_tables=block_tables,
  )

- outputs = (lm_logits,) + self_present_key_values
-
- return outputs
+ return lm_logits


  class Seq2SeqForConditionalGeneration(nn.Module):
@@ -250,7 +246,7 @@ class Seq2SeqForConditionalGeneration(nn.Module):
  cache_position,
  block_tables: Optional[torch.Tensor] = None,
  ):
- hidden_states, self_present_key_values = self.decoder(
+ hidden_states = self.decoder(
  input_ids=input_ids,
  attention_mask=attention_mask,
  encoder_attention_mask=encoder_attention_mask,
@@ -265,7 +261,7 @@ class Seq2SeqForConditionalGeneration(nn.Module):

  lm_logits = self.lm_head(hidden_states)

- return lm_logits, self_present_key_values
+ return lm_logits


  class Seq2SeqDecoder(torch.nn.Module):
@@ -326,11 +322,10 @@ class Seq2SeqDecoder(torch.nn.Module):
  hidden_states = self.apply_position_embedding(hidden_states, cache_position)

  # iterate decoder_layer
- self_present_key_values = ()
  for decoder_layer, self_past_key_value, cross_past_key_value in zip(
  self.layers, self_past_key_values, cross_past_key_values
  ):
- hidden_states, self_present_key_value = decoder_layer(
+ hidden_states = decoder_layer(
  hidden_states,
  attention_mask=attention_mask,
  encoder_attention_mask=encoder_attention_mask,
@@ -339,12 +334,11 @@ class Seq2SeqDecoder(torch.nn.Module):
  cache_position=cache_position,
  block_tables=block_tables,
  )
- self_present_key_values += self_present_key_value

  if self.final_layer_norm is not None:
  hidden_states = self.final_layer_norm(hidden_states)

- return hidden_states, self_present_key_values
+ return hidden_states


  class Seq2SeqDecoderLayer(torch.nn.Module):
@@ -404,7 +398,7 @@ class Seq2SeqDecoderLayer(torch.nn.Module):
  # Self Attention Block
  residual = hidden_states
  hidden_states = self.pre_self_attn_layer_norm(hidden_states)
- hidden_states, self_attn_past_key_value = self.self_attn(
+ hidden_states = self.self_attn(
  hidden_states=hidden_states,
  past_key_value=self_past_key_value,
  attention_mask=attention_mask,
@@ -429,7 +423,7 @@ class Seq2SeqDecoderLayer(torch.nn.Module):
  # Feed-Forward Block
  hidden_states = self.ff_layer(hidden_states)

- return hidden_states, self_attn_past_key_value
+ return hidden_states


  class Seq2SeqSelfAttention(nn.Module):
@@ -492,12 +486,11 @@ class Seq2SeqSelfAttention(nn.Module):
  if attention_mask is not None:
  args.insert(3, attention_mask.unsqueeze(2))

- attn_output, key_states, value_states = self.attn_decode(*args)
+ attn_output = self.attn_decode(*args)

  attn_output = attn_output.view(bsz, self.num_heads, -1, self.head_dim).transpose(1, 2)
  attn_output = attn_output.reshape(bsz, -1, self.num_heads * self.head_dim)

  attn_output = self.out_proj(attn_output)
- present_key_value = (key_states, value_states)

- return attn_output, present_key_value
+ return attn_output
@@ -88,7 +88,7 @@ class T5DecoderWrapper(Seq2SeqDecoderWrapper):
  cross_past_key_values = cross_past_key_values + ((cross_kv_cache[i], cross_kv_cache[i + 1]),)

  # decode
- lm_logits, self_present_key_values = self.conditional_generation(
+ lm_logits = self.conditional_generation(
  input_ids=input_ids,
  attention_mask=attention_mask,
  encoder_attention_mask=encoder_attention_mask,
@@ -97,9 +97,7 @@ class T5DecoderWrapper(Seq2SeqDecoderWrapper):
  cache_position=cache_position,
  )

- outputs = (lm_logits,) + self_present_key_values
-
- return outputs
+ return lm_logits


  class T5ForConditionalGeneration(Seq2SeqForConditionalGeneration):
@@ -187,7 +185,7 @@ class T5LayerSelfAttention(Seq2SeqSelfAttention):
  key_states = self._shape(key_states, -1, bsz)
  value_states = self._shape(value_states, -1, bsz)

- attn_output, key_states, value_states = self.attn_decode(
+ attn_output = self.attn_decode(
  query_states,
  key_states,
  value_states,
@@ -204,9 +202,7 @@ class T5LayerSelfAttention(Seq2SeqSelfAttention):
  attn_output = attn_output.reshape(bsz, -1, self.num_heads * self.head_dim)

  attn_output = self.out_proj(attn_output)
- present_key_value = (key_states, value_states)
-
- return attn_output, present_key_value
+ return attn_output


  class T5CrossAttention(nn.Module):
@@ -25,7 +25,7 @@ from transformers.modeling_outputs import (
  )
  from transformers.utils import logging

- from ....ops import register_rbln_custom_cache_update
+ from ....ops import register_rbln_custom_add_softmax_attention, register_rbln_custom_cache_update


  logger = logging.get_logger(__name__)
@@ -34,6 +34,7 @@ logger = logging.get_logger(__name__)
  class WhisperWrapper:
  def __init__(self, model, rbln_token_timestamps):
  register_rbln_custom_cache_update()
+ register_rbln_custom_add_softmax_attention()
  self.encoder = WhisperEncoderWrapper(model)
  self.decoder = WhisperDecoderWrapper(model, output_attentions=rbln_token_timestamps)

@@ -77,9 +78,9 @@ class WhisperEncoderWrapper(torch.nn.Module):
  # 3. update cross_attention's past_key_value to the device-dram for optimization.
  bidx = torch.tensor(0, dtype=torch.int16)
  axis = torch.tensor(1, dtype=torch.int16)
- cross_key_values = torch.ops.rbln_custom_ops.rbln_cache_update(cross_key_values, cross_kv, bidx, axis)
+ enc_output = torch.ops.rbln_custom_ops.rbln_cache_update(cross_key_values, cross_kv, bidx, axis)

- return cross_key_values
+ return enc_output


  class WhisperDecoderWrapper(torch.nn.Module):
@@ -118,7 +119,7 @@ class WhisperDecoderWrapper(torch.nn.Module):
  cross_past_key_values = cross_past_key_values + ((cross_kv_cache[i], cross_kv_cache[i + 1]),)

  # Decode
- sequence_output, self_present_key_values, cross_attentions = self.decoder(
+ sequence_output, cross_attentions = self.decoder(
  input_ids=decoder_input_ids,
  attention_mask=decoder_attention_mask,
  cache_position=cache_position,
@@ -127,9 +128,7 @@ class WhisperDecoderWrapper(torch.nn.Module):
  )

  lm_logits = self.proj_out(sequence_output)
-
  outputs = (lm_logits,)
- outputs += self_present_key_values

  if self.output_attentions:
  # deocder's cross attention is used for token_timestamps
@@ -167,26 +166,23 @@ class WhisperDecoder(nn.Module):
  # prepare casual_attn_mask
  attention_mask = _prepare_4d_causal_attention_mask(attention_mask, input_shape, inputs_embeds, cache_position)

- self_present_key_values = ()
  cross_attentions = ()
  # iterate decoder_layer
  for self_past_key_value, cross_past_key_value, decoder_layer in zip(
  self_past_key_values, cross_past_key_values, self.layers
  ):
- layer_outputs = decoder_layer(
+ hidden_states, cross_attn_weights = decoder_layer(
  hidden_states,
  attention_mask=attention_mask,
  self_past_key_value=self_past_key_value,
  cross_past_key_value=cross_past_key_value,
  cache_position=cache_position,
  )
- hidden_states = layer_outputs[0]
- self_present_key_values += layer_outputs[1]
- cross_attentions += (layer_outputs[2],)
+ cross_attentions += (cross_attn_weights,)

  hidden_states = self.layer_norm(hidden_states)

- return hidden_states, self_present_key_values, cross_attentions
+ return hidden_states, cross_attentions


  class WhisperDecoderLayer(nn.Module):
@@ -213,7 +209,7 @@ class WhisperDecoderLayer(nn.Module):
  # Self Attention Block
  residual = hidden_states
  hidden_states = self.self_attn_layer_norm(hidden_states)
- hidden_states, _, self_present_key_value = self.self_attn(
+ hidden_states = self.self_attn(
  hidden_states=hidden_states,
  past_key_value=self_past_key_value,
  attention_mask=attention_mask,
@@ -224,7 +220,7 @@ class WhisperDecoderLayer(nn.Module):
  # Cross-Attention Block
  residual = hidden_states
  hidden_states = self.encoder_attn_layer_norm(hidden_states)
- hidden_states, cross_attn_weights, cross_present_key_value = self.encoder_attn(
+ hidden_states, cross_attn_weights = self.encoder_attn(
  hidden_states=hidden_states,
  past_key_value=cross_past_key_value,
  )
@@ -237,7 +233,7 @@ class WhisperDecoderLayer(nn.Module):
  hidden_states = self.fc2(hidden_states)
  hidden_states = residual + hidden_states

- return hidden_states, self_present_key_value, cross_attn_weights
+ return hidden_states, cross_attn_weights


  class WhisperAttention(nn.Module):
@@ -258,19 +254,8 @@ class WhisperAttention(nn.Module):


  class WhisperSelfAttention(WhisperAttention):
- def rbln_cache_update(
- self,
- past_key_value: torch.Tensor,
- key_states: torch.Tensor,
- value_states: torch.Tensor,
- cache_position: torch.Tensor,
- ):
- s_idx = torch.tensor(cache_position, dtype=torch.int16)
- axis = torch.tensor(2, dtype=torch.int16)
-
- key_states = torch.ops.rbln_custom_ops.rbln_cache_update(past_key_value[0], key_states, s_idx, axis)
- value_states = torch.ops.rbln_custom_ops.rbln_cache_update(past_key_value[1], value_states, s_idx, axis)
- return key_states, value_states
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int) -> torch.Tensor:
+ return tensor.view(bsz, seq_len, 1, self.num_heads, self.head_dim).transpose(1, 3)

  def forward(
  self,
@@ -285,22 +270,27 @@ class WhisperSelfAttention(WhisperAttention):

  key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
  value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
- key_states, value_states = self.rbln_cache_update(past_key_value, key_states, value_states, cache_position)

- attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
- attn_weights = attn_weights + attention_mask
- attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+ attn_output = torch.ops.rbln_custom_ops.add_softmax_attn_decode(
+ query_states,
+ key_states,
+ value_states,
+ attention_mask.unsqueeze(2),
+ past_key_value[0].view(bsz, self.num_heads, 1, -1, self.head_dim),
+ past_key_value[1].view(bsz, self.num_heads, 1, -1, self.head_dim),
+ cache_position.expand(bsz, 1),
+ torch.tensor(1.0, dtype=torch.float32), # scale
+ )

- attn_output = torch.matmul(attn_weights, value_states)
  attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
  attn_output = attn_output.transpose(1, 2)
  attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
  attn_output = self.out_proj(attn_output)

- return attn_output, attn_weights, (key_states, value_states)
+ return attn_output


- class WhisperCrossAttention(WhisperSelfAttention):
+ class WhisperCrossAttention(WhisperAttention):
  def forward(
  self,
  hidden_states: torch.Tensor,
@@ -322,4 +312,4 @@ class WhisperCrossAttention(WhisperSelfAttention):
  attn_output = attn_output.reshape(batch_size, query_len, self.embed_dim)
  attn_output = self.out_proj(attn_output)

- return attn_output, attn_weights, (key_states, value_states)
+ return attn_output, attn_weights
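
The fused add_softmax_attn_decode call above replaces the explicit matmul, mask add, and softmax that the previous Whisper self-attention performed. As a reference only, a plain-PyTorch sketch of the math being fused (not the RBLN kernel, and without the cache handling):

    import torch

    def add_softmax_attention(q, k, v, mask, scale=1.0):
        # scores = q @ k^T * scale + mask, softmax over the key axis, then a weighted sum of values.
        scores = torch.matmul(q, k.transpose(-2, -1)) * scale + mask
        weights = torch.softmax(scores, dim=-1)
        return torch.matmul(weights, v)

    q = torch.randn(1, 8, 1, 64)      # (batch, heads, query_len, head_dim)
    k = torch.randn(1, 8, 32, 64)
    v = torch.randn(1, 8, 32, 64)
    mask = torch.zeros(1, 1, 1, 32)
    print(add_softmax_attention(q, k, v, mask).shape)  # torch.Size([1, 8, 1, 64])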
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: optimum-rbln
- Version: 0.7.3a2
+ Version: 0.7.3a4
  Summary: Optimum RBLN is the interface between the Hugging Face Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
  Project-URL: Homepage, https://rebellions.ai
  Project-URL: Documentation, https://docs.rbln.ai
@@ -1,7 +1,7 @@
  optimum/rbln/__init__.py,sha256=eHi15YM3989AcX52jka9rUmgAtlp1PHqMNwBEdOfuu8,6554
- optimum/rbln/__version__.py,sha256=bShBukYvw7AqWtLsut0yClygDEGsFRmxrXypqIeEXcQ,513
- optimum/rbln/modeling.py,sha256=3XE0IrCYbkjw9_Q9BFzZ_ri_Kyxw1g6iwfdohZB46-s,8289
- optimum/rbln/modeling_base.py,sha256=ELSPbjx7awBRM2SckkD-5gI3TIa01mfzz7gDRC1Pljk,21778
+ optimum/rbln/__version__.py,sha256=MLlg_138GxyhciEP0ZB5dPN8vriXkicRnaZiwqygxOY,519
+ optimum/rbln/modeling.py,sha256=nJsAs5zs--VVOYGFjYNpqfxYIemJIK4Lr0WEzlDLdP0,8390
+ optimum/rbln/modeling_base.py,sha256=Ow73GVJF1N5cDFO8_rgirtGj1wC-cXBDyqXHW5PCybA,22270
  optimum/rbln/modeling_config.py,sha256=7104bxmrvKW4Q6XTruQayiIGl8GHDFmPkJ3cknMIInE,11335
  optimum/rbln/diffusers/__init__.py,sha256=pOyoXv3-JRzTBSwPKbgLS9H6F2K9dJdReEmpGhcLQYU,3283
  optimum/rbln/diffusers/modeling_diffusers.py,sha256=zqVNgH9oeOx2iNE7VsW_FinVf4s6G5Idyh4TKz7XJJg,21116
@@ -40,9 +40,9 @@ optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_x
  optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py,sha256=3aB1Rw-OgKytQOHwOaShbEvq_XVHPOGvsGm8pstEmKU,930
  optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py,sha256=MzVP1wscaO1sUIiBIPJqG6zuGyez9VUbA42-JSIm-mk,930
  optimum/rbln/ops/__init__.py,sha256=TxOmsN0u3PmyK4Sb89qbiC4rePOlkvUT7Lm6wVoTnY0,941
- optimum/rbln/ops/attn.py,sha256=LbJAmFtNj05i6BURfKV3KybsPItFe8w-YdSe5SuWkEc,12365
- optimum/rbln/ops/flash_attn.py,sha256=4shKNY13skPoYnbEsGrXDzgNwBIhHZEFrnUnWx1ESZU,4076
- optimum/rbln/ops/kv_cache_update.py,sha256=9W4WCO1Dtfy0u5i978JJRa7uLbqrfR2lHuoPynb07fw,3143
+ optimum/rbln/ops/attn.py,sha256=3EqU63Oj4zI4rLbkRycorsscXeD-IpKzt9N1MhkMa5o,10374
+ optimum/rbln/ops/flash_attn.py,sha256=wfyiCxDGf034IngzwRU160R7_DlKYpd-uWT0BDEGFks,3408
+ optimum/rbln/ops/kv_cache_update.py,sha256=pxf8kAptPaQF5xE8qItvmlFOq_sgim6ZERD7AVaOtec,3221
  optimum/rbln/transformers/__init__.py,sha256=AGo3BqVIZrsOzYsQAnnQ25HCstTPBclrXbvvUxVMlqE,4255
  optimum/rbln/transformers/modeling_alias.py,sha256=yx7FnZQWAnrWzivaO5hI7T6i-fyLzt2tMIXG2oDNbPo,1657
  optimum/rbln/transformers/modeling_generic.py,sha256=aaZWsqVDCRvH03q-Wen7DMfLr7Gy-u-I0mTw0aYqWjk,18195
@@ -59,8 +59,8 @@ optimum/rbln/transformers/models/bert/modeling_bert.py,sha256=p3utRqf3dv9_RkHwaM
  optimum/rbln/transformers/models/clip/__init__.py,sha256=H9vuBwrmFO0-CqZhXUrKF-uQL6igCqMlqrT1X_ELaAI,754
  optimum/rbln/transformers/models/clip/modeling_clip.py,sha256=NiSm7bHs4SReHDUr53BBWSX0Y8bkKOeUSpsBDrp8YDw,6628
  optimum/rbln/transformers/models/decoderonly/__init__.py,sha256=pDogsdpJKKB5rqnVFrRjwfhUvOSV-jZ3oARMsqSvOOQ,665
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py,sha256=x8_xQ5aGXbadJyajpJQyfgxx4YPSj62VlmmGDMnC-1E,41819
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py,sha256=dyl8tDBjfe5VfU1XbKAoZS7g7F90JTYVmMuz0HTmCoE,35345
+ optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py,sha256=7OIKteJLKNxOLOg0w3lLOM7TxZovQn4jkglI9wRkrtQ,40609
+ optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py,sha256=W9HnxJoTz78Wc4X5Q3sMSHhMTSa7-9uQCFlnqNVozvA,38932
  optimum/rbln/transformers/models/dpt/__init__.py,sha256=gP1tkR3XMNlHq1GT87ugIVvb2o_1eAUg1JaniXjy1Lw,651
  optimum/rbln/transformers/models/dpt/modeling_dpt.py,sha256=ZsS2SOiqcA4azULB-WFEMQZbgIoOyVUKqVKqrw_tWzA,3430
  optimum/rbln/transformers/models/exaone/__init__.py,sha256=zYH_5tVa8-juEdsOIky7I33WSC3Zuhoq1upI0OHYeVw,859
@@ -91,16 +91,16 @@ optimum/rbln/transformers/models/qwen2/modeling_qwen2.py,sha256=9-aFDvjMzPNUyGOz
  optimum/rbln/transformers/models/qwen2/qwen2_architecture.py,sha256=XlNAMYAcDLohnSAhIFGKOPuCB5XLgzYs5ABWdeQSaZs,720
  optimum/rbln/transformers/models/seq2seq/__init__.py,sha256=EmEMV4rOYqKyruX85d0fR73-b8N6BSD6CPcbpYdBuVk,651
  optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py,sha256=NPfJf9Uk_bYOae7hXGHwteGiWH0va63Z-D93RmAMENg,17611
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py,sha256=QXIGWSu9PsKWE3WhkgmBj3VeszqXIo2MPOwcrb54Tbs,19348
+ optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py,sha256=tvzacIZam1sIr_1BvvZ_fDr8u5dXAiYiynFdX9tArtY,18877
  optimum/rbln/transformers/models/t5/__init__.py,sha256=1skR1RmnG62WTAP3-F5P1x-V_ReFhMyirH3u56vWwvc,675
  optimum/rbln/transformers/models/t5/modeling_t5.py,sha256=nKRR3eH1EAu1YkKvhlqGyTrJXDRd-IWB5LOeG9jrcb4,16021
- optimum/rbln/transformers/models/t5/t5_architecture.py,sha256=oCdmF4eCTayAVjx3c-SVpmhrjnWE92jh79dMIYCwotY,9690
+ optimum/rbln/transformers/models/t5/t5_architecture.py,sha256=AArCQhZRETVM583wlIRzMFOSYq7t2nzxaAeyhZxyxKk,9508
  optimum/rbln/transformers/models/wav2vec2/__init__.py,sha256=YpgA0K-vyg9veh0eL_jxauosbRpb_kpGKHvvQLBspKM,649
  optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py,sha256=JYJmV52j6cBwim4RanVJryfKnV80V96ol0A-oR6o7cg,3856
  optimum/rbln/transformers/models/whisper/__init__.py,sha256=ktnNe5ri3ycCWZ_W_voFB9y9-vgGgxS1X9s8LBRZmWc,665
  optimum/rbln/transformers/models/whisper/generation_whisper.py,sha256=GIHTca3b1VtW81kp7BzKQ7f77c2t9OsEsbZetripgDo,4582
  optimum/rbln/transformers/models/whisper/modeling_whisper.py,sha256=0nBADNxE0A1ozBbRutTBvxpo_Y1qkOycT_zronkN-ZU,15840
- optimum/rbln/transformers/models/whisper/whisper_architecture.py,sha256=eP3UgkwCRaaFjc5Jc4ZEiWxr3-L7oJx9KzpJ7eFkwUs,13158
+ optimum/rbln/transformers/models/whisper/whisper_architecture.py,sha256=Yn6yFpmw6IQbWlnpIMAdEUsNF6huXgaKzGMUZbhSLdo,12572
  optimum/rbln/transformers/models/xlm_roberta/__init__.py,sha256=fC7iNcdxBZ_6eOF2snStmf8r2M3c8O_-XcXnQEaHQCE,653
  optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py,sha256=8YNLz0bc5ze-QuU8rN-QhUfGzlSUs3iMJiWTxO3o6AM,4366
  optimum/rbln/transformers/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -114,7 +114,7 @@ optimum/rbln/utils/model_utils.py,sha256=DfD_Z2qvZHqcddXqnzTM1AN8khanj3-DXK2lJvV
  optimum/rbln/utils/runtime_utils.py,sha256=5-DYniyP59nx-mrrbi7AqA77L85b4Cm5oLpaxidSyss,3699
  optimum/rbln/utils/save_utils.py,sha256=hG5uOtYmecSXZuGTvCXsTM-SiyZpr5q3InUGCCq_jzQ,3619
  optimum/rbln/utils/submodule.py,sha256=oZoGrItB8WqY4i-K9WJPlLlcLohc1YGB9OHB8_XZw3A,4071
- optimum_rbln-0.7.3a2.dist-info/METADATA,sha256=C-IWumO-veJFZPHpF8wcOTOE0TCDzKU1Xk_ylaqrvPM,5300
- optimum_rbln-0.7.3a2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- optimum_rbln-0.7.3a2.dist-info/licenses/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
- optimum_rbln-0.7.3a2.dist-info/RECORD,,
+ optimum_rbln-0.7.3a4.dist-info/METADATA,sha256=8VNTOVgsgFtcFUuZ9VEeRQfC2LEB60OFmW92hlJo8V8,5300
+ optimum_rbln-0.7.3a4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ optimum_rbln-0.7.3a4.dist-info/licenses/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+ optimum_rbln-0.7.3a4.dist-info/RECORD,,