optimum-rbln 0.7.2rc2__py3-none-any.whl → 0.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +8 -0
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/diffusers/__init__.py +8 -0
- optimum/rbln/diffusers/modeling_diffusers.py +103 -117
- optimum/rbln/diffusers/models/autoencoders/vq_model.py +11 -3
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +15 -8
- optimum/rbln/diffusers/pipelines/__init__.py +8 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/__init__.py +7 -1
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +25 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +107 -1
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +25 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py +3 -0
- optimum/rbln/modeling.py +4 -1
- optimum/rbln/modeling_base.py +16 -3
- optimum/rbln/ops/__init__.py +6 -2
- optimum/rbln/ops/attn.py +94 -85
- optimum/rbln/ops/flash_attn.py +46 -25
- optimum/rbln/ops/kv_cache_update.py +4 -4
- optimum/rbln/transformers/modeling_generic.py +3 -3
- optimum/rbln/transformers/models/bart/bart_architecture.py +10 -6
- optimum/rbln/transformers/models/bart/modeling_bart.py +6 -2
- optimum/rbln/transformers/models/bert/modeling_bert.py +1 -1
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +264 -133
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +276 -29
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +11 -4
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +11 -4
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +5 -3
- optimum/rbln/transformers/models/midm/midm_architecture.py +5 -3
- optimum/rbln/transformers/models/phi/phi_architecture.py +9 -7
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +50 -13
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +60 -36
- optimum/rbln/transformers/models/t5/modeling_t5.py +3 -1
- optimum/rbln/transformers/models/t5/t5_architecture.py +65 -3
- optimum/rbln/transformers/models/whisper/whisper_architecture.py +26 -36
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +1 -14
- optimum/rbln/utils/import_utils.py +7 -0
- {optimum_rbln-0.7.2rc2.dist-info → optimum_rbln-0.7.3.dist-info}/METADATA +1 -1
- {optimum_rbln-0.7.2rc2.dist-info → optimum_rbln-0.7.3.dist-info}/RECORD +40 -38
- {optimum_rbln-0.7.2rc2.dist-info → optimum_rbln-0.7.3.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.7.2rc2.dist-info → optimum_rbln-0.7.3.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/modeling_base.py
CHANGED
@@ -295,6 +295,7 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
     ):
         if isinstance(model_save_dir, str):
            model_save_dir = Path(model_save_dir)
+
        # FIXME:: Should we convert it?
        compiled_model_names = [cfg.compiled_model_name for cfg in rbln_config.compile_cfgs]
        rbln_compiled_models = [rbln_compiled_models[cm_name] for cm_name in compiled_model_names]
@@ -389,8 +390,7 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
        return rbln_config

    @classmethod
-
-    def hf_class(cls):
+    def get_hf_class(cls):
        """
        Lazily loads and caches the corresponding Hugging Face model class.
        Removes 'RBLN' prefix from the class name to get the original class name
@@ -416,7 +416,20 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
        return self.forward(*args, **kwargs)

    def __repr__(self):
-
+        has_submodules = len(self.rbln_submodules) > 0
+        repr_str: str = f"<{self.__class__.__name__}>\n"
+        repr_str += f"- Total {len(self.model)} Runtimes"
+        repr_str += f" and {len(self.rbln_submodules)} Submodules\n" if has_submodules else "\n"
+        repr_str += "[Runtimes]\n"
+        repr_str += "\n".join([repr(model) for model in self.model])
+        repr_str += "\n"
+
+        if has_submodules > 0:
+            for i, submodule in enumerate(self.rbln_submodules):
+                repr_str += f"[Submodules {i} : {self._rbln_submodules[i]['name']}]\n"
+                repr_str += repr(submodule) + "\n"
+
+        return repr_str

    def __post_init__(self, **kwargs):
        pass
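Note on this change: `hf_class` is replaced by the classmethod `get_hf_class()`, so downstream code that read the old attribute needs updating. A minimal sketch of the new call site, using `RBLNBertModel` purely as an example and assuming it is exported from the top-level package:

    from optimum.rbln import RBLNBertModel

    # Per the docstring above, the 'RBLN' prefix is stripped and the matching
    # Hugging Face class is lazily loaded and cached.
    hf_cls = RBLNBertModel.get_hf_class()
    print(hf_cls.__name__)  # expected: "BertModel"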
optimum/rbln/ops/__init__.py
CHANGED
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from .attn import
-
+from .attn import (
+    register_rbln_custom_add_softmax_attention,
+    register_rbln_custom_paged_attention,
+    register_rbln_custom_paged_causal_attention,
+)
+from .flash_attn import register_rbln_custom_paged_flash_attention, register_rbln_custom_paged_flash_causal_attention
 from .kv_cache_update import register_rbln_custom_cache_update
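The re-export list above is the public surface for registering the custom ops; each helper is wrapped in `@lru_cache`, so calling it more than once is harmless. A short sketch of pulling them in under the new names (the call order is not significant):

    from optimum.rbln.ops import (
        register_rbln_custom_add_softmax_attention,
        register_rbln_custom_paged_attention,
        register_rbln_custom_paged_causal_attention,
        register_rbln_custom_paged_flash_attention,
        register_rbln_custom_paged_flash_causal_attention,
        register_rbln_custom_cache_update,
    )

    # Make the op schemas visible to torch before tracing a model that uses them.
    register_rbln_custom_paged_causal_attention()
    register_rbln_custom_cache_update()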
optimum/rbln/ops/attn.py
CHANGED
@@ -25,14 +25,14 @@ else:


 @lru_cache
-def register_rbln_custom_attention():
+def register_rbln_custom_paged_attention():
     torch.library.define(
-        "rbln_custom_ops::attn_decode",
-        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d) -> Tensor
+        "rbln_custom_ops::paged_attn_decode",
+        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor",
     )

-    @torch.library.impl("rbln_custom_ops::attn_decode", "cpu")
-    def attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale):
+    @torch.library.impl("rbln_custom_ops::paged_attn_decode", "cpu")
+    def attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size):
         """Defines the computation pattern for fused attention with KV cache updates.

         IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
@@ -51,36 +51,27 @@ def register_rbln_custom_attention():
         - mask: [batch=1, n_heads, 1, 1, max_seq_len] - Attention mask
         - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
         - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
-        - seq: [1] - Current sequence position
+        - seq: [1, 1] - Current sequence position
         - scale: [] - Attention scale factor
+        - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
+        - block_size: [] - Number of tokens per block

         Returns:
-
-            - attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
-            - kcache: Same shape as input kcache, batch=1 - Placeholder for compiler
-            - vcache: Same shape as input vcache, batch=1 - Placeholder for compiler
+            Tensor: attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
         """
-        return
-
-
-
-
-
-    @register_fake("rbln_custom_ops::attn_decode")
-    def attn_decode_abstract(q, k, v, m, kcache, vcache, seq, partition):
-        return (
-            q,
-            torch.empty(*kcache.shape, device=kcache.device),
-            torch.empty(*vcache.shape, device=vcache.device),
-        )
+        return q
+
+    @register_fake("rbln_custom_ops::paged_attn_decode")
+    def attn_decode_abstract(q, k, v, m, kcache, vcache, seq, scale, block_table, block_size):
+        return q

     torch.library.define(
-        "rbln_custom_ops::
-        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e) -> Tensor
+        "rbln_custom_ops::paged_attn_prefill",
+        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor",
     )

-    @torch.library.impl("rbln_custom_ops::
-    def attn_prefill_cpu(q, k, v, mask, kcache, vcache,
+    @torch.library.impl("rbln_custom_ops::paged_attn_prefill", "cpu")
+    def attn_prefill_cpu(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size):
         """Defines the computation pattern for prefill phase attention with KV cache updates.

         IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
@@ -97,32 +88,30 @@ def register_rbln_custom_attention():
         - mask: [batch=1, 1, 1, seq_len, max_seq_len] - Attention mask
         - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
         - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
-        -
-        - seq: [1] - Starting sequence position
+        - seq: [1, 1] - Starting sequence position
         - scale: [] - Attention scale factor
+        - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
+        - block_size: [] - Number of tokens per block

         Returns:
-
-            - attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
-            - empty_kcache: Same shape as input kcache - Placeholder for compiler
-            - empty_vcache: Same shape as input vcache - Placeholder for compiler
+            Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
         """
-        return q
+        return q

-    @register_fake("rbln_custom_ops::
-    def attn_prefill_abstract(q, k, v, m, kcache, vcache,
-        return q
+    @register_fake("rbln_custom_ops::paged_attn_prefill")
+    def attn_prefill_abstract(q, k, v, m, kcache, vcache, seq, scale, block_table, block_size):
+        return q


 @lru_cache
-def register_rbln_custom_attention_add_softmax():
+def register_rbln_custom_paged_causal_attention():
     torch.library.define(
-        "rbln_custom_ops::attn_decode_add_softmax",
-        "(Tensor x, Tensor y, Tensor z, Tensor
+        "rbln_custom_ops::paged_causal_attn_decode",
+        "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor",
     )

-    @torch.library.impl("rbln_custom_ops::
-    def
+    @torch.library.impl("rbln_custom_ops::paged_causal_attn_decode", "cpu")
+    def attn_decode_cpu(q, k, v, kcache, vcache, seq, scale, block_table, block_size):
         """Defines the computation pattern for fused attention with KV cache updates.

         IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
@@ -131,46 +120,36 @@ def register_rbln_custom_attention_add_softmax():
         Pattern components that compiler fuses into a single op:
         1. KV cache updates with new key/value states
         2. Scaled dot-product attention computation
-        3.
+        3. Causal masked softmax operation
         4. Final attention output computation

         Expected tensor shapes:
         - q: [batch=1, n_heads, n_groups, 1, head_dim] - Query states for single token
         - k: [batch=1, n_heads, 1, 1, head_dim] - Key states for current input
         - v: [batch=1, n_heads, 1, 1, head_dim] - Value states for current input
-        - mask: [batch=1, n_heads, 1, 1, max_seq_len] - Attention mask
         - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
         - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
-        - seq: [1] -
+        - seq: [1, 1] - Starting sequence position
         - scale: [] - Attention scale factor
+        - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
+        - block_size: [] - Number of tokens per block

         Returns:
-
-            - attn_output: [batch=1, n_heads, 1, 1, head_dim] - Attention output
-            - kcache: Same shape as input kcache, batch=1 - Placeholder for compiler
-            - vcache: Same shape as input vcache, batch=1 - Placeholder for compiler
+            Tensor: attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
         """
-        return
-
-
-
-
-
-    @register_fake("rbln_custom_ops::attn_decode_add_softmax")
-    def attn_decode_add_softmax_abstract(q, k, v, m, kcache, vcache, seq, partition):
-        return (
-            q,
-            torch.empty(*kcache.shape, device=kcache.device),
-            torch.empty(*vcache.shape, device=vcache.device),
-        )
+        return q
+
+    @register_fake("rbln_custom_ops::paged_causal_attn_decode")
+    def attn_decode_abstract(q, k, v, kcache, vcache, seq, scale, block_table, block_size):
+        return q

     torch.library.define(
-        "rbln_custom_ops::
-        "(Tensor x, Tensor y, Tensor z, Tensor
+        "rbln_custom_ops::paged_causal_attn_prefill",
+        "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor",
     )

-    @torch.library.impl("rbln_custom_ops::
-    def
+    @torch.library.impl("rbln_custom_ops::paged_causal_attn_prefill", "cpu")
+    def attn_prefill_cpu(q, k, v, kcache, vcache, seq, scale, block_table, block_size):
         """Defines the computation pattern for prefill phase attention with KV cache updates.

         IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
@@ -184,29 +163,59 @@ def register_rbln_custom_attention_add_softmax():
         - q: [batch=1, n_heads, n_groups, seq_len, head_dim] - Query states for multiple tokens
         - k: [batch=1, n_heads, 1, seq_len, head_dim] - Key states for current input
         - v: [batch=1, n_heads, 1, seq_len, head_dim] - Value states for current input
-        - mask: [batch=1, 1, 1, seq_len, max_seq_len] - Attention mask
         - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
         - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
         - batch: [1] - Batch index for cache access
-        - seq: [1] - Starting sequence position
+        - seq: [1, 1] - Starting sequence position
         - scale: [] - Attention scale factor
+        - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
+        - block_size: [] - Number of tokens per block

         Returns:
-
-            - attn_output: [batch=1, n_heads, seq_len, 1, head_dim] - Attention output
-            - empty_kcache: Same shape as input kcache - Placeholder for compiler
-            - empty_vcache: Same shape as input vcache - Placeholder for compiler
+            Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
         """
-        return
-
-
-
-
-
-
-
-
-
-
-
+        return q
+
+    @register_fake("rbln_custom_ops::paged_causal_attn_prefill")
+    def attn_prefill_abstract(q, k, v, kcache, vcache, seq, scale, block_table, block_size):
+        return q
+
+
+@lru_cache
+def register_rbln_custom_add_softmax_attention():
+    torch.library.define(
+        "rbln_custom_ops::add_softmax_attn_decode",
+        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d) -> Tensor",
+    )
+
+    @torch.library.impl("rbln_custom_ops::add_softmax_attn_decode", "cpu")
+    def add_softmax_attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale):
+        """Defines the computation pattern for fused attention with KV cache updates.
+
+        IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+        a single optimized NPU operation. It is NOT meant for CPU execution.
+
+        Pattern components that compiler fuses into a single op:
+        1. KV cache updates with new key/value states
+        2. Scaled dot-product attention computation
+        3. add-softmax operation
+        4. Final attention output computation
+
+        Expected tensor shapes:
+        - q: [batch=1, n_heads, n_groups, 1, head_dim] - Query states for single token
+        - k: [batch=1, n_heads, 1, 1, head_dim] - Key states for current input
+        - v: [batch=1, n_heads, 1, 1, head_dim] - Value states for current input
+        - mask: [batch=1, n_heads, 1, 1, max_seq_len] - Attention mask
+        - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+        - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+        - seq: [1] - Current sequence position
+        - scale: [] - Attention scale factor
+
+        Returns:
+            Tensor: attn_output: [batch=1, n_heads, 1, 1, head_dim] - Attention output
+        """
+        return q
+
+    @register_fake("rbln_custom_ops::add_softmax_attn_decode")
+    def add_softmax_attn_decode_abstract(q, k, v, m, kcache, vcache, seq, partition):
+        return q
optimum/rbln/ops/flash_attn.py
CHANGED
@@ -25,37 +25,58 @@ else:


 @lru_cache
-def
+def register_rbln_custom_paged_flash_attention():
     torch.library.define(
-        "rbln_custom_ops::flash_attn_decode",
-        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, int
+        "rbln_custom_ops::paged_flash_attn_decode",
+        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor",
     )

-    @torch.library.impl("rbln_custom_ops::flash_attn_decode", "cpu")
-    def flash_attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale, partition):
-        return
-
-
-
-
-
-    @register_fake("rbln_custom_ops::flash_attn_decode")
-    def flash_attn_decode_abstract(q, k, v, m, kcache, vcache, seq, scale, partition):
-        return (
-            q,
-            torch.empty(*kcache.shape, device=kcache.device),
-            torch.empty(*vcache.shape, device=vcache.device),
-        )
+    @torch.library.impl("rbln_custom_ops::paged_flash_attn_decode", "cpu")
+    def flash_attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size, partition):
+        return q
+
+    @register_fake("rbln_custom_ops::paged_flash_attn_decode")
+    def flash_attn_decode_abstract(q, k, v, m, kcache, vcache, seq, scale, block_table, block_size, partition):
+        return q

     torch.library.define(
-        "rbln_custom_ops::flash_attn_prefill",
-        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor
+        "rbln_custom_ops::paged_flash_attn_prefill",
+        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor",
     )

     @torch.library.impl("rbln_custom_ops::flash_attn_prefill", "cpu")
-    def flash_attn_prefill_cpu(q, k, v, mask, kcache, vcache,
-        return q
+    def flash_attn_prefill_cpu(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size, partition):
+        return q
+
+    @register_fake("rbln_custom_ops::paged_flash_attn_prefill")
+    def flash_attn_prefill_abstract(q, k, v, m, kcache, vcache, seq, scale, block_table, block_size, partition):
+        return q
+
+
+@lru_cache
+def register_rbln_custom_paged_flash_causal_attention():
+    torch.library.define(
+        "rbln_custom_ops::paged_flash_causal_attn_decode",
+        "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor",
+    )
+
+    @torch.library.impl("rbln_custom_ops::paged_flash_causal_attn_decode", "cpu")
+    def flash_attn_decode_cpu(q, k, v, kcache, vcache, seq, scale, block_table, block_size, partition):
+        return q
+
+    @register_fake("rbln_custom_ops::paged_flash_causal_attn_decode")
+    def flash_attn_decode_abstract(q, k, v, kcache, vcache, seq, scale, block_table, block_size, partition):
+        return q
+
+    torch.library.define(
+        "rbln_custom_ops::paged_flash_causal_attn_prefill",
+        "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor",
+    )
+
+    @torch.library.impl("rbln_custom_ops::paged_flash_causal_attn_prefill", "cpu")
+    def flash_attn_prefill_cpu(q, k, v, kcache, vcache, seq, scale, block_table, block_size, partition):
+        return q

-    @register_fake("rbln_custom_ops::
-    def flash_attn_prefill_abstract(q, k, v,
-        return q
+    @register_fake("rbln_custom_ops::paged_flash_causal_attn_prefill")
+    def flash_attn_prefill_abstract(q, k, v, kcache, vcache, seq, scale, block_table, block_size, partition):
+        return q
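The flash variants mirror the paged schemas but append a partition length (`int g`) after `block_size`, and the causal forms drop the explicit mask tensor. A hedged illustration of the causal decode signature (all values below are placeholders, not library defaults):

    import torch
    from optimum.rbln.ops import register_rbln_custom_paged_flash_causal_attention

    register_rbln_custom_paged_flash_causal_attention()

    q = torch.randn(1, 8, 4, 1, 64)
    k = torch.randn(1, 8, 1, 1, 64)
    v = torch.randn(1, 8, 1, 1, 64)
    kcache = torch.zeros(1, 8, 1, 8192, 64)
    vcache = torch.zeros(1, 8, 1, 8192, 64)
    seq = torch.zeros(1, 1, dtype=torch.int32)
    scale = torch.tensor(64**-0.5)
    block_table = torch.arange(8192 // 128, dtype=torch.int16).unsqueeze(0)

    out = torch.ops.rbln_custom_ops.paged_flash_causal_attn_decode(
        q, k, v, kcache, vcache, seq, scale, block_table, 128, 4096  # block_size, partition
    )
    assert out.shape == q.shape  # the CPU stub returns q unchanged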
optimum/rbln/ops/kv_cache_update.py
CHANGED
@@ -45,10 +45,10 @@ def register_rbln_custom_cache_update():

         # Update the specified portion of the cache tensor with the state tensor, using `slice_scatter`.
         # This operation modifies the cache tensor in-place directly on the device, avoiding any unnecessary transfers between host and device.
-
+        cache.slice_scatter(state, dim=axis, start=s, end=e)

-        #
-        return
+        # 'rbln_cache_update' is an in-place operation that isn't tracked in JIT trace, so a dummy output was added to the return value.
+        return torch.empty([256])

     # Register a "fake" implementation of the "rbln_cache_update" operation.
     # This serves as an abstract definition for the RBLN compiler to recognize the operation and generate an optimized implementation.
@@ -57,4 +57,4 @@ def register_rbln_custom_cache_update():
         # Return a tensor with the same shape as the input cache tensor.
         # This is a placeholder for the abstract implementation and does not perform any actual computation.
        # Like the actual implementation, the abstraction assumes in-place device-side updates.
-        return torch.
+        return torch.empty([256])
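The comment above leans on `Tensor.slice_scatter`; note that in eager PyTorch it returns a new tensor rather than mutating its input, and the in-place, on-device behavior described here is what the RBLN compiler is expected to emit for the fused op. A standalone illustration of the slicing semantics:

    import torch

    cache = torch.zeros(1, 2, 8, 4)   # e.g. [batch, heads, max_seq_len, head_dim]
    state = torch.ones(1, 2, 3, 4)    # new K/V entries destined for positions 2..4

    # Writes `state` into cache[:, :, 2:5, :]; eager execution returns a copy.
    updated = cache.slice_scatter(state, dim=2, start=2, end=5)
    assert updated[:, :, 2:5].eq(1).all() and cache.eq(0).all()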
optimum/rbln/transformers/modeling_generic.py
CHANGED
@@ -73,7 +73,7 @@ class RBLNModelForQuestionAnswering(RBLNModel):
         if rbln_batch_size is None:
             rbln_batch_size = 1

-        signature_params = inspect.signature(cls.
+        signature_params = inspect.signature(cls.get_hf_class().forward).parameters.keys()

         if rbln_model_input_names is None:
             for tokenizer in preprocessors:
@@ -289,7 +289,7 @@ class RBLNModelForSequenceClassification(RBLNModel):
         if max_position_embeddings is not None and rbln_max_seq_len > max_position_embeddings:
             raise ValueError("`rbln_enc_max_seq_len` should be less or equal than max_position_embeddings!")

-        signature_params = inspect.signature(cls.
+        signature_params = inspect.signature(cls.get_hf_class().forward).parameters.keys()

         if rbln_model_input_names is None:
             for tokenizer in preprocessors:
@@ -362,7 +362,7 @@ class RBLNModelForMaskedLM(RBLNModel):
         if max_position_embeddings is not None and rbln_max_seq_len > max_position_embeddings:
             raise ValueError("`rbln_enc_max_seq_len` should be less or equal than max_position_embeddings!")

-        signature_params = inspect.signature(cls.
+        signature_params = inspect.signature(cls.get_hf_class().forward).parameters.keys()

         if rbln_model_input_names is None:
             for tokenizer in preprocessors:
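All three hunks above swap the removed `hf_class` lookup for `get_hf_class()` when collecting the original `forward` parameter names. The idiom itself is plain `inspect`; a self-contained sketch with a dummy `forward` standing in for the Hugging Face method:

    import inspect

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Stand-in for the original Hugging Face forward()."""

    signature_params = inspect.signature(forward).parameters.keys()
    print(list(signature_params))
    # ['self', 'input_ids', 'attention_mask', 'token_type_ids']
    # The surrounding code then matches these names against the preprocessors to pick rbln_model_input_names.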
optimum/rbln/transformers/models/bart/bart_architecture.py
CHANGED
@@ -35,16 +35,16 @@ logger = logging.get_logger(__name__)


 class BartWrapper:
-    def __init__(self, model: nn.Module, enc_max_seq_len: int):
+    def __init__(self, model: nn.Module, enc_max_seq_len: int, use_attention_mask: bool):
         self.encoder = Seq2SeqEncoderWrapper(model, enc_max_seq_len)
-        self.decoder = BartDecoderWrapper(model)
+        self.decoder = BartDecoderWrapper(model, use_attention_mask=use_attention_mask)


 class BartDecoderWrapper(Seq2SeqDecoderWrapper):
     def convert_to_rbln_conditional_generation(self, model: nn.Module):
         new_layers = []
         for layer in model.get_decoder().layers:
-            self_attn = BartSelfAttention(layer.self_attn)
+            self_attn = BartSelfAttention(layer.self_attn, use_attention_mask=self.use_attention_mask)
             new_layers.append(BartDecoderLayer(layer, self_attn))

         decoder_model = BartDecoder(model.get_decoder(), new_layers)
@@ -69,7 +69,8 @@ class BartDecoder(Seq2SeqDecoder):
         self.embed_scale = getattr(self._original_mod, "embed_scale", None)

     def prepare_attn_mask(self, attention_mask, encoder_attention_mask, **kwargs):
-        attention_mask
+        if attention_mask is not None:
+            attention_mask = attention_mask[:, None, None, :]
         encoder_attention_mask = _prepare_4d_attention_mask(encoder_attention_mask, torch.float32, tgt_len=1)

         return attention_mask, encoder_attention_mask
@@ -134,7 +135,7 @@ class BartDecoderLayer(Seq2SeqDecoderLayer):


 class BartSelfAttention(Seq2SeqSelfAttention):
-    def __post_init__(self):
+    def __post_init__(self, use_attention_mask: bool = True):
         self.q_proj = self._original_mod.q_proj
         self.k_proj = self._original_mod.k_proj
         self.v_proj = self._original_mod.v_proj
@@ -142,7 +143,10 @@ class BartSelfAttention(Seq2SeqSelfAttention):
         self.num_heads = self._original_mod.num_heads
         self.head_dim = self._original_mod.embed_dim // self._original_mod.num_heads
         self.scaling = self.head_dim**-0.5
-
+        if use_attention_mask:
+            self.attn_decode = torch.ops.rbln_custom_ops.paged_attn_decode
+        else:
+            self.attn_decode = torch.ops.rbln_custom_ops.paged_causal_attn_decode

     def projection(self, hidden_states) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         query_states = self.q_proj(hidden_states) * self.scaling
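The `use_attention_mask` flag threaded through `BartWrapper`, `BartDecoderWrapper`, and `BartSelfAttention.__post_init__` ultimately just chooses which registered custom op backs `attn_decode`. A minimal standalone sketch of that dispatch (not the optimum-rbln classes themselves):

    import torch
    from optimum.rbln.ops import (
        register_rbln_custom_paged_attention,
        register_rbln_custom_paged_causal_attention,
    )

    register_rbln_custom_paged_attention()
    register_rbln_custom_paged_causal_attention()

    use_attention_mask = False  # illustrative value; the real flag comes from rbln_config.model_cfg
    attn_decode = (
        torch.ops.rbln_custom_ops.paged_attn_decode
        if use_attention_mask
        else torch.ops.rbln_custom_ops.paged_causal_attn_decode
    )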
optimum/rbln/transformers/models/bart/modeling_bart.py
CHANGED
@@ -58,7 +58,7 @@ class RBLNBartModel(RBLNModel):
         if max_position_embeddings is not None and rbln_max_seq_len > max_position_embeddings:
             raise ValueError("`rbln_max_seq_len` should be less or equal than max_position_embeddings!")

-        signature_params = inspect.signature(cls.
+        signature_params = inspect.signature(cls.get_hf_class().forward).parameters.keys()

         if rbln_model_input_names is None:
             for tokenizer in preprocessors:
@@ -108,12 +108,16 @@ class RBLNBartModel(RBLNModel):


 class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
+    support_paged_causal_attn = True
+
     @classmethod
     def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
         enc_max_seq_len = (
             rbln_config.model_cfg["enc_max_seq_len"] if "enc_max_seq_len" in rbln_config.model_cfg else 1024
         )
-
+        use_attention_mask = rbln_config.model_cfg.get("use_attention_mask", False)
+
+        return BartWrapper(model, enc_max_seq_len=enc_max_seq_len, use_attention_mask=use_attention_mask)

     def __getattr__(self, __name: str) -> Any:
         def redirect(func):
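For context, a hedged usage sketch of the conditional-generation entry point: `export=True` and the `rbln_`-prefixed kwargs follow optimum-rbln's usual convention, `rbln_enc_max_seq_len` corresponds to the `enc_max_seq_len` entry read above, and `rbln_use_attention_mask` is a hypothetical override for the new `use_attention_mask` flag (which defaults to False in `wrap_model_if_needed`):

    from optimum.rbln import RBLNBartForConditionalGeneration

    # Compile for the RBLN NPU; with the new default the decoder is wrapped with
    # use_attention_mask=False, i.e. it relies on the paged *causal* attention op.
    model = RBLNBartForConditionalGeneration.from_pretrained(
        "facebook/bart-base",
        export=True,
        rbln_enc_max_seq_len=1024,     # falls back to 1024 when absent from model_cfg
        rbln_use_attention_mask=True,  # hypothetical knob selecting paged_attn_decode instead
    )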
optimum/rbln/transformers/models/bert/modeling_bert.py
CHANGED
@@ -56,7 +56,7 @@ class RBLNBertModel(RBLNModel):
         if max_position_embeddings is not None and rbln_max_seq_len > max_position_embeddings:
             raise ValueError("`rbln_max_seq_len` should be less or equal than max_position_embeddings!")

-        signature_params = inspect.signature(cls.
+        signature_params = inspect.signature(cls.get_hf_class().forward).parameters.keys()

         if rbln_model_input_names is None:
             for tokenizer in preprocessors:
|