optimum-rbln 0.7.2rc1__py3-none-any.whl → 0.7.3a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. optimum/rbln/__version__.py +9 -4
  2. optimum/rbln/diffusers/modeling_diffusers.py +18 -12
  3. optimum/rbln/modeling.py +1 -1
  4. optimum/rbln/modeling_base.py +15 -3
  5. optimum/rbln/ops/__init__.py +6 -2
  6. optimum/rbln/ops/attn.py +95 -7
  7. optimum/rbln/ops/flash_attn.py +43 -6
  8. optimum/rbln/transformers/modeling_generic.py +3 -3
  9. optimum/rbln/transformers/models/bart/bart_architecture.py +1 -1
  10. optimum/rbln/transformers/models/bart/modeling_bart.py +1 -1
  11. optimum/rbln/transformers/models/bert/modeling_bert.py +1 -1
  12. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +186 -78
  13. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +55 -17
  14. optimum/rbln/transformers/models/exaone/exaone_architecture.py +5 -3
  15. optimum/rbln/transformers/models/gemma/gemma_architecture.py +5 -3
  16. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +3 -3
  17. optimum/rbln/transformers/models/midm/midm_architecture.py +3 -3
  18. optimum/rbln/transformers/models/phi/phi_architecture.py +2 -2
  19. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +2 -2
  20. optimum/rbln/transformers/models/t5/modeling_t5.py +1 -1
  21. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +1 -1
  22. optimum/rbln/utils/import_utils.py +7 -0
  23. {optimum_rbln-0.7.2rc1.dist-info → optimum_rbln-0.7.3a0.dist-info}/METADATA +1 -1
  24. {optimum_rbln-0.7.2rc1.dist-info → optimum_rbln-0.7.3a0.dist-info}/RECORD +26 -26
  25. {optimum_rbln-0.7.2rc1.dist-info → optimum_rbln-0.7.3a0.dist-info}/WHEEL +0 -0
  26. {optimum_rbln-0.7.2rc1.dist-info → optimum_rbln-0.7.3a0.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/__version__.py CHANGED
@@ -1,8 +1,13 @@
-# file generated by setuptools_scm
+# file generated by setuptools-scm
 # don't change, don't track in version control
+
+__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
+
 TYPE_CHECKING = False
 if TYPE_CHECKING:
-    from typing import Tuple, Union
+    from typing import Tuple
+    from typing import Union
+
     VERSION_TUPLE = Tuple[Union[int, str], ...]
 else:
     VERSION_TUPLE = object
@@ -12,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.7.2rc1'
-__version_tuple__ = version_tuple = (0, 7, 2)
+__version__ = version = '0.7.3a0'
+__version_tuple__ = version_tuple = (0, 7, 3)
optimum/rbln/diffusers/modeling_diffusers.py CHANGED
@@ -71,13 +71,11 @@ class RBLNDiffusionMixin:
     _prefix = {}
 
     @classmethod
-    @property
-    def img2img_pipeline(cls):
+    def is_img2img_pipeline(cls):
         return "Img2Img" in cls.__name__
 
     @classmethod
-    @property
-    def inpaint_pipeline(cls):
+    def is_inpaint_pipeline(cls):
         return "Inpaint" in cls.__name__
 
     @classmethod
@@ -100,8 +98,8 @@ class RBLNDiffusionMixin:
            submodule_config.update({k: v for k, v in pipe_global_config.items() if k not in submodule_config})
            submodule_config.update(
                {
-                    "img2img_pipeline": cls.img2img_pipeline,
-                    "inpaint_pipeline": cls.inpaint_pipeline,
+                    "img2img_pipeline": cls.is_img2img_pipeline(),
+                    "inpaint_pipeline": cls.is_inpaint_pipeline(),
                }
            )
            submodule_config = submodule_cls.update_rbln_config_using_pipe(model, submodule_config)
@@ -112,6 +110,11 @@ class RBLNDiffusionMixin:
            submodule_cls: RBLNModel = getattr(importlib.import_module("optimum.rbln"), f"{submodule_class_name}")
            prefix = cls._prefix.get(submodule_name, "")
            connected_submodules = cls._connected_classes.get(submodule_name)._submodules
+            pipe_global_config = {k: v for k, v in submodule_config.items() if k not in connected_submodules}
+            submodule_config = {k: v for k, v in submodule_config.items() if k in connected_submodules}
+            for key in submodule_config.keys():
+                submodule_config[key].update(pipe_global_config)
+
            for connected_submodule_name in connected_submodules:
                connected_submodule_config = rbln_config.pop(prefix + connected_submodule_name, {})
                if connected_submodule_name in submodule_config:
@@ -119,14 +122,17 @@ class RBLNDiffusionMixin:
                else:
                    submodule_config[connected_submodule_name] = connected_submodule_config
 
-            submodules = copy.deepcopy(cls._submodules)
-            submodules += [prefix + connected_submodule_name for connected_submodule_name in connected_submodules]
+            pipe_global_config = {
+                k: v for k, v in rbln_config.items() if k != submodule_class_name and not isinstance(v, dict)
+            }
 
-            pipe_global_config = {k: v for k, v in rbln_config.items() if k not in submodules}
            for connected_submodule_name in connected_submodules:
-                submodule_config[connected_submodule_name].update(
-                    {k: v for k, v in pipe_global_config.items() if k not in submodule_config}
-                )
+                for k, v in pipe_global_config.items():
+                    if "guidance_scale" in k:
+                        if prefix + "guidance_scale" == k:
+                            submodule_config[connected_submodule_name]["guidance_scale"] = v
+                    else:
+                        submodule_config[connected_submodule_name][k] = v
            rbln_config[submodule_name] = submodule_config
        else:
            raise ValueError(f"submodule {submodule_name} isn't supported")
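Note: the `img2img_pipeline` / `inpaint_pipeline` accessors above were class-level properties (`@classmethod` stacked on `@property`), a combination deprecated in Python 3.11 and removed in 3.13; they are now plain classmethods, so call sites move from attribute access to an explicit call. A minimal sketch of the new call pattern (the class name below is a made-up stand-in, not from the package):

```python
class RBLNStableDiffusionImg2ImgPipelineLike:  # hypothetical stand-in class
    @classmethod
    def is_img2img_pipeline(cls) -> bool:
        # Same check as in the diff: the pipeline kind is inferred from the class name.
        return "Img2Img" in cls.__name__


# Old call sites read an attribute (cls.img2img_pipeline);
# new call sites invoke the classmethod explicitly:
assert RBLNStableDiffusionImg2ImgPipelineLike.is_img2img_pipeline() is True
```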
optimum/rbln/modeling.py CHANGED
@@ -196,7 +196,7 @@ class RBLNModel(RBLNBaseModel):
        **kwargs,
    ) -> "PreTrainedModel":
        kwargs = cls.update_kwargs(kwargs)
-        return cls.hf_class.from_pretrained(
+        return cls.get_hf_class().from_pretrained(
            model_id,
            subfolder=subfolder,
            revision=revision,
optimum/rbln/modeling_base.py CHANGED
@@ -389,8 +389,7 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
        return rbln_config
 
    @classmethod
-    @property
-    def hf_class(cls):
+    def get_hf_class(cls):
        """
        Lazily loads and caches the corresponding Hugging Face model class.
        Removes 'RBLN' prefix from the class name to get the original class name
@@ -416,7 +415,20 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
        return self.forward(*args, **kwargs)
 
    def __repr__(self):
-        return repr(self.model) + repr(self.rbln_submodules)
+        has_submodules = len(self.rbln_submodules) > 0
+        repr_str: str = f"<{self.__class__.__name__}>\n"
+        repr_str += f"- Total {len(self.model)} Runtimes"
+        repr_str += f" and {len(self.rbln_submodules)} Submodules\n" if has_submodules else "\n"
+        repr_str += "[Runtimes]\n"
+        repr_str += "\n".join([repr(model) for model in self.model])
+        repr_str += "\n"
+
+        if has_submodules > 0:
+            for i, submodule in enumerate(self.rbln_submodules):
+                repr_str += f"[Submodules {i} : {self._rbln_submodules[i]['name']}]\n"
+                repr_str += repr(submodule) + "\n"
+
+        return repr_str
 
    def __post_init__(self, **kwargs):
        pass
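Note: `hf_class` changes from a class-level property to an explicit `get_hf_class()` classmethod for the same Python-version reason. Per its docstring, it strips the `RBLN` prefix and resolves the original Hugging Face class; a standalone illustration of that lookup (a hypothetical helper assuming `transformers` is installed, not the package's actual implementation):

```python
import importlib


def resolve_hf_class(rbln_class_name: str):
    """Map an RBLN wrapper class name to the original Hugging Face class,
    mirroring what get_hf_class() is documented to do."""
    hf_name = rbln_class_name.removeprefix("RBLN")  # e.g. "RBLNBertModel" -> "BertModel"
    return getattr(importlib.import_module("transformers"), hf_name)


print(resolve_hf_class("RBLNBertModel"))  # <class 'transformers...BertModel'>
```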
optimum/rbln/ops/__init__.py CHANGED
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .attn import register_rbln_custom_attention, register_rbln_custom_attention_add_softmax
-from .flash_attn import register_rbln_custom_flash_attention
+from .attn import (
+    register_rbln_custom_attention_add_softmax,
+    register_rbln_custom_causal_masked_attention,
+    register_rbln_custom_masked_attention,
+)
+from .flash_attn import register_rbln_custom_flash_causal_masked_attention, register_rbln_custom_flash_masked_attention
 from .kv_cache_update import register_rbln_custom_cache_update
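Note: every `register_rbln_custom_*` helper exported here follows the same `torch.library` pattern shown in the hunks below: define a schema, attach a CPU body that the RBLN compiler treats as a fusion pattern, and register a fake (meta) kernel for tracing. A minimal self-contained sketch of that pattern with a made-up op name (assumes PyTorch ≥ 2.4 for `torch.library.register_fake`; older releases expose the same hook as `torch.library.impl_abstract`):

```python
from functools import lru_cache

import torch


@lru_cache
def register_demo_identity_op():
    # Hypothetical op, not part of optimum-rbln; it only demonstrates the
    # define / impl / register_fake registration pattern.
    torch.library.define("demo_ops::identity", "(Tensor x) -> Tensor")

    @torch.library.impl("demo_ops::identity", "cpu")
    def identity_cpu(x):
        # Reference body; a backend compiler could pattern-match and replace it.
        return x.clone()

    @torch.library.register_fake("demo_ops::identity")
    def identity_fake(x):
        # Shape/dtype-only kernel used during tracing and export.
        return torch.empty_like(x)


register_demo_identity_op()
print(torch.ops.demo_ops.identity(torch.ones(2)))  # tensor([1., 1.])
```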
optimum/rbln/ops/attn.py CHANGED
@@ -25,13 +25,13 @@ else:
 
 
 @lru_cache
-def register_rbln_custom_attention():
+def register_rbln_custom_masked_attention():
    torch.library.define(
-        "rbln_custom_ops::attn_decode",
+        "rbln_custom_ops::masked_attn_decode",
        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d) -> Tensor[]",
    )
 
-    @torch.library.impl("rbln_custom_ops::attn_decode", "cpu")
+    @torch.library.impl("rbln_custom_ops::masked_attn_decode", "cpu")
    def attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale):
        """Defines the computation pattern for fused attention with KV cache updates.
 
@@ -66,7 +66,7 @@ def register_rbln_custom_attention():
            torch.empty(*vcache.shape, device=vcache.device),
        )
 
-    @register_fake("rbln_custom_ops::attn_decode")
+    @register_fake("rbln_custom_ops::masked_attn_decode")
    def attn_decode_abstract(q, k, v, m, kcache, vcache, seq, partition):
        return (
            q,
@@ -75,11 +75,11 @@ def register_rbln_custom_attention():
        )
 
    torch.library.define(
-        "rbln_custom_ops::attn_prefill",
+        "rbln_custom_ops::masked_attn_prefill",
        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e) -> Tensor[]",
    )
 
-    @torch.library.impl("rbln_custom_ops::attn_prefill", "cpu")
+    @torch.library.impl("rbln_custom_ops::masked_attn_prefill", "cpu")
    def attn_prefill_cpu(q, k, v, mask, kcache, vcache, batch, seq, scale):
        """Defines the computation pattern for prefill phase attention with KV cache updates.
 
@@ -109,11 +109,99 @@ def register_rbln_custom_attention():
        """
        return q, kcache, vcache
 
-    @register_fake("rbln_custom_ops::attn_prefill")
+    @register_fake("rbln_custom_ops::masked_attn_prefill")
    def attn_prefill_abstract(q, k, v, m, kcache, vcache, batch, seq, partition):
        return q, kcache, vcache
 
 
+@lru_cache
+def register_rbln_custom_causal_masked_attention():
+    torch.library.define(
+        "rbln_custom_ops::causal_masked_attn_decode",
+        "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d) -> Tensor[]",
+    )
+
+    @torch.library.impl("rbln_custom_ops::causal_masked_attn_decode", "cpu")
+    def attn_decode_cpu(q, k, v, kcache, vcache, seq, scale):
+        """Defines the computation pattern for fused attention with KV cache updates.
+
+        IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+        a single optimized NPU operation. It is NOT meant for CPU execution.
+
+        Pattern components that compiler fuses into a single op:
+            1. KV cache updates with new key/value states
+            2. Scaled dot-product attention computation
+            3. Causal masked softmax operation
+            4. Final attention output computation
+
+        Expected tensor shapes:
+            - q: [batch=1, n_heads, n_groups, 1, head_dim] - Query states for single token
+            - k: [batch=1, n_heads, 1, 1, head_dim] - Key states for current input
+            - v: [batch=1, n_heads, 1, 1, head_dim] - Value states for current input
+            - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+            - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+            - seq: [1] - Current sequence position
+            - scale: [] - Attention scale factor
+
+        Returns:
+            Tuple[Tensor, Tensor, Tensor]:
+            - attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
+            - kcache: Same shape as input kcache, batch=1 - Placeholder for compiler
+            - vcache: Same shape as input vcache, batch=1 - Placeholder for compiler
+        """
+        return (
+            q,
+            torch.empty(*kcache.shape, device=kcache.device),
+            torch.empty(*vcache.shape, device=vcache.device),
+        )
+
+    @register_fake("rbln_custom_ops::causal_masked_attn_decode")
+    def attn_decode_abstract(q, k, v, kcache, vcache, seq, partition):
+        return (
+            q,
+            torch.empty(*kcache.shape, device=kcache.device),
+            torch.empty(*vcache.shape, device=vcache.device),
+        )
+
+    torch.library.define(
+        "rbln_custom_ops::causal_masked_attn_prefill",
+        "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e) -> Tensor[]",
+    )
+
+    @torch.library.impl("rbln_custom_ops::causal_masked_attn_prefill", "cpu")
+    def attn_prefill_cpu(q, k, v, kcache, vcache, batch, seq, scale):
+        """Defines the computation pattern for prefill phase attention with KV cache updates.
+
+        IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+        a single optimized NPU operation. It is NOT meant for CPU execution.
+
+        Key differences from decode pattern:
+            - Handles prefill phase with multiple input tokens
+            - Takes explicit batch index for continuous batching
+
+        Expected tensor shapes:
+            - q: [batch=1, n_heads, n_groups, seq_len, head_dim] - Query states for multiple tokens
+            - k: [batch=1, n_heads, 1, seq_len, head_dim] - Key states for current input
+            - v: [batch=1, n_heads, 1, seq_len, head_dim] - Value states for current input
+            - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+            - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+            - batch: [1] - Batch index for cache access
+            - seq: [1] - Starting sequence position
+            - scale: [] - Attention scale factor
+
+        Returns:
+            Tuple[Tensor, Tensor, Tensor]:
+            - attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
+            - empty_kcache: Same shape as input kcache - Placeholder for compiler
+            - empty_vcache: Same shape as input vcache - Placeholder for compiler
+        """
+        return q, kcache, vcache
+
+    @register_fake("rbln_custom_ops::causal_masked_attn_prefill")
+    def attn_prefill_abstract(q, k, v, kcache, vcache, batch, seq, partition):
+        return q, kcache, vcache
+
+
 @lru_cache
 def register_rbln_custom_attention_add_softmax():
    torch.library.define(
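Note: combining the shapes documented above, a decode-phase call to the new causal op looks like the following. This only exercises the CPU pattern stub (which echoes `q` and returns placeholder caches); on an RBLN NPU the compiler replaces the pattern with a fused kernel. Dimensions are arbitrary, and the import path assumes the layout of this wheel:

```python
import torch
from optimum.rbln.ops import register_rbln_custom_causal_masked_attention

register_rbln_custom_causal_masked_attention()

n_heads, n_groups, head_dim, max_seq_len = 4, 2, 64, 128
q = torch.randn(1, n_heads, n_groups, 1, head_dim)        # query for a single decode token
k = torch.randn(1, n_heads, 1, 1, head_dim)
v = torch.randn(1, n_heads, 1, 1, head_dim)
kcache = torch.zeros(1, n_heads, 1, max_seq_len, head_dim)
vcache = torch.zeros(1, n_heads, 1, max_seq_len, head_dim)
seq = torch.tensor([0])                                    # current sequence position
scale = torch.tensor(head_dim**-0.5)                       # attention scale factor

attn_out, _, _ = torch.ops.rbln_custom_ops.causal_masked_attn_decode(
    q, k, v, kcache, vcache, seq, scale
)
print(attn_out.shape)  # torch.Size([1, 4, 2, 1, 64]) -- CPU stub just returns q
```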
optimum/rbln/ops/flash_attn.py CHANGED
@@ -25,13 +25,13 @@ else:
 
 
 @lru_cache
-def register_rbln_custom_flash_attention():
+def register_rbln_custom_flash_masked_attention():
    torch.library.define(
-        "rbln_custom_ops::flash_attn_decode",
+        "rbln_custom_ops::flash_masked_attn_decode",
        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, int e) -> Tensor[]",
    )
 
-    @torch.library.impl("rbln_custom_ops::flash_attn_decode", "cpu")
+    @torch.library.impl("rbln_custom_ops::flash_masked_attn_decode", "cpu")
    def flash_attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale, partition):
        return (
            q,
@@ -39,7 +39,7 @@ def register_rbln_custom_flash_attention():
            torch.empty(*vcache.shape, device=vcache.device),
        )
 
-    @register_fake("rbln_custom_ops::flash_attn_decode")
+    @register_fake("rbln_custom_ops::flash_masked_attn_decode")
    def flash_attn_decode_abstract(q, k, v, m, kcache, vcache, seq, scale, partition):
        return (
            q,
@@ -48,7 +48,7 @@ def register_rbln_custom_flash_attention():
        )
 
    torch.library.define(
-        "rbln_custom_ops::flash_attn_prefill",
+        "rbln_custom_ops::flash_masked_attn_prefill",
        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor[]",
    )
 
@@ -56,6 +56,43 @@ def register_rbln_custom_flash_attention():
    def flash_attn_prefill_cpu(q, k, v, mask, kcache, vcache, batch, seq, scale, partition):
        return q, kcache, vcache
 
-    @register_fake("rbln_custom_ops::flash_attn_prefill")
+    @register_fake("rbln_custom_ops::flash_masked_attn_prefill")
    def flash_attn_prefill_abstract(q, k, v, m, kcache, vcache, batch, seq, scale, partition):
        return q, kcache, vcache
+
+
+@lru_cache
+def register_rbln_custom_flash_causal_masked_attention():
+    torch.library.define(
+        "rbln_custom_ops::flash_causal_masked_attn_decode",
+        "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, int e) -> Tensor[]",
+    )
+
+    @torch.library.impl("rbln_custom_ops::flash_causal_masked_attn_decode", "cpu")
+    def flash_attn_decode_cpu(q, k, v, kcache, vcache, seq, scale, partition):
+        return (
+            q,
+            torch.empty(*kcache.shape, device=kcache.device),
+            torch.empty(*vcache.shape, device=vcache.device),
+        )
+
+    @register_fake("rbln_custom_ops::flash_causal_masked_attn_decode")
+    def flash_attn_decode_abstract(q, k, v, kcache, vcache, seq, scale, partition):
+        return (
+            q,
+            torch.empty(*kcache.shape, device=kcache.device),
+            torch.empty(*vcache.shape, device=vcache.device),
+        )
+
+    torch.library.define(
+        "rbln_custom_ops::flash_causal_masked_attn_prefill",
+        "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor[]",
+    )
+
+    @torch.library.impl("rbln_custom_ops::flash_causal_masked_attn_prefill", "cpu")
+    def flash_attn_prefill_cpu(q, k, v, kcache, vcache, batch, seq, scale, partition):
+        return q, kcache, vcache
+
+    @register_fake("rbln_custom_ops::flash_causal_masked_attn_prefill")
+    def flash_attn_prefill_abstract(q, k, v, kcache, vcache, batch, seq, scale, partition):
+        return q, kcache, vcache
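Note: the flash variants registered above keep the same argument layout as their non-flash counterparts plus a trailing `int` (bound to `partition` in the kernels). A compact sketch of a decode call against the CPU stub, with arbitrary dimensions and partition value:

```python
import torch
from optimum.rbln.ops import register_rbln_custom_flash_causal_masked_attention

register_rbln_custom_flash_causal_masked_attention()

heads, dim, max_len = 4, 64, 128
out, _, _ = torch.ops.rbln_custom_ops.flash_causal_masked_attn_decode(
    torch.randn(1, heads, 2, 1, dim),        # q
    torch.randn(1, heads, 1, 1, dim),        # k
    torch.randn(1, heads, 1, 1, dim),        # v
    torch.zeros(1, heads, 1, max_len, dim),  # kcache
    torch.zeros(1, heads, 1, max_len, dim),  # vcache
    torch.tensor([0]),                       # seq
    torch.tensor(dim**-0.5),                 # scale
    128,                                     # partition (the extra int argument)
)
print(out.shape)  # torch.Size([1, 4, 2, 1, 64])
```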
optimum/rbln/transformers/modeling_generic.py CHANGED
@@ -73,7 +73,7 @@ class RBLNModelForQuestionAnswering(RBLNModel):
        if rbln_batch_size is None:
            rbln_batch_size = 1
 
-        signature_params = inspect.signature(cls.hf_class.forward).parameters.keys()
+        signature_params = inspect.signature(cls.get_hf_class().forward).parameters.keys()
 
        if rbln_model_input_names is None:
            for tokenizer in preprocessors:
@@ -289,7 +289,7 @@ class RBLNModelForSequenceClassification(RBLNModel):
        if max_position_embeddings is not None and rbln_max_seq_len > max_position_embeddings:
            raise ValueError("`rbln_enc_max_seq_len` should be less or equal than max_position_embeddings!")
 
-        signature_params = inspect.signature(cls.hf_class.forward).parameters.keys()
+        signature_params = inspect.signature(cls.get_hf_class().forward).parameters.keys()
 
        if rbln_model_input_names is None:
            for tokenizer in preprocessors:
@@ -362,7 +362,7 @@ class RBLNModelForMaskedLM(RBLNModel):
        if max_position_embeddings is not None and rbln_max_seq_len > max_position_embeddings:
            raise ValueError("`rbln_enc_max_seq_len` should be less or equal than max_position_embeddings!")
 
-        signature_params = inspect.signature(cls.hf_class.forward).parameters.keys()
+        signature_params = inspect.signature(cls.get_hf_class().forward).parameters.keys()
 
        if rbln_model_input_names is None:
            for tokenizer in preprocessors:
optimum/rbln/transformers/models/bart/bart_architecture.py CHANGED
@@ -142,7 +142,7 @@ class BartSelfAttention(Seq2SeqSelfAttention):
        self.num_heads = self._original_mod.num_heads
        self.head_dim = self._original_mod.embed_dim // self._original_mod.num_heads
        self.scaling = self.head_dim**-0.5
-        self.attn_decode = torch.ops.rbln_custom_ops.attn_decode
+        self.attn_decode = torch.ops.rbln_custom_ops.masked_attn_decode
 
    def projection(self, hidden_states) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        query_states = self.q_proj(hidden_states) * self.scaling
optimum/rbln/transformers/models/bart/modeling_bart.py CHANGED
@@ -58,7 +58,7 @@ class RBLNBartModel(RBLNModel):
        if max_position_embeddings is not None and rbln_max_seq_len > max_position_embeddings:
            raise ValueError("`rbln_max_seq_len` should be less or equal than max_position_embeddings!")
 
-        signature_params = inspect.signature(cls.hf_class.forward).parameters.keys()
+        signature_params = inspect.signature(cls.get_hf_class().forward).parameters.keys()
 
        if rbln_model_input_names is None:
            for tokenizer in preprocessors:
optimum/rbln/transformers/models/bert/modeling_bert.py CHANGED
@@ -56,7 +56,7 @@ class RBLNBertModel(RBLNModel):
        if max_position_embeddings is not None and rbln_max_seq_len > max_position_embeddings:
            raise ValueError("`rbln_max_seq_len` should be less or equal than max_position_embeddings!")
 
-        signature_params = inspect.signature(cls.hf_class.forward).parameters.keys()
+        signature_params = inspect.signature(cls.get_hf_class().forward).parameters.keys()
 
        if rbln_model_input_names is None:
            for tokenizer in preprocessors:
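Note: all of the encoder wrappers above derive their compile-time input names the same way, by inspecting the forward signature of the class returned by `get_hf_class()`. A standalone illustration of the idea using plain `transformers` (assumes it is installed):

```python
import inspect

from transformers import BertModel

signature_params = inspect.signature(BertModel.forward).parameters.keys()
print(list(signature_params)[:4])
# e.g. ['self', 'input_ids', 'attention_mask', 'token_type_ids']
```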