optimum-rbln 0.1.15__py3-none-any.whl → 0.2.1a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +26 -33
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/diffusers/__init__.py +4 -0
- optimum/rbln/{modeling_diffusers.py → diffusers/modeling_diffusers.py} +66 -24
- optimum/rbln/diffusers/models/__init__.py +2 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +38 -12
- optimum/rbln/diffusers/models/autoencoders/vae.py +0 -1
- optimum/rbln/diffusers/models/controlnet.py +1 -1
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +1 -1
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +5 -7
- optimum/rbln/diffusers/pipelines/__init__.py +1 -0
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +8 -7
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +17 -2
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -2
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +17 -2
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +17 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +23 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +1 -2
- optimum/rbln/modeling.py +13 -347
- optimum/rbln/modeling_base.py +24 -4
- optimum/rbln/modeling_config.py +31 -7
- optimum/rbln/ops/__init__.py +26 -0
- optimum/rbln/ops/attn.py +221 -0
- optimum/rbln/ops/flash_attn.py +70 -0
- optimum/rbln/ops/kv_cache_update.py +69 -0
- optimum/rbln/transformers/__init__.py +20 -0
- optimum/rbln/{modeling_alias.py → transformers/modeling_alias.py} +5 -1
- optimum/rbln/transformers/modeling_generic.py +385 -0
- optimum/rbln/transformers/models/auto/__init__.py +23 -0
- optimum/rbln/transformers/models/auto/modeling_auto.py +0 -1
- optimum/rbln/transformers/models/bart/__init__.py +0 -1
- optimum/rbln/transformers/models/bart/bart_architecture.py +107 -464
- optimum/rbln/transformers/models/bart/modeling_bart.py +8 -4
- optimum/rbln/transformers/models/clip/modeling_clip.py +1 -1
- optimum/rbln/transformers/models/decoderonly/__init__.py +0 -7
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +329 -328
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +92 -107
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +2 -3
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +10 -10
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +1 -1
- optimum/rbln/transformers/models/llama/llama_architecture.py +0 -1
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +1 -0
- optimum/rbln/transformers/models/midm/midm_architecture.py +11 -11
- optimum/rbln/transformers/models/midm/modeling_midm.py +0 -1
- optimum/rbln/transformers/models/mistral/mistral_architecture.py +0 -1
- optimum/rbln/transformers/models/phi/phi_architecture.py +2 -3
- optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +0 -1
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +57 -57
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +498 -0
- optimum/rbln/transformers/models/t5/__init__.py +0 -1
- optimum/rbln/transformers/models/t5/modeling_t5.py +5 -2
- optimum/rbln/transformers/models/t5/t5_architecture.py +106 -448
- optimum/rbln/transformers/models/whisper/generation_whisper.py +42 -0
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +77 -54
- optimum/rbln/transformers/models/whisper/whisper_architecture.py +219 -312
- optimum/rbln/transformers/utils/rbln_quantization.py +1 -2
- optimum/rbln/utils/decorator_utils.py +51 -15
- optimum/rbln/utils/import_utils.py +8 -1
- optimum/rbln/utils/logging.py +38 -1
- optimum/rbln/utils/model_utils.py +0 -1
- optimum/rbln/utils/runtime_utils.py +9 -3
- optimum/rbln/utils/save_utils.py +17 -0
- optimum/rbln/utils/submodule.py +23 -0
- optimum_rbln-0.2.1a0.dist-info/METADATA +121 -0
- {optimum_rbln-0.1.15.dist-info → optimum_rbln-0.2.1a0.dist-info}/RECORD +76 -72
- optimum_rbln-0.2.1a0.dist-info/licenses/LICENSE +288 -0
- optimum/rbln/transformers/cache_utils.py +0 -107
- optimum/rbln/utils/timer_utils.py +0 -43
- optimum_rbln-0.1.15.dist-info/METADATA +0 -106
- optimum_rbln-0.1.15.dist-info/licenses/LICENSE +0 -201
- {optimum_rbln-0.1.15.dist-info → optimum_rbln-0.2.1a0.dist-info}/WHEEL +0 -0
optimum/rbln/modeling_config.py
CHANGED
@@ -91,21 +91,36 @@ class RBLNCompileConfig:
         self.tensor_parallel_size = kwargs.get("tensor_parallel_size", self.tensor_parallel_size)
         return self
 
-    def get_dummy_inputs(self, fill=0):
+    def get_dummy_inputs(
+        self, fill=0, static_tensors: Dict[str, torch.Tensor] = {}, meta_tensor_names: List[str] = []
+    ):
         dummy = []
         for name, shape, dtype in self.input_info:
-            dummy.append(
-                torch.fill(torch.empty(*shape, dtype=getattr(torch, dtype)), fill)
-                if len(shape) > 0
-                else torch.tensor(fill, dtype=getattr(torch, dtype))
-            )
+            if name in static_tensors:
+                tensor = static_tensors[name]
+                if shape != list(tensor.shape):
+                    raise RuntimeError(f"Different shape for dummy inputs. ({shape} != {list(tensor.shape)})")
+                if getattr(torch, dtype) != tensor.dtype:
+                    raise RuntimeError(f"Different dtype for dummy inputs ({dtype} != {tensor.dtype})")
+                dummy.append(tensor)
+            else:
+                if name in meta_tensor_names:
+                    device = "meta"
+                else:
+                    device = "cpu"
+
+                dummy.append(
+                    torch.fill(torch.empty(*shape, dtype=getattr(torch, dtype), device=torch.device(device)), fill)
+                    if len(shape) > 0
+                    else torch.tensor(fill, dtype=getattr(torch, dtype), device=torch.device(device))
+                )
         return tuple(dummy)
 
     def asdict(self):
         return asdict(self)
 
 
-RUNTIME_KEYWORDS = ["create_runtimes", "optimize_host_memory", "device", "device_map"]
+RUNTIME_KEYWORDS = ["create_runtimes", "optimize_host_memory", "device", "device_map", "activate_profiler"]
 COMPILE_KEYWORDS = ["compiled_model_name", "mod_name", "input_info", "fusion", "npu", "tensor_parallel_size"]
 
 
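The reworked `get_dummy_inputs` adds two escape hatches: `static_tensors` lets a caller inject real tensors (validated against the registered shape and dtype) instead of freshly filled placeholders, and `meta_tensor_names` allocates selected inputs on PyTorch's storage-less "meta" device so large buffers are never materialized during tracing. A minimal sketch of the resulting behavior, with made-up input names and shapes, and `torch.full` standing in for the diff's `torch.fill(torch.empty(...), fill)`:

    import torch

    # Illustrative stand-in for RBLNCompileConfig.input_info: (name, shape, dtype) triples.
    input_info = [("input_ids", [1, 8], "int64"), ("kcache", [1, 4, 128, 64], "float32")]
    static_tensors = {"kcache": torch.zeros(1, 4, 128, 64)}  # passed through after validation
    meta_tensor_names = ["input_ids"]                         # shape/dtype only, no storage

    dummy = []
    for name, shape, dtype in input_info:
        if name in static_tensors:
            dummy.append(static_tensors[name])  # the diff also checks shape and dtype here
        else:
            device = "meta" if name in meta_tensor_names else "cpu"
            dummy.append(torch.full(shape, 0, dtype=getattr(torch, dtype), device=device))

    print([(tuple(t.shape), t.device.type) for t in dummy])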
@@ -243,6 +258,15 @@ class RBLNConfig:
             return rbln_device_map
         return self.runtime_cfg["device_map"]
 
+    @property
+    def activate_profiler(self):
+        context = ContextRblnConfig.get_current_context()["activate_profiler"]
+        if context:
+            return context
+        elif self.runtime_cfg.get("activate_profiler", None) is None:
+            return False
+        return self.runtime_cfg["activate_profiler"]
+
 
 def use_rbln_config(fn):
     """
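The `activate_profiler` property resolves the new runtime keyword with a fixed precedence: a value set through `ContextRblnConfig` wins, then the stored runtime config, then a `False` default. A small standalone sketch of that precedence, with plain dicts standing in for the context and `runtime_cfg`:

    # Sketch of RBLNConfig.activate_profiler's resolution order, as read from the diff.
    def resolve_activate_profiler(ctx: dict, runtime_cfg: dict):
        if ctx.get("activate_profiler"):  # value set via ContextRblnConfig wins
            return ctx["activate_profiler"]
        if runtime_cfg.get("activate_profiler", None) is None:
            return False                  # profiling stays off unless requested
        return runtime_cfg["activate_profiler"]

    assert resolve_activate_profiler({}, {}) is False
    assert resolve_activate_profiler({"activate_profiler": True}, {}) is True
    assert resolve_activate_profiler({}, {"activate_profiler": False}) is False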
optimum/rbln/ops/__init__.py
ADDED
@@ -0,0 +1,26 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Portions of this software are licensed under the Apache License,
+# Version 2.0. See the NOTICE file distributed with this work for
+# additional information regarding copyright ownership.
+
+# All other portions of this software, including proprietary code,
+# are the intellectual property of Rebellions Inc. and may not be
+# copied, modified, or distributed without prior written permission
+# from Rebellions Inc.
+
+from .attn import register_rbln_custom_attention, register_rbln_custom_attention_add_softmax
+from .flash_attn import register_rbln_custom_flash_attention
+from .kv_cache_update import register_rbln_custom_cache_update
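All three helpers are memoized with `@lru_cache` (see the modules below), so calling them repeatedly registers each schema with `torch.library` only once. An illustrative usage, assuming the 0.2.1a0 wheel is installed: register the op schemas once before tracing or compiling a model that references them.

    from optimum.rbln.ops import (
        register_rbln_custom_attention,
        register_rbln_custom_cache_update,
        register_rbln_custom_flash_attention,
    )

    register_rbln_custom_attention()        # repeated calls are no-ops thanks to @lru_cache
    register_rbln_custom_flash_attention()
    register_rbln_custom_cache_update()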
optimum/rbln/ops/attn.py
ADDED
@@ -0,0 +1,221 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Portions of this software are licensed under the Apache License,
+# Version 2.0. See the NOTICE file distributed with this work for
+# additional information regarding copyright ownership.
+
+# All other portions of this software, including proprietary code,
+# are the intellectual property of Rebellions Inc. and may not be
+# copied, modified, or distributed without prior written permission
+# from Rebellions Inc.
+
+from functools import lru_cache
+
+import torch
+from packaging import version
+
+
+if version.parse(torch.__version__) > version.parse("2.4.0"):
+    register_fake = torch.library.register_fake
+else:
+    register_fake = torch.library.impl_abstract
+
+
+@lru_cache
+def register_rbln_custom_attention():
+    torch.library.define(
+        "rbln_custom_ops::attn_decode",
+        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d) -> Tensor[]",
+    )
+
+    @torch.library.impl("rbln_custom_ops::attn_decode", "cpu")
+    def attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale):
+        """Defines the computation pattern for fused attention with KV cache updates.
+
+        IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+        a single optimized NPU operation. It is NOT meant for CPU execution.
+
+        Pattern components that compiler fuses into a single op:
+            1. KV cache updates with new key/value states
+            2. Scaled dot-product attention computation
+            3. Masked softmax operation
+            4. Final attention output computation
+
+        Expected tensor shapes:
+            - q: [batch=1, n_heads, n_groups, 1, head_dim] - Query states for single token
+            - k: [batch=1, n_heads, 1, 1, head_dim] - Key states for current input
+            - v: [batch=1, n_heads, 1, 1, head_dim] - Value states for current input
+            - mask: [batch=1, n_heads, 1, 1, max_seq_len] - Attention mask
+            - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+            - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+            - seq: [1] - Current sequence position
+            - scale: [] - Attention scale factor
+
+        Returns:
+            Tuple[Tensor, Tensor, Tensor]:
+                - attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
+                - kcache: Same shape as input kcache, batch=1 - Placeholder for compiler
+                - vcache: Same shape as input vcache, batch=1 - Placeholder for compiler
+        """
+        return (
+            q,
+            torch.empty(1, *kcache.shape[1:], device=kcache.device),
+            torch.empty(1, *vcache.shape[1:], device=vcache.device),
+        )
+
+    @register_fake("rbln_custom_ops::attn_decode")
+    def attn_decode_abstract(q, k, v, m, kcache, vcache, seq, partition):
+        return (
+            q,
+            torch.empty(1, *kcache.shape[1:], device=kcache.device),
+            torch.empty(1, *vcache.shape[1:], device=vcache.device),
+        )
+
+    torch.library.define(
+        "rbln_custom_ops::attn_prefill",
+        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e) -> Tensor[]",
+    )
+
+    @torch.library.impl("rbln_custom_ops::attn_prefill", "cpu")
+    def attn_prefill_cpu(q, k, v, mask, kcache, vcache, batch, seq, scale):
+        """Defines the computation pattern for prefill phase attention with KV cache updates.
+
+        IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+        a single optimized NPU operation. It is NOT meant for CPU execution.
+
+        Key differences from decode pattern:
+            - Handles prefill phase with multiple input tokens
+            - Takes explicit batch index for continuous batching
+
+        Expected tensor shapes:
+            - q: [batch=1, n_heads, n_groups, seq_len, head_dim] - Query states for multiple tokens
+            - k: [batch=1, n_heads, 1, seq_len, head_dim] - Key states for current input
+            - v: [batch=1, n_heads, 1, seq_len, head_dim] - Value states for current input
+            - mask: [batch=1, 1, 1, seq_len, max_seq_len] - Attention mask
+            - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+            - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+            - batch: [1] - Batch index for cache access
+            - seq: [1] - Starting sequence position
+            - scale: [] - Attention scale factor
+
+        Returns:
+            Tuple[Tensor, Tensor, Tensor]:
+                - attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
+                - empty_kcache: Same shape as input kcache - Placeholder for compiler
+                - empty_vcache: Same shape as input vcache - Placeholder for compiler
+        """
+        return q, kcache, vcache
+
+    @register_fake("rbln_custom_ops::attn_prefill")
+    def attn_prefill_abstract(q, k, v, m, kcache, vcache, batch, seq, partition):
+        return q, kcache, vcache
+
+
+@lru_cache
+def register_rbln_custom_attention_add_softmax():
+    torch.library.define(
+        "rbln_custom_ops::attn_decode_add_softmax",
+        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d) -> Tensor[]",
+    )
+
+    @torch.library.impl("rbln_custom_ops::attn_decode_add_softmax", "cpu")
+    def attn_decode_add_softmax_cpu(q, k, v, mask, kcache, vcache, seq, scale):
+        """Defines the computation pattern for fused attention with KV cache updates.
+
+        IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+        a single optimized NPU operation. It is NOT meant for CPU execution.
+
+        Pattern components that compiler fuses into a single op:
+            1. KV cache updates with new key/value states
+            2. Scaled dot-product attention computation
+            3. add-softmax operation
+            4. Final attention output computation
+
+        Expected tensor shapes:
+            - q: [batch=1, n_heads, n_groups, 1, head_dim] - Query states for single token
+            - k: [batch=1, n_heads, 1, 1, head_dim] - Key states for current input
+            - v: [batch=1, n_heads, 1, 1, head_dim] - Value states for current input
+            - mask: [batch=1, n_heads, 1, 1, max_seq_len] - Attention mask
+            - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+            - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+            - seq: [1] - Current sequence position
+            - scale: [] - Attention scale factor
+
+        Returns:
+            Tuple[Tensor, Tensor, Tensor]:
+                - attn_output: [batch=1, n_heads, 1, 1, head_dim] - Attention output
+                - kcache: Same shape as input kcache, batch=1 - Placeholder for compiler
+                - vcache: Same shape as input vcache, batch=1 - Placeholder for compiler
+        """
+        return (
+            q,
+            torch.empty(1, *kcache.shape[1:], device=kcache.device),
+            torch.empty(1, *vcache.shape[1:], device=vcache.device),
+        )
+
+    @register_fake("rbln_custom_ops::attn_decode_add_softmax")
+    def attn_decode_add_softmax_abstract(q, k, v, m, kcache, vcache, seq, partition):
+        return (
+            q,
+            torch.empty(1, *kcache.shape[1:], device=kcache.device),
+            torch.empty(1, *vcache.shape[1:], device=vcache.device),
+        )
+
+    torch.library.define(
+        "rbln_custom_ops::attn_prefill_add_softmax",
+        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e) -> Tensor[]",
+    )
+
+    @torch.library.impl("rbln_custom_ops::attn_prefill_add_softmax", "cpu")
+    def attn_prefill_add_softmax_cpu(q, k, v, mask, kcache, vcache, batch, seq, scale):
+        """Defines the computation pattern for prefill phase attention with KV cache updates.
+
+        IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+        a single optimized NPU operation. It is NOT meant for CPU execution.
+
+        Key differences from decode pattern:
+            - Handles prefill phase with multiple input tokens
+            - Takes explicit batch index for continuous batching
+
+        Expected tensor shapes:
+            - q: [batch=1, n_heads, n_groups, seq_len, head_dim] - Query states for multiple tokens
+            - k: [batch=1, n_heads, 1, seq_len, head_dim] - Key states for current input
+            - v: [batch=1, n_heads, 1, seq_len, head_dim] - Value states for current input
+            - mask: [batch=1, 1, 1, seq_len, max_seq_len] - Attention mask
+            - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+            - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+            - batch: [1] - Batch index for cache access
+            - seq: [1] - Starting sequence position
+            - scale: [] - Attention scale factor
+
+        Returns:
+            Tuple[Tensor, Tensor, Tensor]:
+                - attn_output: [batch=1, n_heads, seq_len, 1, head_dim] - Attention output
+                - empty_kcache: Same shape as input kcache - Placeholder for compiler
+                - empty_vcache: Same shape as input vcache - Placeholder for compiler
+        """
+        return (
+            q,
+            torch.empty(1, *kcache.shape[1:], device=kcache.device),
+            torch.empty(1, *vcache.shape[1:], device=vcache.device),
+        )
+
+    @register_fake("rbln_custom_ops::attn_prefill_add_softmax")
+    def attn_prefill_add_softmax_abstract(q, k, v, m, kcache, vcache, batch, seq, partition):
+        return (
+            q,
+            torch.empty(1, *kcache.shape[1:], device=kcache.device),
+            torch.empty(1, *vcache.shape[1:], device=vcache.device),
+        )
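Per the docstrings above, the decode variant consumes one query token plus the full caches and, on CPU, only echoes its inputs: it is a pattern stub for the compiler, not a working attention kernel. A hedged smoke test with the documented shapes (head counts and lengths below are arbitrary):

    import torch
    from optimum.rbln.ops import register_rbln_custom_attention

    register_rbln_custom_attention()

    q = torch.randn(1, 4, 1, 1, 64)         # [batch=1, n_heads, n_groups, 1, head_dim]
    k = torch.randn(1, 4, 1, 1, 64)
    v = torch.randn(1, 4, 1, 1, 64)
    mask = torch.zeros(1, 4, 1, 1, 128)     # [..., max_seq_len]
    kcache = torch.zeros(2, 4, 1, 128, 64)  # [batch_size, n_heads, 1, max_seq_len, head_dim]
    vcache = torch.zeros(2, 4, 1, 128, 64)
    seq = torch.tensor([5])                 # current sequence position
    scale = torch.tensor(64**-0.5)          # [] - attention scale factor

    out, k_ph, v_ph = torch.ops.rbln_custom_ops.attn_decode(q, k, v, mask, kcache, vcache, seq, scale)
    assert out.shape == q.shape             # CPU stub passes q through
    assert k_ph.shape[0] == 1               # placeholder caches come back with batch=1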
optimum/rbln/ops/flash_attn.py
ADDED
@@ -0,0 +1,70 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Portions of this software are licensed under the Apache License,
+# Version 2.0. See the NOTICE file distributed with this work for
+# additional information regarding copyright ownership.
+
+# All other portions of this software, including proprietary code,
+# are the intellectual property of Rebellions Inc. and may not be
+# copied, modified, or distributed without prior written permission
+# from Rebellions Inc.
+
+from functools import lru_cache
+
+import torch
+from packaging import version
+
+
+if version.parse(torch.__version__) > version.parse("2.4.0"):
+    register_fake = torch.library.register_fake
+else:
+    register_fake = torch.library.impl_abstract
+
+
+@lru_cache
+def register_rbln_custom_flash_attention():
+    torch.library.define(
+        "rbln_custom_ops::flash_attn_decode",
+        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, int e) -> Tensor[]",
+    )
+
+    @torch.library.impl("rbln_custom_ops::flash_attn_decode", "cpu")
+    def flash_attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale, partition):
+        return (
+            q,
+            torch.empty(1, *kcache.shape[1:], device=kcache.device),
+            torch.empty(1, *vcache.shape[1:], device=vcache.device),
+        )
+
+    @register_fake("rbln_custom_ops::flash_attn_decode")
+    def flash_attn_decode_abstract(q, k, v, m, kcache, vcache, seq, scale, partition):
+        return (
+            q,
+            torch.empty(1, *kcache.shape[1:], device=kcache.device),
+            torch.empty(1, *vcache.shape[1:], device=vcache.device),
+        )
+
+    torch.library.define(
+        "rbln_custom_ops::flash_attn_prefill",
+        "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor[]",
+    )
+
+    @torch.library.impl("rbln_custom_ops::flash_attn_prefill", "cpu")
+    def flash_attn_prefill_cpu(q, k, v, mask, kcache, vcache, batch, seq, scale, partition):
+        return q, kcache, vcache
+
+    @register_fake("rbln_custom_ops::flash_attn_prefill")
+    def flash_attn_prefill_abstract(q, k, v, m, kcache, vcache, batch, seq, scale, partition):
+        return q, kcache, vcache
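The flash-attention variants mirror the ops above but append a Python `int` to each schema (the parameter the impls call `partition`); reading it as the KV-cache partition length processed per step is an assumption from the name, not something the diff states. A hedged call sketch reusing the decode shapes:

    import torch
    from optimum.rbln.ops import register_rbln_custom_flash_attention

    register_rbln_custom_flash_attention()

    args = (
        torch.randn(1, 4, 1, 1, 64),        # q
        torch.randn(1, 4, 1, 1, 64),        # k
        torch.randn(1, 4, 1, 1, 64),        # v
        torch.zeros(1, 4, 1, 1, 128),       # mask
        torch.zeros(2, 4, 1, 128, 64),      # kcache
        torch.zeros(2, 4, 1, 128, 64),      # vcache
        torch.tensor([5]),                  # seq
        torch.tensor(64**-0.5),             # scale
    )
    out, _, _ = torch.ops.rbln_custom_ops.flash_attn_decode(*args, 128)  # 128: illustrative partition size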
optimum/rbln/ops/kv_cache_update.py
ADDED
@@ -0,0 +1,69 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Portions of this software are licensed under the Apache License,
+# Version 2.0. See the NOTICE file distributed with this work for
+# additional information regarding copyright ownership.
+
+# All other portions of this software, including proprietary code,
+# are the intellectual property of Rebellions Inc. and may not be
+# copied, modified, or distributed without prior written permission
+# from Rebellions Inc.
+
+from functools import lru_cache
+
+import torch
+from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_4
+
+
+if is_torch_greater_or_equal_than_2_4:
+    register_fake = torch.library.register_fake
+else:
+    register_fake = torch.library.impl_abstract
+
+
+@lru_cache
+def register_rbln_custom_cache_update():
+    # Define the RBLN custom operation "rbln_cache_update" which updates a cache tensor with a given state tensor.
+    # This operation is designed to perform in-place updates directly on the device without needing to transfer the cache back to the host.
+    # The `position` parameter specifies the start index for the update along the specified axis, allowing flexible updates to any part of the cache tensor.
+    torch.library.define("rbln_custom_ops::rbln_cache_update", "(Tensor x, Tensor y, Tensor z, Tensor w) -> Tensor")
+
+    # Implementation of the "rbln_cache_update" operation for the CPU.
+    @torch.library.impl("rbln_custom_ops::rbln_cache_update", "cpu")
+    def rbln_cache_update_cpu(cache, state, position, axis):
+        assert position.dim() == 0
+        assert axis.dim() == 0
+
+        # Calculate the start (s) and end (e) indices for the update based on the position and the shape of the state tensor along the specified axis.
+        s = position  # Start index for the update, specified by the position.
+        e = (
+            position + state.shape[axis]
+        )  # End index is determined by adding the size of the state along the given axis.
+
+        # Update the specified portion of the cache tensor with the state tensor, using `slice_scatter`.
+        # This operation modifies the cache tensor in-place directly on the device, avoiding any unnecessary transfers between host and device.
+        updated_cache = cache.slice_scatter(state, dim=axis, start=s, end=e)
+
+        # Return the updated cache tensor.
+        return updated_cache
+
+    # Register a "fake" implementation of the "rbln_cache_update" operation.
+    # This serves as an abstract definition for the RBLN compiler to recognize the operation and generate an optimized implementation.
+    @register_fake("rbln_custom_ops::rbln_cache_update")
+    def rbln_cache_update_abstract(cache, state, position, axis):
+        # Return a tensor with the same shape as the input cache tensor.
+        # This is a placeholder for the abstract implementation and does not perform any actual computation.
+        # Like the actual implementation, the abstraction assumes in-place device-side updates.
+        return torch.empty_like(cache)
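Unlike the attention stubs, the CPU implementation here actually computes the documented result, so its `slice_scatter` semantics can be checked directly. A small demonstration with arbitrary sizes:

    import torch
    from optimum.rbln.ops import register_rbln_custom_cache_update

    register_rbln_custom_cache_update()

    cache = torch.zeros(1, 2, 8, 4)   # e.g. [batch, heads, max_seq_len, head_dim]
    state = torch.ones(1, 2, 3, 4)    # three new positions to write
    position = torch.tensor(5)        # 0-dim start index (the impl asserts dim() == 0)
    axis = torch.tensor(2)            # 0-dim axis selector

    updated = torch.ops.rbln_custom_ops.rbln_cache_update(cache, state, position, axis)
    assert torch.equal(updated[:, :, 5:8], state)           # rows 5..7 along axis 2 were written
    assert torch.equal(updated[:, :, :5], cache[:, :, :5])  # the rest is untouched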
optimum/rbln/transformers/__init__.py
CHANGED
@@ -63,10 +63,30 @@ _import_structure = {
         "RBLNXLMRobertaModel",
         "RBLNMistralForCausalLM",
     ],
+    "modeling_alias": [
+        "RBLNASTForAudioClassification",
+        "RBLNBertForQuestionAnswering",
+        "RBLNDistilBertForQuestionAnswering",
+        "RBLNResNetForImageClassification",
+        "RBLNXLMRobertaForSequenceClassification",
+        "RBLNRobertaForSequenceClassification",
+        "RBLNRobertaForMaskedLM",
+        "RBLNViTForImageClassification",
+    ],
 }
 
 if TYPE_CHECKING:
     from .cache_utils import RebelDynamicCache
+    from .modeling_alias import (
+        RBLNASTForAudioClassification,
+        RBLNBertForQuestionAnswering,
+        RBLNDistilBertForQuestionAnswering,
+        RBLNResNetForImageClassification,
+        RBLNRobertaForMaskedLM,
+        RBLNRobertaForSequenceClassification,
+        RBLNViTForImageClassification,
+        RBLNXLMRobertaForSequenceClassification,
+    )
     from .models import (
         RBLNAutoModel,
         RBLNAutoModelForAudioClassification,
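Declaring each module twice, once in `_import_structure` for runtime and once under `TYPE_CHECKING` for static analysis, is the transformers-style lazy-import pattern: symbols resolve on first attribute access, while type checkers still see real imports. A minimal sketch of the mechanism (illustrative, not the package's exact wiring, which typically delegates to a `_LazyModule`):

    # __init__.py of a hypothetical package using the same pattern
    import importlib
    from typing import TYPE_CHECKING

    _import_structure = {"modeling_alias": ["RBLNResNetForImageClassification"]}

    if TYPE_CHECKING:
        from .modeling_alias import RBLNResNetForImageClassification
    else:
        def __getattr__(name):  # PEP 562: resolve exported names on first access
            for module, names in _import_structure.items():
                if name in names:
                    return getattr(importlib.import_module(f".{module}", __name__), name)
            raise AttributeError(name)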
optimum/rbln/{modeling_alias.py → transformers/modeling_alias.py}
RENAMED
@@ -21,7 +21,8 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
 
-from .modeling import (
+from ..utils.logging import get_logger
+from .modeling_generic import (
     RBLNModelForAudioClassification,
     RBLNModelForImageClassification,
     RBLNModelForMaskedLM,
@@ -30,6 +31,9 @@ from .modeling import (
 )
 
 
+logger = get_logger()
+
+
 class RBLNASTForAudioClassification(RBLNModelForAudioClassification):
     pass
 
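Each alias above simply pins a task-specific class name onto one of the generic bases from `modeling_generic`, so downstream code can use familiar architecture names. A hedged usage sketch: the checkpoint id is illustrative, the top-level re-export is assumed, and `export=True` follows the usual optimum convention for compiling from a PyTorch checkpoint.

    from optimum.rbln import RBLNResNetForImageClassification  # assumes top-level re-export

    model = RBLNResNetForImageClassification.from_pretrained(
        "microsoft/resnet-50",  # illustrative checkpoint
        export=True,            # compile the PyTorch weights for the RBLN NPU
    )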