optimum-rbln 0.7.4a2__py3-none-any.whl → 0.7.4a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. optimum/rbln/__version__.py +1 -1
  2. optimum/rbln/modeling.py +8 -1
  3. optimum/rbln/modeling_base.py +0 -5
  4. optimum/rbln/ops/__init__.py +3 -7
  5. optimum/rbln/ops/attn.py +271 -207
  6. optimum/rbln/ops/flash_attn.py +161 -67
  7. optimum/rbln/ops/kv_cache_update.py +4 -40
  8. optimum/rbln/transformers/models/bart/__init__.py +1 -0
  9. optimum/rbln/transformers/models/decoderonly/__init__.py +10 -0
  10. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +80 -94
  11. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +17 -13
  12. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +12 -21
  13. optimum/rbln/transformers/models/t5/__init__.py +1 -0
  14. optimum/rbln/transformers/models/t5/modeling_t5.py +3 -37
  15. optimum/rbln/transformers/models/t5/t5_architecture.py +3 -4
  16. optimum/rbln/transformers/models/time_series_transformers/__init__.py +1 -0
  17. optimum/rbln/transformers/models/time_series_transformers/time_series_transformers_architecture.py +12 -22
  18. optimum/rbln/transformers/models/whisper/__init__.py +1 -0
  19. optimum/rbln/transformers/models/whisper/modeling_whisper.py +0 -1
  20. optimum/rbln/transformers/models/whisper/whisper_architecture.py +20 -32
  21. {optimum_rbln-0.7.4a2.dist-info → optimum_rbln-0.7.4a4.dist-info}/METADATA +1 -1
  22. {optimum_rbln-0.7.4a2.dist-info → optimum_rbln-0.7.4a4.dist-info}/RECORD +24 -24
  23. {optimum_rbln-0.7.4a2.dist-info → optimum_rbln-0.7.4a4.dist-info}/WHEEL +0 -0
  24. {optimum_rbln-0.7.4a2.dist-info → optimum_rbln-0.7.4a4.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/__version__.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE
 
- __version__ = version = '0.7.4a2'
+ __version__ = version = '0.7.4a4'
  __version_tuple__ = version_tuple = (0, 7, 4)
optimum/rbln/modeling.py CHANGED
@@ -123,8 +123,15 @@ class RBLNModel(RBLNBaseModel):
  config = AutoConfig.from_pretrained(config._name_or_path, **kwargs)
 
  if hasattr(model, "can_generate") and model.can_generate():
+ import json
+
  generation_config = model.generation_config
- generation_config.save_pretrained(save_dir_path / subfolder)
+ generation_config_path = save_dir_path / subfolder / "generation_config.json"
+
+ generation_config.save_pretrained(generation_config_path.parent)
+ local_config = json.loads(generation_config_path.read_text(encoding="utf-8"))
+ local_config["transformers_version"] = generation_config.transformers_version
+ generation_config_path.write_text(json.dumps(local_config, indent=2) + "\n", encoding="utf-8")
 
  if not isinstance(config, PretrainedConfig): # diffusers config
  config = PretrainedConfig(**config)
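
The modeling.py hunk above replaces the plain generation_config.save_pretrained(...) call with a save-then-rewrite step, so the transformers_version field written into generation_config.json keeps the version recorded on the GenerationConfig object instead of being overwritten with whatever transformers version happens to be installed at export time. A minimal standalone sketch of the same pattern (the helper name and out_dir argument are illustrative, not part of the package):

    import json
    from pathlib import Path
    from transformers import GenerationConfig

    def save_generation_config_pinned(gen_config: GenerationConfig, out_dir: Path) -> None:
        # save_pretrained() writes generation_config.json but stamps the currently
        # installed transformers version into it ...
        out_dir.mkdir(parents=True, exist_ok=True)
        gen_config.save_pretrained(out_dir)
        path = out_dir / "generation_config.json"
        data = json.loads(path.read_text(encoding="utf-8"))
        # ... so restore the version recorded on the config object, as the diff does.
        data["transformers_version"] = gen_config.transformers_version
        path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")
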
optimum/rbln/modeling_base.py CHANGED
@@ -481,11 +481,6 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
  # First copy everything to a temporary directory
  shutil.copytree(real_save_dir, tmp_dir)
 
- # Save configs to the temporary directory
- self.config.save_pretrained(tmp_dir)
- if self.generation_config is not None:
- self.generation_config.save_pretrained(tmp_dir)
-
  # If everything succeeded, atomically replace the target directory
  if os.path.exists(save_directory_path):
  shutil.rmtree(save_directory_path)
optimum/rbln/ops/__init__.py CHANGED
@@ -12,11 +12,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
 
- from .attn import (
- register_rbln_custom_paged_add_softmax_attention,
- register_rbln_custom_paged_attention,
- register_rbln_custom_paged_causal_attention,
- )
- from .flash_attn import register_rbln_custom_paged_flash_attention, register_rbln_custom_paged_flash_causal_attention
- from .kv_cache_update import register_rbln_custom_cache_update
+ from .attn import *
+ from .flash_attn import *
+ from .kv_cache_update import *
  from .linear import linear
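
With the wildcard imports, the custom ops defined in attn.py, flash_attn.py, and kv_cache_update.py are registered as a side effect of importing optimum.rbln.ops; there is no longer a register_rbln_custom_*() call to make first. A rough usage sketch, assuming PyTorch >= 2.4 and that the import succeeds in the local environment (shapes follow the docstrings in attn.py and are illustrative only):

    import torch
    import optimum.rbln.ops  # noqa: F401 -- importing registers the rbln_custom_ops library

    n_heads, n_groups, head_dim, max_seq, block = 8, 4, 64, 128, 16
    q = torch.randn(1, n_heads, n_groups, 1, head_dim)
    k = torch.randn(1, n_heads, 1, 1, head_dim)
    v = torch.randn(1, n_heads, 1, 1, head_dim)
    mask = torch.zeros(1, n_heads, 1, 1, max_seq)
    kcache = torch.zeros(1, n_heads, 1, max_seq, head_dim)
    vcache = torch.zeros(1, n_heads, 1, max_seq, head_dim)
    seq = torch.zeros(1, 1, dtype=torch.int32)
    scale = torch.tensor(head_dim ** -0.5)
    block_table = torch.arange(max_seq // block, dtype=torch.int16).unsqueeze(0)

    # On CPU this only runs the placeholder body (an empty tensor shaped like q);
    # the real fused kernel is generated by the RBLN compiler for the NPU.
    out = torch.ops.rbln_custom_ops.paged_attn_decode(
        q, k, v, mask, kcache, vcache, seq, scale, block_table, block
    )
    print(out.shape)  # torch.Size([1, 8, 4, 1, 64])
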
optimum/rbln/ops/attn.py CHANGED
@@ -12,212 +12,276 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
 
- from functools import lru_cache
 
  import torch
- from packaging import version
-
-
- if version.parse(torch.__version__) > version.parse("2.4.0"):
- register_fake = torch.library.register_fake
- else:
- register_fake = torch.library.impl_abstract
-
-
- @lru_cache
- def register_rbln_custom_paged_attention():
- torch.library.define(
- "rbln_custom_ops::paged_attn_decode",
- "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor",
- )
-
- @torch.library.impl("rbln_custom_ops::paged_attn_decode", "cpu")
- def attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size):
- """Defines the computation pattern for fused attention with KV cache updates.
-
- IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
- a single optimized NPU operation. It is NOT meant for CPU execution.
-
- Pattern components that compiler fuses into a single op:
- 1. KV cache updates with new key/value states
- 2. Scaled dot-product attention computation
- 3. Masked softmax operation
- 4. Final attention output computation
-
- Expected tensor shapes:
- - q: [batch=1, n_heads, n_groups, 1, head_dim] - Query states for single token
- - k: [batch=1, n_heads, 1, 1, head_dim] - Key states for current input
- - v: [batch=1, n_heads, 1, 1, head_dim] - Value states for current input
- - mask: [batch=1, n_heads, 1, 1, max_seq_len] - Attention mask
- - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
- - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
- - seq: [1, 1] - Current sequence position
- - scale: [] - Attention scale factor
- - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
- - block_size: [] - Number of tokens per block
-
- Returns:
- Tensor: attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
- """
- return q
-
- @register_fake("rbln_custom_ops::paged_attn_decode")
- def attn_decode_abstract(q, k, v, m, kcache, vcache, seq, scale, block_table, block_size):
- return q
-
- torch.library.define(
- "rbln_custom_ops::paged_attn_prefill",
- "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor",
- )
-
- @torch.library.impl("rbln_custom_ops::paged_attn_prefill", "cpu")
- def attn_prefill_cpu(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size):
- """Defines the computation pattern for prefill phase attention with KV cache updates.
-
- IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
- a single optimized NPU operation. It is NOT meant for CPU execution.
-
- Key differences from decode pattern:
- - Handles prefill phase with multiple input tokens
- - Takes explicit batch index for continuous batching
-
- Expected tensor shapes:
- - q: [batch=1, n_heads, n_groups, seq_len, head_dim] - Query states for multiple tokens
- - k: [batch=1, n_heads, 1, seq_len, head_dim] - Key states for current input
- - v: [batch=1, n_heads, 1, seq_len, head_dim] - Value states for current input
- - mask: [batch=1, 1, 1, seq_len, max_seq_len] - Attention mask
- - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
- - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
- - seq: [1, 1] - Starting sequence position
- - scale: [] - Attention scale factor
- - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
- - block_size: [] - Number of tokens per block
-
- Returns:
- Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
- """
- return q
-
- @register_fake("rbln_custom_ops::paged_attn_prefill")
- def attn_prefill_abstract(q, k, v, m, kcache, vcache, seq, scale, block_table, block_size):
- return q
-
-
- @lru_cache
- def register_rbln_custom_paged_causal_attention():
- torch.library.define(
- "rbln_custom_ops::paged_causal_attn_decode",
- "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor",
- )
-
- @torch.library.impl("rbln_custom_ops::paged_causal_attn_decode", "cpu")
- def attn_decode_cpu(q, k, v, kcache, vcache, seq, scale, block_table, block_size):
- """Defines the computation pattern for fused attention with KV cache updates.
-
- IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
- a single optimized NPU operation. It is NOT meant for CPU execution.
-
- Pattern components that compiler fuses into a single op:
- 1. KV cache updates with new key/value states
- 2. Scaled dot-product attention computation
- 3. Causal masked softmax operation
- 4. Final attention output computation
-
- Expected tensor shapes:
- - q: [batch=1, n_heads, n_groups, 1, head_dim] - Query states for single token
- - k: [batch=1, n_heads, 1, 1, head_dim] - Key states for current input
- - v: [batch=1, n_heads, 1, 1, head_dim] - Value states for current input
- - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
- - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
- - seq: [1, 1] - Starting sequence position
- - scale: [] - Attention scale factor
- - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
- - block_size: [] - Number of tokens per block
-
- Returns:
- Tensor: attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
- """
- return q
-
- @register_fake("rbln_custom_ops::paged_causal_attn_decode")
- def attn_decode_abstract(q, k, v, kcache, vcache, seq, scale, block_table, block_size):
- return q
-
- torch.library.define(
- "rbln_custom_ops::paged_causal_attn_prefill",
- "(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor",
- )
-
- @torch.library.impl("rbln_custom_ops::paged_causal_attn_prefill", "cpu")
- def attn_prefill_cpu(q, k, v, kcache, vcache, seq, scale, block_table, block_size):
- """Defines the computation pattern for prefill phase attention with KV cache updates.
-
- IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
- a single optimized NPU operation. It is NOT meant for CPU execution.
-
- Key differences from decode pattern:
- - Handles prefill phase with multiple input tokens
- - Takes explicit batch index for continuous batching
-
- Expected tensor shapes:
- - q: [batch=1, n_heads, n_groups, seq_len, head_dim] - Query states for multiple tokens
- - k: [batch=1, n_heads, 1, seq_len, head_dim] - Key states for current input
- - v: [batch=1, n_heads, 1, seq_len, head_dim] - Value states for current input
- - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
- - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
- - batch: [1] - Batch index for cache access
- - seq: [1, 1] - Starting sequence position
- - scale: [] - Attention scale factor
- - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
- - block_size: [] - Number of tokens per block
-
- Returns:
- Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
- """
- return q
-
- @register_fake("rbln_custom_ops::paged_causal_attn_prefill")
- def attn_prefill_abstract(q, k, v, kcache, vcache, seq, scale, block_table, block_size):
- return q
-
-
- @lru_cache
- def register_rbln_custom_paged_add_softmax_attention():
- torch.library.define(
- "rbln_custom_ops::paged_add_softmax_attn_decode",
- "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor",
- )
-
- @torch.library.impl("rbln_custom_ops::paged_add_softmax_attn_decode", "cpu")
- def paged_add_softmax_attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size):
- """Defines the computation pattern for fused attention with KV cache updates.
-
- IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
- a single optimized NPU operation. It is NOT meant for CPU execution.
-
- Pattern components that compiler fuses into a single op:
- 1. KV cache updates with new key/value states
- 2. Scaled dot-product attention computation
- 3. add-softmax operation
- 4. Final attention output computation
-
- Expected tensor shapes:
- - q: [batch=1, n_heads, n_groups, 1, head_dim] - Query states for single token
- - k: [batch=1, n_heads, 1, 1, head_dim] - Key states for current input
- - v: [batch=1, n_heads, 1, 1, head_dim] - Value states for current input
- - mask: [batch=1, n_heads, 1, 1, max_seq_len] - Attention mask
- - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
- - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
- - seq: [1] - Current sequence position
- - scale: [] - Attention scale factor
- - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
- - block_size: [] - Number of tokens per block
-
- Returns:
- Tensor: attn_output: [batch=1, n_heads, 1, 1, head_dim] - Attention output
- """
- return q
-
- @register_fake("rbln_custom_ops::paged_add_softmax_attn_decode")
- def paged_add_softmax_attn_decode_abstract(q, k, v, m, kcache, vcache, seq, partition, block_table, block_size):
- return q
+ from torch import Tensor
+
+
+ @torch.library.custom_op(
+ "rbln_custom_ops::paged_attn_decode",
+ mutates_args=(["kcache", "vcache"]),
+ )
+ def paged_attn_decode(
+ q: Tensor,
+ k: Tensor,
+ v: Tensor,
+ mask: Tensor,
+ kcache: Tensor,
+ vcache: Tensor,
+ seq: Tensor,
+ scale: Tensor,
+ block_table: Tensor,
+ block_size: int,
+ ) -> Tensor:
+ return torch.empty_like(q)
+
+
+ @paged_attn_decode.register_fake
+ def paged_attn_decode_fake(
+ q: Tensor,
+ k: Tensor,
+ v: Tensor,
+ mask: Tensor,
+ kcache: Tensor,
+ vcache: Tensor,
+ seq: Tensor,
+ scale: Tensor,
+ block_table: Tensor,
+ block_size: int,
+ ) -> Tensor:
+ return torch.empty_like(q)
+
+
+ @torch.library.custom_op(
+ "rbln_custom_ops::paged_attn_prefill",
+ mutates_args=(["kcache", "vcache"]),
+ )
+ def paged_attn_prefill(
+ q: Tensor,
+ k: Tensor,
+ v: Tensor,
+ mask: Tensor,
+ kcache: Tensor,
+ vcache: Tensor,
+ seq: Tensor,
+ scale: Tensor,
+ block_table: Tensor,
+ block_size: int,
+ ) -> Tensor:
+ """Defines the computation pattern for prefill phase attention with KV cache updates.
+
+ IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+ a single optimized NPU operation. It is NOT meant for CPU execution.
+
+ Key differences from decode pattern:
+ - Handles prefill phase with multiple input tokens
+ - Takes explicit batch index for continuous batching
+
+ Expected tensor shapes:
+ - q: [batch=1, n_heads, n_groups, seq_len, head_dim] - Query states for multiple tokens
+ - k: [batch=1, n_heads, 1, seq_len, head_dim] - Key states for current input
+ - v: [batch=1, n_heads, 1, seq_len, head_dim] - Value states for current input
+ - mask: [batch=1, 1, 1, seq_len, max_seq_len] - Attention mask
+ - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+ - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+ - seq: [1, 1] - Starting sequence position
+ - scale: [] - Attention scale factor
+ - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
+ - block_size: [] - Number of tokens per block
+
+ Returns:
+ Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
+ """
+ return torch.empty_like(q)
+
+
+ @paged_attn_prefill.register_fake
+ def paged_attn_prefill_fake(
+ q: Tensor,
+ k: Tensor,
+ v: Tensor,
+ mask: Tensor,
+ kcache: Tensor,
+ vcache: Tensor,
+ seq: Tensor,
+ scale: Tensor,
+ block_table: Tensor,
+ block_size: int,
+ ) -> Tensor:
+ return torch.empty_like(q)
+
+
+ @torch.library.custom_op(
+ "rbln_custom_ops::paged_causal_attn_decode",
+ mutates_args=(["kcache", "vcache"]),
+ )
+ def paged_causal_attn_decode(
+ q: Tensor,
+ k: Tensor,
+ v: Tensor,
+ kcache: Tensor,
+ vcache: Tensor,
+ seq: Tensor,
+ scale: Tensor,
+ block_table: Tensor,
+ block_size: int,
+ ) -> Tensor:
+ """Defines the computation pattern for fused attention with KV cache updates.
+
+ IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+ a single optimized NPU operation. It is NOT meant for CPU execution.
+
+ Pattern components that compiler fuses into a single op:
+ 1. KV cache updates with new key/value states
+ 2. Scaled dot-product attention computation
+ 3. Causal masked softmax operation
+ 4. Final attention output computation
+
+ Expected tensor shapes:
+ - q: [batch=1, n_heads, n_groups, 1, head_dim] - Query states for single token
+ - k: [batch=1, n_heads, 1, 1, head_dim] - Key states for current input
+ - v: [batch=1, n_heads, 1, 1, head_dim] - Value states for current input
+ - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+ - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+ - seq: [1, 1] - Starting sequence position
+ - scale: [] - Attention scale factor
+ - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
+ - block_size: [] - Number of tokens per block
+
+ Returns:
+ Tensor: attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
+ """
+ return torch.empty_like(q)
+
+
+ @paged_causal_attn_decode.register_fake
+ def paged_causal_attn_decode_fake(
+ q: Tensor,
+ k: Tensor,
+ v: Tensor,
+ kcache: Tensor,
+ vcache: Tensor,
+ seq: Tensor,
+ scale: Tensor,
+ block_table: Tensor,
+ block_size: int,
+ ) -> Tensor:
+ return torch.empty_like(q)
+
+
+ @torch.library.custom_op(
+ "rbln_custom_ops::paged_causal_attn_prefill",
+ mutates_args=(["kcache", "vcache"]),
+ )
+ def paged_causal_attn_prefill(
+ q: Tensor,
+ k: Tensor,
+ v: Tensor,
+ kcache: Tensor,
+ vcache: Tensor,
+ seq: Tensor,
+ scale: Tensor,
+ block_table: Tensor,
+ block_size: int,
+ ) -> Tensor:
+ """Defines the computation pattern for prefill phase attention with KV cache updates.
+
+ IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+ a single optimized NPU operation. It is NOT meant for CPU execution.
+
+ Key differences from decode pattern:
+ - Handles prefill phase with multiple input tokens
+ - Takes explicit batch index for continuous batching
+
+ Expected tensor shapes:
+ - q: [batch=1, n_heads, n_groups, seq_len, head_dim] - Query states for multiple tokens
+ - k: [batch=1, n_heads, 1, seq_len, head_dim] - Key states for current input
+ - v: [batch=1, n_heads, 1, seq_len, head_dim] - Value states for current input
+ - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+ - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+ - batch: [1] - Batch index for cache access
+ - seq: [1, 1] - Starting sequence position
+ - scale: [] - Attention scale factor
+ - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
+ - block_size: [] - Number of tokens per block
+
+ Returns:
+ Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
+ """
+ return torch.empty_like(q)
+
+
+ @paged_causal_attn_prefill.register_fake
+ def paged_causal_attn_prefill_fake(
+ q: Tensor,
+ k: Tensor,
+ v: Tensor,
+ kcache: Tensor,
+ vcache: Tensor,
+ seq: Tensor,
+ scale: Tensor,
+ block_table: Tensor,
+ block_size: int,
+ ) -> Tensor:
+ return torch.empty_like(q)
+
+
+ @torch.library.custom_op(
+ "rbln_custom_ops::paged_add_softmax_attn_decode",
+ mutates_args=(["kcache", "vcache"]),
+ )
+ def paged_add_softmax_attn_decode(
+ q: Tensor,
+ k: Tensor,
+ v: Tensor,
+ mask: Tensor,
+ kcache: Tensor,
+ vcache: Tensor,
+ seq: Tensor,
+ scale: Tensor,
+ block_table: Tensor,
+ block_size: int,
+ ) -> Tensor:
+ """Defines the computation pattern for fused attention with KV cache updates.
+
+ IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+ a single optimized NPU operation. It is NOT meant for CPU execution.
+
+ Pattern components that compiler fuses into a single op:
+ 1. KV cache updates with new key/value states
+ 2. Scaled dot-product attention computation
+ 3. add-softmax operation
+ 4. Final attention output computation
+
+ Expected tensor shapes:
+ - q: [batch=1, n_heads, n_groups, 1, head_dim] - Query states for single token
+ - k: [batch=1, n_heads, 1, 1, head_dim] - Key states for current input
+ - v: [batch=1, n_heads, 1, 1, head_dim] - Value states for current input
+ - mask: [batch=1, n_heads, 1, 1, max_seq_len] - Attention mask
+ - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+ - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+ - seq: [1] - Current sequence position
+ - scale: [] - Attention scale factor
+ - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
+ - block_size: [] - Number of tokens per block
+
+ Returns:
+ Tensor: attn_output: [batch=1, n_heads, 1, 1, head_dim] - Attention output
+ """
+ return torch.empty_like(q)
+
+
+ @paged_add_softmax_attn_decode.register_fake
+ def paged_add_softmax_attn_decode_fake(
+ q: Tensor,
+ k: Tensor,
+ v: Tensor,
+ mask: Tensor,
+ kcache: Tensor,
+ vcache: Tensor,
+ seq: Tensor,
+ scale: Tensor,
+ block_table: Tensor,
+ block_size: int,
+ ) -> Tensor:
+ return torch.empty_like(q)
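
For reference, the attn.py rewrite swaps the pre-2.4 torch.library registration style (torch.library.define plus torch.library.impl and impl_abstract/register_fake, wrapped in lru_cache-guarded register_*() functions) for the torch.library.custom_op decorator, which infers the op schema from the type hints and registers the op at import time. A condensed sketch of the two styles side by side, using a made-up op name for illustration only (not code from the package):

    import torch
    from torch import Tensor

    # Old style (0.7.4a2): explicit schema string, separate CPU impl and fake impl.
    torch.library.define("rbln_custom_ops::demo_op_v1", "(Tensor x, Tensor y) -> Tensor")

    @torch.library.impl("rbln_custom_ops::demo_op_v1", "cpu")
    def demo_op_v1_cpu(x, y):
        return x  # placeholder pattern body, never meant to run for real on CPU

    @torch.library.register_fake("rbln_custom_ops::demo_op_v1")
    def demo_op_v1_fake(x, y):
        return torch.empty_like(x)

    # New style (0.7.4a4): schema inferred from annotations, fake impl attached to the op object.
    @torch.library.custom_op("rbln_custom_ops::demo_op_v2", mutates_args=())
    def demo_op_v2(x: Tensor, y: Tensor) -> Tensor:
        return torch.empty_like(x)

    @demo_op_v2.register_fake
    def demo_op_v2_fake(x: Tensor, y: Tensor) -> Tensor:
        return torch.empty_like(x)
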