optimum_rbln-0.9.3.post1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of optimum-rbln might be problematic.
- optimum/rbln/__init__.py +505 -0
- optimum/rbln/__version__.py +34 -0
- optimum/rbln/cli.py +660 -0
- optimum/rbln/configuration_utils.py +968 -0
- optimum/rbln/diffusers/__init__.py +198 -0
- optimum/rbln/diffusers/configurations/__init__.py +37 -0
- optimum/rbln/diffusers/configurations/models/__init__.py +10 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +73 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +84 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
- optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +64 -0
- optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +59 -0
- optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +78 -0
- optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +63 -0
- optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +81 -0
- optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
- optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +74 -0
- optimum/rbln/diffusers/configurations/pipelines/__init__.py +34 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +316 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +117 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +363 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +156 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +176 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +159 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
- optimum/rbln/diffusers/modeling_diffusers.py +451 -0
- optimum/rbln/diffusers/models/__init__.py +64 -0
- optimum/rbln/diffusers/models/autoencoders/__init__.py +18 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +255 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +245 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
- optimum/rbln/diffusers/models/autoencoders/vae.py +178 -0
- optimum/rbln/diffusers/models/autoencoders/vq_model.py +211 -0
- optimum/rbln/diffusers/models/controlnet.py +281 -0
- optimum/rbln/diffusers/models/transformers/__init__.py +17 -0
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +160 -0
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +344 -0
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +191 -0
- optimum/rbln/diffusers/models/unets/__init__.py +16 -0
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +408 -0
- optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
- optimum/rbln/diffusers/pipelines/__init__.py +113 -0
- optimum/rbln/diffusers/pipelines/auto_pipeline.py +307 -0
- optimum/rbln/diffusers/pipelines/controlnet/__init__.py +19 -0
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +139 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +669 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +640 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +825 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +837 -0
- optimum/rbln/diffusers/pipelines/cosmos/__init__.py +17 -0
- optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +113 -0
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +425 -0
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +128 -0
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +128 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/__init__.py +23 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +34 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +207 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +34 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py +34 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +31 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +17 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +31 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +31 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/__init__.py +17 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +31 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +31 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +31 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +17 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +31 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +31 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +31 -0
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
- optimum/rbln/modeling.py +364 -0
- optimum/rbln/modeling_base.py +637 -0
- optimum/rbln/ops/__init__.py +19 -0
- optimum/rbln/ops/attn.py +455 -0
- optimum/rbln/ops/flash_attn.py +350 -0
- optimum/rbln/ops/kv_cache_update.py +29 -0
- optimum/rbln/ops/linear.py +32 -0
- optimum/rbln/ops/sliding_window_attn.py +111 -0
- optimum/rbln/transformers/__init__.py +340 -0
- optimum/rbln/transformers/configuration_generic.py +120 -0
- optimum/rbln/transformers/modeling_attention_utils.py +385 -0
- optimum/rbln/transformers/modeling_generic.py +280 -0
- optimum/rbln/transformers/modeling_outputs.py +37 -0
- optimum/rbln/transformers/modeling_rope_utils.py +314 -0
- optimum/rbln/transformers/models/__init__.py +343 -0
- optimum/rbln/transformers/models/audio_spectrogram_transformer/__init__.py +17 -0
- optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +47 -0
- optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +91 -0
- optimum/rbln/transformers/models/auto/__init__.py +31 -0
- optimum/rbln/transformers/models/auto/auto_factory.py +267 -0
- optimum/rbln/transformers/models/auto/modeling_auto.py +162 -0
- optimum/rbln/transformers/models/bart/__init__.py +17 -0
- optimum/rbln/transformers/models/bart/bart_architecture.py +163 -0
- optimum/rbln/transformers/models/bart/configuration_bart.py +36 -0
- optimum/rbln/transformers/models/bart/modeling_bart.py +86 -0
- optimum/rbln/transformers/models/bert/__init__.py +16 -0
- optimum/rbln/transformers/models/bert/bert_architecture.py +16 -0
- optimum/rbln/transformers/models/bert/configuration_bert.py +46 -0
- optimum/rbln/transformers/models/bert/modeling_bert.py +148 -0
- optimum/rbln/transformers/models/blip_2/__init__.py +20 -0
- optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +115 -0
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +526 -0
- optimum/rbln/transformers/models/clip/__init__.py +26 -0
- optimum/rbln/transformers/models/clip/configuration_clip.py +103 -0
- optimum/rbln/transformers/models/clip/modeling_clip.py +384 -0
- optimum/rbln/transformers/models/colpali/__init__.py +2 -0
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +218 -0
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +84 -0
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +361 -0
- optimum/rbln/transformers/models/colqwen2/__init__.py +2 -0
- optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +233 -0
- optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +74 -0
- optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +446 -0
- optimum/rbln/transformers/models/decoderonly/__init__.py +27 -0
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +300 -0
- optimum/rbln/transformers/models/decoderonly/configuration_lora.py +411 -0
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +1224 -0
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +508 -0
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +119 -0
- optimum/rbln/transformers/models/decoderonly/lora_architecture.py +204 -0
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +823 -0
- optimum/rbln/transformers/models/depth_anything/__init__.py +16 -0
- optimum/rbln/transformers/models/depth_anything/configuration_depth_anything.py +24 -0
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +42 -0
- optimum/rbln/transformers/models/distilbert/__init__.py +19 -0
- optimum/rbln/transformers/models/distilbert/configuration_distilbert.py +24 -0
- optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +51 -0
- optimum/rbln/transformers/models/dpt/__init__.py +16 -0
- optimum/rbln/transformers/models/dpt/configuration_dpt.py +24 -0
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +42 -0
- optimum/rbln/transformers/models/exaone/__init__.py +24 -0
- optimum/rbln/transformers/models/exaone/configuration_exaone.py +42 -0
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +77 -0
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +145 -0
- optimum/rbln/transformers/models/gemma/__init__.py +16 -0
- optimum/rbln/transformers/models/gemma/configuration_gemma.py +50 -0
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +27 -0
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +104 -0
- optimum/rbln/transformers/models/gemma3/__init__.py +16 -0
- optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +109 -0
- optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +170 -0
- optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +245 -0
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +611 -0
- optimum/rbln/transformers/models/gpt2/__init__.py +16 -0
- optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +50 -0
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +93 -0
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +55 -0
- optimum/rbln/transformers/models/grounding_dino/__init__.py +10 -0
- optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +92 -0
- optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +599 -0
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +1048 -0
- optimum/rbln/transformers/models/idefics3/__init__.py +16 -0
- optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +89 -0
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +497 -0
- optimum/rbln/transformers/models/llama/__init__.py +16 -0
- optimum/rbln/transformers/models/llama/configuration_llama.py +50 -0
- optimum/rbln/transformers/models/llama/llama_architecture.py +19 -0
- optimum/rbln/transformers/models/llama/modeling_llama.py +104 -0
- optimum/rbln/transformers/models/llava/__init__.py +16 -0
- optimum/rbln/transformers/models/llava/configuration_llava.py +72 -0
- optimum/rbln/transformers/models/llava/modeling_llava.py +490 -0
- optimum/rbln/transformers/models/llava_next/__init__.py +16 -0
- optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +69 -0
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +493 -0
- optimum/rbln/transformers/models/midm/__init__.py +24 -0
- optimum/rbln/transformers/models/midm/configuration_midm.py +42 -0
- optimum/rbln/transformers/models/midm/midm_architecture.py +144 -0
- optimum/rbln/transformers/models/midm/modeling_midm.py +144 -0
- optimum/rbln/transformers/models/mistral/__init__.py +16 -0
- optimum/rbln/transformers/models/mistral/configuration_mistral.py +50 -0
- optimum/rbln/transformers/models/mistral/mistral_architecture.py +19 -0
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +115 -0
- optimum/rbln/transformers/models/opt/__init__.py +16 -0
- optimum/rbln/transformers/models/opt/configuration_opt.py +29 -0
- optimum/rbln/transformers/models/opt/modeling_opt.py +102 -0
- optimum/rbln/transformers/models/opt/opt_architecture.py +74 -0
- optimum/rbln/transformers/models/pegasus/__init__.py +17 -0
- optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +38 -0
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +71 -0
- optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +161 -0
- optimum/rbln/transformers/models/phi/__init__.py +16 -0
- optimum/rbln/transformers/models/phi/configuration_phi.py +50 -0
- optimum/rbln/transformers/models/phi/modeling_phi.py +92 -0
- optimum/rbln/transformers/models/phi/phi_architecture.py +115 -0
- optimum/rbln/transformers/models/pixtral/__init__.py +16 -0
- optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +43 -0
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +322 -0
- optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +73 -0
- optimum/rbln/transformers/models/qwen2/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +50 -0
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +123 -0
- optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +19 -0
- optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +19 -0
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +111 -0
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +636 -0
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +220 -0
- optimum/rbln/transformers/models/qwen2_vl/__init__.py +19 -0
- optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +88 -0
- optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +513 -0
- optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +165 -0
- optimum/rbln/transformers/models/qwen3/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +71 -0
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +133 -0
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +31 -0
- optimum/rbln/transformers/models/resnet/__init__.py +23 -0
- optimum/rbln/transformers/models/resnet/configuration_resnet.py +42 -0
- optimum/rbln/transformers/models/resnet/modeling_resnet.py +99 -0
- optimum/rbln/transformers/models/roberta/__init__.py +24 -0
- optimum/rbln/transformers/models/roberta/configuration_roberta.py +33 -0
- optimum/rbln/transformers/models/roberta/modeling_roberta.py +72 -0
- optimum/rbln/transformers/models/seq2seq/__init__.py +16 -0
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +71 -0
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +477 -0
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +527 -0
- optimum/rbln/transformers/models/siglip/__init__.py +16 -0
- optimum/rbln/transformers/models/siglip/configuration_siglip.py +76 -0
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +199 -0
- optimum/rbln/transformers/models/swin/__init__.py +16 -0
- optimum/rbln/transformers/models/swin/configuration_swin.py +42 -0
- optimum/rbln/transformers/models/swin/modeling_swin.py +354 -0
- optimum/rbln/transformers/models/t5/__init__.py +17 -0
- optimum/rbln/transformers/models/t5/configuration_t5.py +36 -0
- optimum/rbln/transformers/models/t5/modeling_t5.py +130 -0
- optimum/rbln/transformers/models/t5/t5_architecture.py +264 -0
- optimum/rbln/transformers/models/time_series_transformer/__init__.py +26 -0
- optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +41 -0
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +435 -0
- optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +337 -0
- optimum/rbln/transformers/models/vit/__init__.py +19 -0
- optimum/rbln/transformers/models/vit/configuration_vit.py +24 -0
- optimum/rbln/transformers/models/vit/modeling_vit.py +44 -0
- optimum/rbln/transformers/models/wav2vec2/__init__.py +16 -0
- optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +38 -0
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +104 -0
- optimum/rbln/transformers/models/whisper/__init__.py +17 -0
- optimum/rbln/transformers/models/whisper/configuration_whisper.py +72 -0
- optimum/rbln/transformers/models/whisper/generation_whisper.py +159 -0
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +475 -0
- optimum/rbln/transformers/models/whisper/whisper_architecture.py +349 -0
- optimum/rbln/transformers/models/xlm_roberta/__init__.py +24 -0
- optimum/rbln/transformers/models/xlm_roberta/configuration_xlm_roberta.py +32 -0
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +82 -0
- optimum/rbln/transformers/utils/__init__.py +0 -0
- optimum/rbln/transformers/utils/rbln_quantization.py +589 -0
- optimum/rbln/transformers/utils/rbln_runtime_wrapper.py +79 -0
- optimum/rbln/utils/__init__.py +16 -0
- optimum/rbln/utils/decorator_utils.py +86 -0
- optimum/rbln/utils/deprecation.py +213 -0
- optimum/rbln/utils/hub.py +94 -0
- optimum/rbln/utils/import_utils.py +170 -0
- optimum/rbln/utils/logging.py +110 -0
- optimum/rbln/utils/model_utils.py +63 -0
- optimum/rbln/utils/runtime_utils.py +249 -0
- optimum/rbln/utils/save_utils.py +102 -0
- optimum/rbln/utils/submodule.py +152 -0
- optimum_rbln-0.9.3.post1.dist-info/METADATA +124 -0
- optimum_rbln-0.9.3.post1.dist-info/RECORD +264 -0
- optimum_rbln-0.9.3.post1.dist-info/WHEEL +4 -0
- optimum_rbln-0.9.3.post1.dist-info/entry_points.txt +2 -0
- optimum_rbln-0.9.3.post1.dist-info/licenses/LICENSE +201 -0
optimum/rbln/ops/flash_attn.py
@@ -0,0 +1,350 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+from torch import Tensor
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_flash_attn_decode",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_flash_attn_decode(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+) -> Tensor:
+    """Defines the computation pattern for fused flash attention with KV cache for decoding.
+
+    Returns a tensor with the same shape as q.
+    """
+    return torch.empty_like(q)
+
+
+@paged_flash_attn_decode.register_fake
+def paged_flash_attn_decode_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_flash_attn_decode_kv_fp8",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_flash_attn_decode_kv_fp8(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+    k_scale: Tensor,
+    v_scale: Tensor,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@paged_flash_attn_decode_kv_fp8.register_fake
+def paged_flash_attn_decode_kv_fp8_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+    k_scale: Tensor,
+    v_scale: Tensor,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_flash_attn_prefill",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_flash_attn_prefill(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+) -> Tensor:
+    """Defines the computation pattern for fused flash attention with KV cache for prefill.
+
+    Returns a tensor with the same shape as q.
+    """
+    return torch.empty_like(q)
+
+
+@paged_flash_attn_prefill.register_fake
+def paged_flash_attn_prefill_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_flash_attn_prefill_kv_fp8",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_flash_attn_prefill_kv_fp8(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+    k_scale: Tensor,
+    v_scale: Tensor,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@paged_flash_attn_prefill_kv_fp8.register_fake
+def paged_flash_attn_prefill_kv_fp8_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+    k_scale: Tensor,
+    v_scale: Tensor,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_flash_causal_attn_decode",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_flash_causal_attn_decode(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+    mask: Optional[Tensor] = None,
+) -> Tensor:
+    """Defines the computation pattern for fused causal flash attention with KV cache for decoding.
+
+    Returns a tensor with the same shape as q.
+    """
+    return torch.empty_like(q)
+
+
+@paged_flash_causal_attn_decode.register_fake
+def paged_flash_causal_attn_decode_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+    mask: Optional[Tensor] = None,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_flash_causal_attn_decode_kv_fp8",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_flash_causal_attn_decode_kv_fp8(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+    k_scale: Tensor,
+    v_scale: Tensor,
+    mask: Optional[Tensor] = None,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@paged_flash_causal_attn_decode_kv_fp8.register_fake
+def paged_flash_causal_attn_decode_kv_fp8_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+    k_scale: Tensor,
+    v_scale: Tensor,
+    mask: Optional[Tensor] = None,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_flash_causal_attn_prefill",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_flash_causal_attn_prefill(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+    is_bidirectional: bool,
+    mask: Optional[Tensor] = None,
+) -> Tensor:
+    """Defines the computation pattern for fused causal flash attention with KV cache for prefill.
+
+    Returns a tensor with the same shape as q.
+    """
+    return torch.empty_like(q)
+
+
+@paged_flash_causal_attn_prefill.register_fake
+def paged_flash_causal_attn_prefill_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+    is_bidirectional: bool,
+    mask: Optional[Tensor] = None,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_flash_causal_attn_prefill_kv_fp8",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_flash_causal_attn_prefill_kv_fp8(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+    is_bidirectional: bool,
+    k_scale: Tensor,
+    v_scale: Tensor,
+    mask: Optional[Tensor] = None,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@paged_flash_causal_attn_prefill_kv_fp8.register_fake
+def paged_flash_causal_attn_prefill_kv_fp8_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+    is_bidirectional: bool,
+    k_scale: Tensor,
+    v_scale: Tensor,
+    mask: Optional[Tensor] = None,
+) -> Tensor:
+    return torch.empty_like(q)
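
Each of these ops is a compiler-facing pattern: both the eager body and the register_fake return torch.empty_like(q), so the op only contributes shape and dtype information during tracing, while the fused kernel itself is supplied by the RBLN compiler. A minimal eager smoke test (the tensor shapes below are illustrative assumptions, not shapes the compiler requires):

import torch
from optimum.rbln.ops.flash_attn import paged_flash_attn_decode

# Hypothetical shapes for illustration only; any consistent shapes work
# because the eager body simply returns torch.empty_like(q).
q = torch.randn(1, 8, 4, 1, 64)            # query for one decode step
k = torch.randn(1, 8, 1, 1, 64)            # current key states
v = torch.randn(1, 8, 1, 1, 64)            # current value states
mask = torch.zeros(1, 1, 1, 1, 256)        # attention mask
kcache = torch.zeros(2, 8, 1, 256, 64)     # paged key cache (declared mutable)
vcache = torch.zeros(2, 8, 1, 256, 64)     # paged value cache (declared mutable)
seq = torch.tensor([17])                   # tokens already in the cache
scale = torch.tensor(64 ** -0.5)           # attention scale factor
block_table = torch.zeros(2, 2, dtype=torch.int16)

out = paged_flash_attn_decode(
    q, k, v, mask, kcache, vcache, seq, scale, block_table, 128, 256
)
assert out.shape == q.shape  # contents are undefined; only the pattern matters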
optimum/rbln/ops/kv_cache_update.py
@@ -0,0 +1,29 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torch import Tensor
+
+
+@torch.library.custom_op("rbln_custom_ops::rbln_cache_update", mutates_args=(["cache"]))
+def rbln_cache_update(cache: Tensor, state: Tensor, position: Tensor, axis: Tensor) -> Tensor:
+    # Define the RBLN custom operation "rbln_cache_update" which updates a cache tensor with a given state tensor.
+    # This operation is designed to perform in-place updates directly on the device without needing to transfer the cache back to the host.
+    # The `position` parameter specifies the start index for the update along the specified axis, allowing flexible updates to any part of the cache tensor.
+    return torch.empty_like(cache)
+
+
+@rbln_cache_update.register_fake
+def rbln_cache_update_fake(cache: Tensor, state: Tensor, position: Tensor, axis: Tensor) -> Tensor:
+    return torch.empty_like(cache)
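
As with the attention ops, the Python body is a stub. A hypothetical eager-mode reference for the update semantics described in the comments (an assumption for illustration only; it also simplifies `axis` to a plain int, whereas the op takes a Tensor, and the real in-place update is lowered by the RBLN compiler and never executes this Python):

import torch
from torch import Tensor

def cache_update_reference(cache: Tensor, state: Tensor, position: Tensor, axis: int) -> Tensor:
    # Hypothetical reference: copy `state` into `cache` starting at `position`
    # along `axis`, mirroring the comment on rbln_cache_update above.
    start = int(position)
    index = torch.arange(start, start + state.shape[axis], device=cache.device)
    return cache.index_copy(axis, index, state)

cache = torch.zeros(1, 4, 16, 8)   # e.g. [batch, heads, max_seq, head_dim]
state = torch.ones(1, 4, 3, 8)     # three new positions to write
updated = cache_update_reference(cache, state, torch.tensor(5), axis=2)
assert torch.equal(updated[:, :, 5:8], state)
assert updated[:, :, :5].abs().sum() == 0  # untouched slots stay zero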
optimum/rbln/ops/linear.py
@@ -0,0 +1,32 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+from torch import Tensor
+
+
+@torch.library.custom_op("rbln_custom_ops::linear", mutates_args=())
+def linear(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor:
+    output_shape = list(input.shape[:-1])
+    output_shape += [weight.shape[0]]
+    return torch.empty(size=output_shape, dtype=input.dtype, device=input.device, requires_grad=input.requires_grad)
+
+
+@linear.register_fake
+def linear_fake(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor:
+    output_shape = list(input.shape[:-1])
+    output_shape += [weight.shape[0]]
+    return torch.empty(size=output_shape, dtype=input.dtype, device=input.device, requires_grad=input.requires_grad)
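
Unlike the attention stubs, this one encodes real shape semantics: the output shape follows the torch.nn.functional.linear rule, replacing the last dimension of `input` with `weight.shape[0]`. A quick shape check (values are undefined, since the op only returns an empty tensor of the computed shape):

import torch
from optimum.rbln.ops.linear import linear

x = torch.randn(2, 5, 16)   # [..., in_features]
w = torch.randn(32, 16)     # [out_features, in_features]
b = torch.randn(32)

out = linear(x, w, b)
ref = torch.nn.functional.linear(x, w, b)
assert out.shape == ref.shape == (2, 5, 32)  # shapes match; contents do not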
optimum/rbln/ops/sliding_window_attn.py
@@ -0,0 +1,111 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torch import Tensor
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_sliding_window_attn_prefill",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_sliding_window_attn_prefill(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    cache_seq_len: Tensor,
+    cache_offset: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    is_bidirectional: bool,
+) -> Tensor:
+    """Defines the computation pattern for prefill phase attention with KV cache updates.
+
+    IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+    a single optimized NPU operation. It is NOT meant for CPU execution.
+
+    Key differences from decode pattern:
+    - Handles prefill phase with multiple input tokens
+    - Takes explicit batch index for continuous batching
+
+    Expected tensor shapes:
+    - q: [batch=1, n_heads, n_groups, seq_len, head_dim] - Query states for multiple tokens
+    - k: [batch=1, n_heads, 1, seq_len, head_dim] - Key states for current input
+    - v: [batch=1, n_heads, 1, seq_len, head_dim] - Value states for current input
+    - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+    - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+    - cache_seq_len: [] - The sequence length of the cached states that were seen by the model
+    - cache_offset: [] - The valid length in the combined sequence of the KV cache and the current projected key states.
+    - scale: [] - Attention scale factor
+    - is_bidirectional: [] - Whether the attention is bidirectional
+
+    Returns:
+        Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
+    """
+    return torch.empty_like(q)
+
+
+@paged_sliding_window_attn_prefill.register_fake
+def paged_sliding_window_attn_prefill_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    cache_seq_len: Tensor,
+    cache_offset: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    is_bidirectional: bool,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_sliding_window_attn_decode",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_sliding_window_attn_decode(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    cache_seq_len: Tensor,
+    cache_offset: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@paged_sliding_window_attn_decode.register_fake
+def paged_sliding_window_attn_decode_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    cache_seq_len: Tensor,
+    cache_offset: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+) -> Tensor:
+    return torch.empty_like(q)