optimum-rbln 0.9.4a2__py3-none-any.whl → 0.10.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. optimum/rbln/__init__.py +44 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +230 -67
  4. optimum/rbln/diffusers/models/controlnet.py +2 -2
  5. optimum/rbln/diffusers/models/transformers/prior_transformer.py +2 -2
  6. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +2 -2
  7. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +2 -2
  8. optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -3
  9. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +3 -12
  10. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +2 -4
  11. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +1 -3
  12. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1 -3
  13. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +2 -2
  14. optimum/rbln/modeling_base.py +11 -10
  15. optimum/rbln/ops/__init__.py +1 -0
  16. optimum/rbln/ops/attn.py +10 -0
  17. optimum/rbln/ops/flash_attn.py +8 -0
  18. optimum/rbln/ops/moe.py +180 -0
  19. optimum/rbln/ops/sliding_window_attn.py +9 -0
  20. optimum/rbln/transformers/__init__.py +44 -0
  21. optimum/rbln/transformers/modeling_attention_utils.py +124 -222
  22. optimum/rbln/transformers/modeling_outputs.py +25 -0
  23. optimum/rbln/transformers/modeling_rope_utils.py +78 -42
  24. optimum/rbln/transformers/models/__init__.py +38 -0
  25. optimum/rbln/transformers/models/auto/auto_factory.py +3 -3
  26. optimum/rbln/transformers/models/bart/bart_architecture.py +24 -24
  27. optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +7 -2
  28. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +1 -1
  29. optimum/rbln/transformers/models/colpali/colpali_architecture.py +14 -20
  30. optimum/rbln/transformers/models/colpali/configuration_colpali.py +12 -17
  31. optimum/rbln/transformers/models/colpali/modeling_colpali.py +66 -182
  32. optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +40 -23
  33. optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +107 -371
  34. optimum/rbln/transformers/models/decoderonly/__init__.py +2 -0
  35. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +144 -17
  36. optimum/rbln/transformers/models/decoderonly/configuration_lora.py +1 -1
  37. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +122 -48
  38. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +5 -7
  39. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +120 -128
  40. optimum/rbln/transformers/models/detr/__init__.py +23 -0
  41. optimum/rbln/transformers/models/detr/configuration_detr.py +38 -0
  42. optimum/rbln/transformers/models/detr/modeling_detr.py +53 -0
  43. optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -36
  44. optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
  45. optimum/rbln/transformers/models/gemma2/__init__.py +16 -0
  46. optimum/rbln/transformers/models/gemma2/configuration_gemma2.py +45 -0
  47. optimum/rbln/transformers/models/gemma2/gemma2_architecture.py +83 -0
  48. optimum/rbln/transformers/models/gemma2/modeling_gemma2.py +101 -0
  49. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +2 -7
  50. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +16 -18
  51. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +5 -177
  52. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +8 -34
  53. optimum/rbln/transformers/models/gpt_oss/__init__.py +16 -0
  54. optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +42 -0
  55. optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py +122 -0
  56. optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +168 -0
  57. optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +8 -5
  58. optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +6 -4
  59. optimum/rbln/transformers/models/llava/modeling_llava.py +0 -1
  60. optimum/rbln/transformers/models/midm/midm_architecture.py +29 -22
  61. optimum/rbln/transformers/models/mixtral/__init__.py +16 -0
  62. optimum/rbln/transformers/models/mixtral/configuration_mixtral.py +38 -0
  63. optimum/rbln/transformers/models/mixtral/mixtral_architecture.py +76 -0
  64. optimum/rbln/transformers/models/mixtral/modeling_mixtral.py +68 -0
  65. optimum/rbln/transformers/models/opt/opt_architecture.py +1 -44
  66. optimum/rbln/transformers/models/paligemma/__init__.py +16 -0
  67. optimum/rbln/transformers/models/paligemma/configuration_paligemma.py +129 -0
  68. optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +564 -0
  69. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +24 -24
  70. optimum/rbln/transformers/models/phi/phi_architecture.py +13 -21
  71. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +9 -5
  72. optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +6 -1
  73. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +13 -1
  74. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +271 -122
  75. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +43 -39
  76. optimum/rbln/transformers/models/qwen2_moe/__init__.py +16 -0
  77. optimum/rbln/transformers/models/qwen2_moe/configuration_qwen2_moe.py +38 -0
  78. optimum/rbln/transformers/models/qwen2_moe/modeling_qwen2_moe.py +68 -0
  79. optimum/rbln/transformers/models/qwen2_moe/qwen2_moe_architecture.py +94 -0
  80. optimum/rbln/transformers/models/qwen2_vl/__init__.py +6 -1
  81. optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +13 -1
  82. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +263 -105
  83. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +26 -34
  84. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +7 -7
  85. optimum/rbln/transformers/models/qwen3_moe/__init__.py +16 -0
  86. optimum/rbln/transformers/models/qwen3_moe/configuration_qwen3_moe.py +38 -0
  87. optimum/rbln/transformers/models/qwen3_moe/modeling_qwen3_moe.py +68 -0
  88. optimum/rbln/transformers/models/qwen3_moe/qwen3_moe_architecture.py +100 -0
  89. optimum/rbln/transformers/models/resnet/configuration_resnet.py +10 -4
  90. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +14 -12
  91. optimum/rbln/transformers/models/siglip/modeling_siglip.py +4 -18
  92. optimum/rbln/transformers/models/swin/configuration_swin.py +1 -6
  93. optimum/rbln/transformers/models/t5/t5_architecture.py +15 -16
  94. optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -3
  95. optimum/rbln/transformers/models/whisper/generation_whisper.py +8 -8
  96. optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -3
  97. optimum/rbln/transformers/utils/rbln_quantization.py +20 -12
  98. optimum/rbln/utils/deprecation.py +78 -1
  99. optimum/rbln/utils/hub.py +93 -2
  100. optimum/rbln/utils/import_utils.py +16 -1
  101. optimum/rbln/utils/runtime_utils.py +12 -8
  102. optimum/rbln/utils/submodule.py +24 -0
  103. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.10.0.post1.dist-info}/METADATA +6 -6
  104. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.10.0.post1.dist-info}/RECORD +107 -81
  105. optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +0 -233
  106. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.10.0.post1.dist-info}/WHEEL +0 -0
  107. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.10.0.post1.dist-info}/entry_points.txt +0 -0
  108. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.10.0.post1.dist-info}/licenses/LICENSE +0 -0

optimum/rbln/diffusers/pipelines/auto_pipeline.py CHANGED
@@ -176,7 +176,7 @@ class RBLNAutoPipelineBase:
  export: bool = None,
  rbln_config: Optional[Union[Dict[str, Any], RBLNModelConfig]] = None,
  **kwargs: Any,
- ):
+ ) -> RBLNBaseModel:
  """
  Load an RBLN-accelerated Diffusers pipeline from a pretrained checkpoint or a compiled RBLN artifact.

@@ -201,8 +201,7 @@ class RBLNAutoPipelineBase:
  - Remaining arguments are forwarded to the Diffusers loader.

  Returns:
- RBLNBaseModel: An instantiated RBLN model wrapping the Diffusers pipeline, ready for
- inference on RBLN NPUs.
+ RBLNBaseModel: An instantiated RBLN model wrapping the Diffusers pipeline, ready for inference on RBLN NPUs.

  """
  rbln_cls = cls.get_rbln_cls(model_id, export=export, **kwargs)
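
Note: a minimal usage sketch of the updated `from_pretrained` signature. The pipeline class and checkpoint ID below are illustrative assumptions, not taken from this diff.

    from optimum.rbln import RBLNAutoPipelineForText2Image  # assumed concrete auto class

    # The loader is now annotated to return an RBLNBaseModel. export=True compiles
    # the PyTorch checkpoint for RBLN NPUs before loading it as a runtime.
    pipe = RBLNAutoPipelineForText2Image.from_pretrained(
        "runwayml/stable-diffusion-v1-5",  # illustrative checkpoint
        export=True,
    )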

optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py CHANGED
@@ -26,7 +26,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- from typing import Any, Callable, Dict, List, Optional, Union
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union

  import torch
  import torch.nn.functional as F
@@ -260,7 +260,7 @@ class RBLNStableDiffusionControlNetPipeline(RBLNDiffusionMixin, StableDiffusionC
  callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
  callback_on_step_end_tensor_inputs: Optional[List[str]] = None,
  **kwargs,
- ):
+ ) -> Union[StableDiffusionPipelineOutput, Tuple]:
  r"""
  The call function to the pipeline for generation.

@@ -321,14 +321,7 @@ class RBLNStableDiffusionControlNetPipeline(RBLNDiffusionMixin, StableDiffusionC
  output_type (`str`, *optional*, defaults to `"pil"`):
  The output format of the generated image. Choose between `PIL.Image` or `np.array`.
  return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
- plain tuple.
- callback (`Callable`, *optional*):
- A function that calls every `callback_steps` steps during inference. The function is called with the
- following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
- callback_steps (`int`, *optional*, defaults to 1):
- The frequency at which the `callback` function is called. If not specified, the callback is called at
- every step.
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a plain tuple.
  cross_attention_kwargs (`dict`, *optional*):
  A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
  [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
@@ -356,8 +349,6 @@ class RBLNStableDiffusionControlNetPipeline(RBLNDiffusionMixin, StableDiffusionC
  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
  `._callback_tensor_inputs` attribute of your pipeine class.

- Examples:
-
  Returns:
  [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
  If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
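
Note: the new return annotation documents existing behavior; `return_dict` still decides which form you get. A hedged sketch, assuming `pipe` is a loaded RBLNStableDiffusionControlNetPipeline and `control_image` a prepared conditioning image:

    # Default (return_dict=True): a StableDiffusionPipelineOutput with .images.
    out = pipe(prompt="a photo of an astronaut", image=control_image)
    images = out.images

    # return_dict=False: a plain tuple with the images first.
    images, *rest = pipe(prompt="a photo of an astronaut", image=control_image, return_dict=False)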

optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py CHANGED
@@ -26,7 +26,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- from typing import Any, Callable, Dict, List, Optional, Union
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union

  import torch
  import torch.nn.functional as F
@@ -253,7 +253,7 @@ class RBLNStableDiffusionControlNetImg2ImgPipeline(RBLNDiffusionMixin, StableDif
  callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
  callback_on_step_end_tensor_inputs: Optional[List[str]] = None,
  **kwargs,
- ):
+ ) -> Union[StableDiffusionPipelineOutput, Tuple]:
  r"""
  The call function to the pipeline for generation.

@@ -347,8 +347,6 @@ class RBLNStableDiffusionControlNetImg2ImgPipeline(RBLNDiffusionMixin, StableDif
  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
  `._callback_tensor_inputs` attribute of your pipeine class.

- Examples:
-
  Returns:
  [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
  If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,

optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py CHANGED
@@ -294,7 +294,7 @@ class RBLNStableDiffusionXLControlNetPipeline(RBLNDiffusionMixin, StableDiffusio
  callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
  callback_on_step_end_tensor_inputs: Optional[List[str]] = None,
  **kwargs,
- ):
+ ) -> Union[StableDiffusionXLPipelineOutput, Tuple]:
  r"""
  The call function to the pipeline for generation.

@@ -431,8 +431,6 @@ class RBLNStableDiffusionXLControlNetPipeline(RBLNDiffusionMixin, StableDiffusio
  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
  `._callback_tensor_inputs` attribute of your pipeine class.

- Examples:
-
  Returns:
  [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
  If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,

optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py CHANGED
@@ -309,7 +309,7 @@ class RBLNStableDiffusionXLControlNetImg2ImgPipeline(RBLNDiffusionMixin, StableD
  callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
  callback_on_step_end_tensor_inputs: Optional[List[str]] = None,
  **kwargs,
- ):
+ ) -> Union[StableDiffusionXLPipelineOutput, Tuple]:
  r"""
  Function invoked when calling the pipeline for generation.

@@ -465,8 +465,6 @@ class RBLNStableDiffusionXLControlNetImg2ImgPipeline(RBLNDiffusionMixin, StableD
  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
  `._callback_tensor_inputs` attribute of your pipeine class.

- Examples:
-
  Returns:
  [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
  [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`

optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py CHANGED
@@ -203,7 +203,7 @@ class RBLNRetinaFaceFilter(RetinaFaceFilter):
  f"If you only need to compile the model without loading it to NPU, you can use:\n"
  f" from_pretrained(..., rbln_create_runtimes=False) or\n"
  f" from_pretrained(..., rbln_config={{..., 'create_runtimes': False}})\n\n"
- f"To check your NPU status, run the 'rbln-stat' command in your terminal.\n"
+ f"To check your NPU status, run the 'rbln-smi' command in your terminal.\n"
  f"Make sure your NPU is properly installed and operational."
  )
  raise rebel.core.exception.RBLNRuntimeError(error_msg) from e
@@ -278,7 +278,7 @@ class RBLNVideoSafetyModel(VideoSafetyModel):
  f"If you only need to compile the model without loading it to NPU, you can use:\n"
  f" from_pretrained(..., rbln_create_runtimes=False) or\n"
  f" from_pretrained(..., rbln_config={{..., 'create_runtimes': False}})\n\n"
- f"To check your NPU status, run the 'rbln-stat' command in your terminal.\n"
+ f"To check your NPU status, run the 'rbln-smi' command in your terminal.\n"
  f"Make sure your NPU is properly installed and operational."
  )
  raise rebel.core.exception.RBLNRuntimeError(error_msg) from e

optimum/rbln/modeling_base.py CHANGED
@@ -24,7 +24,7 @@ import torch
  from transformers import AutoConfig, AutoModel, GenerationConfig, PretrainedConfig
  from transformers.utils.hub import PushToHubMixin

- from .configuration_utils import RBLNAutoConfig, RBLNCompileConfig, RBLNModelConfig, get_rbln_config_class
+ from .configuration_utils import RBLNCompileConfig, RBLNModelConfig, get_rbln_config_class
  from .utils.hub import pull_compiled_model_from_hub, validate_files
  from .utils.logging import get_logger
  from .utils.runtime_utils import UnavailableRuntime, tp_and_devices_are_ok
@@ -90,7 +90,7 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):

  self.device = torch.device("cpu")
  self.training = False
- self.dtype = rbln_config.torch_dtype
+ self.dtype = rbln_config.dtype

  # FIXME :: model_save_dir is not used after initialized. (This can be used when save/load)
  # This attribute is needed to keep one reference on the temporary directory, since garbage collecting it
@@ -206,8 +206,9 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
  f"does not match the expected model class name ({cls.__name__})."
  )

- rbln_config, kwargs = RBLNAutoConfig.load(
- model_path_subfolder, passed_rbln_config=rbln_config, kwargs=kwargs, return_unused_kwargs=True
+ config_cls = cls.get_rbln_config_class()
+ rbln_config, kwargs = config_cls.from_pretrained(
+ model_path_subfolder, rbln_config=rbln_config, return_unused_kwargs=True, **kwargs
  )

  if rbln_config.rbln_model_cls_name != cls.__name__:
@@ -223,8 +224,6 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
  elif rbln_submodules is None:
  rbln_submodules = []

- rbln_config.freeze()
-
  if config is None:
  if cls.hf_library_name == "transformers":
  config = AutoConfig.from_pretrained(
@@ -308,11 +307,13 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
  f"If you only need to compile the model without loading it to NPU, you can use:\n"
  f" from_pretrained(..., rbln_create_runtimes=False) or\n"
  f" from_pretrained(..., rbln_config={{..., 'create_runtimes': False}})\n\n"
- f"To check your NPU status, run the 'rbln-stat' command in your terminal.\n"
+ f"To check your NPU status, run the 'rbln-smi' command in your terminal.\n"
  f"Make sure your NPU is properly installed and operational."
  )
  raise rebel.core.exception.RBLNRuntimeError(error_msg) from e

+ rbln_config.freeze()
+
  return cls(
  models,
  config,
@@ -451,15 +452,15 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
  model_config: "PretrainedConfig",
  rbln_config: RBLNModelConfig,
  ) -> RBLNModelConfig:
- rbln_config.torch_dtype = model.dtype
- if not cls._supports_non_fp32 and rbln_config.torch_dtype != torch.float32:
+ rbln_config.dtype = model.dtype
+ if not cls._supports_non_fp32 and rbln_config.dtype != torch.float32:
  raise NotImplementedError(
  f"Currently, {cls.__name__} does not support non-fp32 dtype. Please use float32 dtype."
  )
  rbln_config = cls._update_rbln_config(
  preprocessors=preprocessors, model=model, model_config=model_config, rbln_config=rbln_config
  )
- rbln_config.freeze()
+
  if rbln_config.rbln_model_cls_name != cls.__name__:
  raise NameError(
  f"Cannot get the rbln config. {cls.__name__} is not the same as {rbln_config.rbln_model_cls_name}. "

optimum/rbln/ops/__init__.py CHANGED
@@ -16,4 +16,5 @@ from .attn import *
  from .flash_attn import *
  from .kv_cache_update import *
  from .linear import linear
+ from .moe import *
  from .sliding_window_attn import *
optimum/rbln/ops/attn.py CHANGED
@@ -205,6 +205,7 @@ def paged_causal_attn_decode(
  block_table: Tensor,
  block_size: int,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  """Defines the computation pattern for fused attention with KV cache updates.

@@ -228,6 +229,7 @@
  - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
  - block_size: [] - Number of tokens per block
  - mask: [batch=1, max_seq_len] - attention mask when use position_ids
+ - s_aux: [num_attention_heads, sink_len] - auxiliary states for attention

  Returns:
  Tensor: attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
@@ -247,6 +249,7 @@ def paged_causal_attn_decode_fake(
  block_table: Tensor,
  block_size: int,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)

@@ -267,6 +270,7 @@ def paged_causal_attn_prefill(
  block_size: int,
  is_bidirectional: bool,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  """Defines the computation pattern for prefill phase attention with KV cache updates.

@@ -290,6 +294,7 @@
  - block_size: [] - Number of tokens per block
  - is_bidirectional: [] - Whether the attention is bidirectional at current sequence position
  - mask: [batch=1, max_seq_len] - attention mask when use position_ids
+ - s_aux: [num_attention_heads, sink_len] - auxiliary states for attention

  Returns:
  Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
@@ -310,6 +315,7 @@ def paged_causal_attn_prefill_fake(
  block_size: int,
  is_bidirectional: bool,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)

@@ -331,6 +337,7 @@ def paged_causal_attn_decode_kv_fp8(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)

@@ -349,6 +356,7 @@ def paged_causal_attn_decode_kv_fp8_fake(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)

@@ -371,6 +379,7 @@ def paged_causal_attn_prefill_kv_fp8(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)

@@ -390,6 +399,7 @@ def paged_causal_attn_prefill_kv_fp8_fake(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
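
Note: every attention op now accepts an optional s_aux tensor of shape [num_attention_heads, sink_len]. These ops are compiler-facing stubs, so the diff does not define the math; the sketch below shows one conventional way such auxiliary "attention sink" logits fold into softmax normalization, offered as an assumption rather than the kernel's actual semantics.

    import torch

    def softmax_with_sink(scores: torch.Tensor, s_aux: torch.Tensor) -> torch.Tensor:
        # scores: [n_heads, q_len, kv_len] raw attention logits
        # s_aux:  [n_heads, sink_len] per-head sink logits, broadcast over queries
        sink = s_aux[:, None, :].expand(-1, scores.shape[1], -1)
        combined = torch.cat([scores, sink], dim=-1)
        probs = torch.softmax(combined, dim=-1)
        # Sink columns absorb probability mass but contribute no value output.
        return probs[..., : scores.shape[-1]]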

optimum/rbln/ops/flash_attn.py CHANGED
@@ -198,6 +198,7 @@ def paged_flash_causal_attn_decode(
  block_size: int,
  partition: int,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  """Defines the computation pattern for fused causal flash attention with KV cache for decoding.

@@ -219,6 +220,7 @@ def paged_flash_causal_attn_decode_fake(
  block_size: int,
  partition: int,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)

@@ -241,6 +243,7 @@ def paged_flash_causal_attn_decode_kv_fp8(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)

@@ -260,6 +263,7 @@ def paged_flash_causal_attn_decode_kv_fp8_fake(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)

@@ -281,6 +285,7 @@ def paged_flash_causal_attn_prefill(
  partition: int,
  is_bidirectional: bool,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  """Defines the computation pattern for fused causal flash attention with KV cache for prefill.

@@ -303,6 +308,7 @@ def paged_flash_causal_attn_prefill_fake(
  partition: int,
  is_bidirectional: bool,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)

@@ -326,6 +332,7 @@ def paged_flash_causal_attn_prefill_kv_fp8(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)

@@ -346,5 +353,6 @@ def paged_flash_causal_attn_prefill_kv_fp8_fake(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)

optimum/rbln/ops/moe.py ADDED
@@ -0,0 +1,180 @@
+ # Copyright 2025 Rebellions Inc. All rights reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at:
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Optional
+
+ import torch
+ from torch import Tensor
+
+
+ @torch.library.custom_op(
+ "rbln_custom_ops::custom_moe_glu",
+ mutates_args=(),
+ )
+ def custom_moe_glu(
+ hidden_states: Tensor,
+ gate_proj_weight: Tensor,
+ up_proj_weight: Tensor,
+ down_proj_weight: Tensor,
+ router_logits: Tensor,
+ topk: int,
+ norm_topk_prob: bool,
+ gate_proj_bias: Optional[Tensor] = None,
+ up_proj_bias: Optional[Tensor] = None,
+ down_proj_bias: Optional[Tensor] = None,
+ ) -> Tensor:
+ """
+ Customized MoE GLU operation.
+
+ Expected tensor shapes:
+ - hidden_states: [batch*seq_len, hidden_size]
+ - gate_proj_weight: [num_experts, hidden_size, intermediate_size]
+ - up_proj_weight: [num_experts, hidden_size, intermediate_size]
+ - down_proj_weight: [num_experts, intermediate_size, hidden_size]
+ - router_logits: [batch*seq_len, num_experts]
+ - topk: top k experts to select
+ - norm_topk_prob: whether to normalize the top k routing weights with softmax
+ - gate_proj_bias: [num_experts, intermediate_size]
+ - up_proj_bias: [num_experts, intermediate_size]
+ - down_proj_bias: [num_experts, hidden_size]
+
+ Returns:
+ Tensor: [batch * seq_len, hidden_size]
+ """
+
+ return torch.empty_like(hidden_states)
+
+
+ @custom_moe_glu.register_fake
+ def custom_moe_glu_fake(
+ hidden_states: Tensor,
+ gate_proj_weight: Tensor,
+ up_proj_weight: Tensor,
+ down_proj_weight: Tensor,
+ router_logits: Tensor,
+ topk: int,
+ norm_topk_prob: bool,
+ gate_proj_bias: Optional[Tensor] = None,
+ up_proj_bias: Optional[Tensor] = None,
+ down_proj_bias: Optional[Tensor] = None,
+ ) -> Tensor:
+ return torch.empty_like(hidden_states)
+
+
+ @torch.library.custom_op(
+ "rbln_custom_ops::custom_moe_ff",
+ mutates_args=(),
+ )
+ def custom_moe_ff(
+ hidden_states: Tensor,
+ gate_proj_weight: Tensor,
+ down_proj_weight: Tensor,
+ masked_routing_weight: Tensor,
+ gate_proj_bias: Optional[Tensor] = None,
+ down_proj_bias: Optional[Tensor] = None,
+ ) -> Tensor:
+ """
+ Customized MoE FF operation.
+
+ Expected tensor shapes:
+ - hidden_states: [batch * seq_len, hidden_size]
+ - gate_proj_weight: [hidden_size, num_experts * intermediate_size]
+ - down_proj_weight: [num_experts * intermediate_size, hidden_size]
+ - masked_routing_weight: [batch * seq_len, num_experts]
+ - gate_proj_bias: [num_experts * intermediate_size]
+ - down_proj_bias: [hidden_size]
+
+ Returns:
+ Tensor: [batch * seq_len, hidden_size]
+ """
+ return torch.empty_like(hidden_states)
+
+
+ @custom_moe_ff.register_fake
+ def custom_moe_ff_fake(
+ hidden_states: Tensor,
+ gate_proj_weight: Tensor,
+ down_proj_weight: Tensor,
+ masked_routing_weight: Tensor,
+ gate_proj_bias: Optional[Tensor] = None,
+ down_proj_bias: Optional[Tensor] = None,
+ ) -> Tensor:
+ return torch.empty_like(hidden_states)
+
+
+ @torch.library.custom_op(
+ "rbln_custom_ops::custom_moe_glu_mxfp4",
+ mutates_args=(),
+ )
+ def custom_moe_glu_mxfp4(
+ hidden_states: Tensor,
+ gate_proj_blocks: Tensor,
+ gate_proj_scales: Tensor,
+ gate_proj_bias: Tensor,
+ up_proj_blocks: Tensor,
+ up_proj_scales: Tensor,
+ up_proj_bias: Tensor,
+ down_proj_blocks: Tensor,
+ down_proj_scales: Tensor,
+ down_proj_bias: Tensor,
+ router_logits: Tensor,
+ alpha: Tensor,
+ limit: Tensor,
+ k: int,
+ post_norm: bool,
+ ) -> Tensor:
+ """
+ Customized MoE GLU operation.
+
+ Expected tensor shapes:
+ - hidden_states: [batch*seq_len, hidden_size]
+ - gate_proj_blocks: [num_experts, intermediate_size, hidden_size // 2]
+ - gate_proj_scales: [num_experts, intermediate_size, hidden_size // 32]
+ - gate_proj_bias: [num_experts, intermediate_size]
+ - up_proj_blocks: [num_experts, intermediate_size, hidden_size // 2]
+ - up_proj_scales: [num_experts, intermediate_size, hidden_size // 32]
+ - up_proj_bias: [num_experts, intermediate_size]
+ - down_proj_blocks: [num_experts, hidden_size, intermediate_size // 2]
+ - down_proj_scales: [num_experts, hidden_size, intermediate_size // 32]
+ - masked_routing_weight: [batch * seq_len, num_experts]
+ - expert_select_count: [num_experts]
+ - alpha: []
+ - limit: []
+
+ Returns:
+ Tensor: [batch * seq_len, hidden_size]
+ """
+
+ return torch.empty_like(hidden_states)
+
+
+ @custom_moe_glu_mxfp4.register_fake
+ def custom_moe_glu_mxfp4_fake(
+ hidden_states: Tensor,
+ gate_proj_blocks: Tensor,
+ gate_proj_scales: Tensor,
+ gate_proj_bias: Tensor,
+ up_proj_blocks: Tensor,
+ up_proj_scales: Tensor,
+ up_proj_bias: Tensor,
+ down_proj_blocks: Tensor,
+ down_proj_scales: Tensor,
+ down_proj_bias: Tensor,
+ router_logits: Tensor,
+ alpha: Tensor,
+ limit: Tensor,
+ k: int,
+ post_norm: bool,
+ ) -> Tensor:
+ return torch.empty_like(hidden_states)
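
Note: like the attention ops, these MoE ops are stubs whose eager bodies return torch.empty_like(hidden_states); the real kernels are pattern-matched in by the RBLN compiler. A minimal smoke-test sketch using the documented shapes (values illustrative; assumes optimum.rbln.ops imports cleanly in your environment):

    import torch
    import optimum.rbln.ops  # registers rbln_custom_ops::custom_moe_glu et al.

    tokens, hidden, inter, n_experts, topk = 4, 64, 128, 8, 2
    hidden_states = torch.randn(tokens, hidden)
    gate_w = torch.randn(n_experts, hidden, inter)
    up_w = torch.randn(n_experts, hidden, inter)
    down_w = torch.randn(n_experts, inter, hidden)
    router_logits = torch.randn(tokens, n_experts)

    # Eagerly this only exercises registration and shape propagation; the result
    # is an uninitialized placeholder outside RBLN compilation.
    out = torch.ops.rbln_custom_ops.custom_moe_glu(
        hidden_states, gate_w, up_w, down_w, router_logits, topk, True
    )
    assert out.shape == hidden_states.shape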

optimum/rbln/ops/sliding_window_attn.py CHANGED
@@ -13,6 +13,8 @@
  # limitations under the License.


+ from typing import Optional
+
  import torch
  from torch import Tensor

@@ -33,6 +35,7 @@ def paged_sliding_window_attn_prefill(
  block_table: Tensor,
  block_size: int,
  is_bidirectional: bool,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  """Defines the computation pattern for prefill phase attention with KV cache updates.

@@ -53,6 +56,7 @@
  - cache_offset: [] - The valid length in the combined sequence of the KV cache and the current projected key states.
  - scale: [] - Attention scale factor
  - is_bidirectional: [] - Whether the attention is bidirectional
+ - s_aux: [num_attention_heads, sink_len] - auxiliary states for attention
  Returns:
  Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
  """
@@ -72,6 +76,7 @@ def paged_sliding_window_attn_prefill_fake(
  block_table: Tensor,
  block_size: int,
  is_bidirectional: bool,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)

@@ -91,6 +96,8 @@ def paged_sliding_window_attn_decode(
  scale: Tensor,
  block_table: Tensor,
  block_size: int,
+ attn_mask: Tensor,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)

@@ -107,5 +114,7 @@ def paged_sliding_window_attn_decode_fake(
  scale: Tensor,
  block_table: Tensor,
  block_size: int,
+ attn_mask: Tensor,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)