optimum-rbln 0.9.4a2__py3-none-any.whl → 0.9.5a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. optimum/rbln/__init__.py +36 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +35 -16
  4. optimum/rbln/modeling_base.py +6 -6
  5. optimum/rbln/ops/__init__.py +1 -0
  6. optimum/rbln/ops/attn.py +10 -0
  7. optimum/rbln/ops/flash_attn.py +8 -0
  8. optimum/rbln/ops/moe.py +180 -0
  9. optimum/rbln/ops/sliding_window_attn.py +9 -0
  10. optimum/rbln/transformers/__init__.py +36 -0
  11. optimum/rbln/transformers/modeling_attention_utils.py +118 -222
  12. optimum/rbln/transformers/modeling_outputs.py +25 -0
  13. optimum/rbln/transformers/modeling_rope_utils.py +78 -42
  14. optimum/rbln/transformers/models/__init__.py +28 -0
  15. optimum/rbln/transformers/models/bart/bart_architecture.py +24 -24
  16. optimum/rbln/transformers/models/colpali/colpali_architecture.py +14 -20
  17. optimum/rbln/transformers/models/colpali/configuration_colpali.py +12 -17
  18. optimum/rbln/transformers/models/colpali/modeling_colpali.py +66 -182
  19. optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +38 -21
  20. optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +107 -371
  21. optimum/rbln/transformers/models/decoderonly/__init__.py +2 -0
  22. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +118 -16
  23. optimum/rbln/transformers/models/decoderonly/configuration_lora.py +1 -1
  24. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +121 -48
  25. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +5 -7
  26. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +75 -107
  27. optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -36
  28. optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
  29. optimum/rbln/transformers/models/gemma2/__init__.py +16 -0
  30. optimum/rbln/transformers/models/gemma2/configuration_gemma2.py +45 -0
  31. optimum/rbln/transformers/models/gemma2/gemma2_architecture.py +83 -0
  32. optimum/rbln/transformers/models/gemma2/modeling_gemma2.py +101 -0
  33. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +16 -18
  34. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +1 -1
  35. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +8 -34
  36. optimum/rbln/transformers/models/gpt_oss/__init__.py +16 -0
  37. optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +41 -0
  38. optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py +122 -0
  39. optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +165 -0
  40. optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +8 -5
  41. optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +6 -4
  42. optimum/rbln/transformers/models/llava/modeling_llava.py +0 -1
  43. optimum/rbln/transformers/models/midm/midm_architecture.py +29 -22
  44. optimum/rbln/transformers/models/opt/opt_architecture.py +1 -44
  45. optimum/rbln/transformers/models/paligemma/__init__.py +16 -0
  46. optimum/rbln/transformers/models/paligemma/configuration_paligemma.py +129 -0
  47. optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +564 -0
  48. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +24 -24
  49. optimum/rbln/transformers/models/phi/phi_architecture.py +13 -21
  50. optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +6 -1
  51. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +11 -1
  52. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +271 -122
  53. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +43 -39
  54. optimum/rbln/transformers/models/qwen2_moe/__init__.py +16 -0
  55. optimum/rbln/transformers/models/qwen2_moe/configuration_qwen2_moe.py +38 -0
  56. optimum/rbln/transformers/models/qwen2_moe/modeling_qwen2_moe.py +68 -0
  57. optimum/rbln/transformers/models/qwen2_moe/qwen2_moe_architecture.py +94 -0
  58. optimum/rbln/transformers/models/qwen2_vl/__init__.py +6 -1
  59. optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +11 -1
  60. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +263 -105
  61. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +26 -34
  62. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +7 -7
  63. optimum/rbln/transformers/models/qwen3_moe/__init__.py +16 -0
  64. optimum/rbln/transformers/models/qwen3_moe/configuration_qwen3_moe.py +38 -0
  65. optimum/rbln/transformers/models/qwen3_moe/modeling_qwen3_moe.py +68 -0
  66. optimum/rbln/transformers/models/qwen3_moe/qwen3_moe_architecture.py +100 -0
  67. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +14 -12
  68. optimum/rbln/transformers/models/siglip/modeling_siglip.py +4 -18
  69. optimum/rbln/transformers/models/swin/configuration_swin.py +1 -6
  70. optimum/rbln/transformers/models/t5/t5_architecture.py +15 -16
  71. optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -3
  72. optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -3
  73. optimum/rbln/transformers/utils/rbln_quantization.py +20 -12
  74. optimum/rbln/utils/import_utils.py +16 -1
  75. optimum/rbln/utils/runtime_utils.py +10 -6
  76. optimum/rbln/utils/submodule.py +24 -0
  77. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/METADATA +6 -6
  78. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/RECORD +81 -62
  79. optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +0 -233
  80. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/WHEEL +0 -0
  81. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/entry_points.txt +0 -0
  82. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/__init__.py CHANGED
@@ -91,6 +91,10 @@ _import_structure = {
  "RBLNGemmaModel",
  "RBLNGemmaModelConfig",
  "RBLNGemmaForCausalLM",
+ "RBLNGemma2ForCausalLM",
+ "RBLNGemma2ForCausalLMConfig",
+ "RBLNGemma2Model",
+ "RBLNGemma2ModelConfig",
  "RBLNGemmaForCausalLMConfig",
  "RBLNGemma3ForCausalLM",
  "RBLNGemma3ForCausalLMConfig",
@@ -100,6 +104,8 @@ _import_structure = {
  "RBLNGPT2ModelConfig",
  "RBLNGPT2LMHeadModel",
  "RBLNGPT2LMHeadModelConfig",
+ "RBLNGptOssForCausalLM",
+ "RBLNGptOssForCausalLMConfig",
  "RBLNGroundingDinoDecoder",
  "RBLNGroundingDinoDecoderConfig",
  "RBLNGroundingDinoForObjectDetection",
@@ -140,14 +146,24 @@ _import_structure = {
  "RBLNPixtralVisionModelConfig",
  "RBLNPhiModel",
  "RBLNPhiModelConfig",
+ "RBLNPaliGemmaForConditionalGeneration",
+ "RBLNPaliGemmaForConditionalGenerationConfig",
+ "RBLNPaliGemmaModel",
+ "RBLNPaliGemmaModelConfig",
  "RBLNQwen2ForCausalLM",
  "RBLNQwen2ForCausalLMConfig",
  "RBLNQwen2_5_VisionTransformerPretrainedModel",
  "RBLNQwen2_5_VisionTransformerPretrainedModelConfig",
  "RBLNQwen2_5_VLForConditionalGeneration",
  "RBLNQwen2_5_VLForConditionalGenerationConfig",
+ "RBLNQwen3MoeForCausalLM",
+ "RBLNQwen3MoeForCausalLMConfig",
+ "RBLNQwen2_5_VLModel",
+ "RBLNQwen2_5_VLModelConfig",
  "RBLNQwen2Model",
  "RBLNQwen2ModelConfig",
+ "RBLNQwen2MoeForCausalLM",
+ "RBLNQwen2MoeForCausalLMConfig",
  "RBLNQwen3ForCausalLM",
  "RBLNQwen3ForCausalLMConfig",
  "RBLNQwen3Model",
@@ -156,6 +172,8 @@ _import_structure = {
  "RBLNQwen2VisionTransformerPretrainedModelConfig",
  "RBLNQwen2VLForConditionalGeneration",
  "RBLNQwen2VLForConditionalGenerationConfig",
+ "RBLNQwen2VLModel",
+ "RBLNQwen2VLModelConfig",
  "RBLNResNetForImageClassification",
  "RBLNResNetForImageClassificationConfig",
  "RBLNRobertaForMaskedLM",
@@ -394,6 +412,10 @@ if TYPE_CHECKING:
  RBLNDPTForDepthEstimationConfig,
  RBLNExaoneForCausalLM,
  RBLNExaoneForCausalLMConfig,
+ RBLNGemma2ForCausalLM,
+ RBLNGemma2ForCausalLMConfig,
+ RBLNGemma2Model,
+ RBLNGemma2ModelConfig,
  RBLNGemma3ForCausalLM,
  RBLNGemma3ForCausalLMConfig,
  RBLNGemma3ForConditionalGeneration,
@@ -406,6 +428,8 @@ if TYPE_CHECKING:
  RBLNGPT2LMHeadModelConfig,
  RBLNGPT2Model,
  RBLNGPT2ModelConfig,
+ RBLNGptOssForCausalLM,
+ RBLNGptOssForCausalLMConfig,
  RBLNGroundingDinoDecoder,
  RBLNGroundingDinoDecoderConfig,
  RBLNGroundingDinoEncoder,
@@ -436,6 +460,10 @@ if TYPE_CHECKING:
  RBLNOPTForCausalLMConfig,
  RBLNOPTModel,
  RBLNOPTModelConfig,
+ RBLNPaliGemmaForConditionalGeneration,
+ RBLNPaliGemmaForConditionalGenerationConfig,
+ RBLNPaliGemmaModel,
+ RBLNPaliGemmaModelConfig,
  RBLNPegasusForConditionalGeneration,
  RBLNPegasusForConditionalGenerationConfig,
  RBLNPegasusModel,
@@ -450,18 +478,26 @@ if TYPE_CHECKING:
  RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
  RBLNQwen2_5_VLForConditionalGeneration,
  RBLNQwen2_5_VLForConditionalGenerationConfig,
+ RBLNQwen2_5_VLModel,
+ RBLNQwen2_5_VLModelConfig,
  RBLNQwen2ForCausalLM,
  RBLNQwen2ForCausalLMConfig,
  RBLNQwen2Model,
  RBLNQwen2ModelConfig,
+ RBLNQwen2MoeForCausalLM,
+ RBLNQwen2MoeForCausalLMConfig,
  RBLNQwen2VisionTransformerPretrainedModel,
  RBLNQwen2VisionTransformerPretrainedModelConfig,
  RBLNQwen2VLForConditionalGeneration,
  RBLNQwen2VLForConditionalGenerationConfig,
+ RBLNQwen2VLModel,
+ RBLNQwen2VLModelConfig,
  RBLNQwen3ForCausalLM,
  RBLNQwen3ForCausalLMConfig,
  RBLNQwen3Model,
  RBLNQwen3ModelConfig,
+ RBLNQwen3MoeForCausalLM,
+ RBLNQwen3MoeForCausalLMConfig,
  RBLNResNetForImageClassification,
  RBLNResNetForImageClassificationConfig,
  RBLNRobertaForMaskedLM,
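
The new entries register the Gemma-2, GPT-OSS, PaliGemma, Qwen2-MoE, and Qwen3-MoE classes, plus the plain Qwen2-VL / Qwen2.5-VL model classes and their configs, as top-level exports. A minimal, hedged sketch of loading one of them; the checkpoint id is a placeholder, and the export=True / rbln_config keywords follow the usual optimum-rbln from_pretrained pattern rather than anything introduced in this diff:

    from optimum.rbln import RBLNGemma2ForCausalLM, RBLNGemma2ForCausalLMConfig

    # Placeholder checkpoint; tensor_parallel_size is a base RBLNModelConfig field.
    model = RBLNGemma2ForCausalLM.from_pretrained(
        "google/gemma-2-2b-it",
        export=True,
        rbln_config=RBLNGemma2ForCausalLMConfig(tensor_parallel_size=4),
    )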
optimum/rbln/__version__.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID
 
- __version__ = version = '0.9.4a2'
- __version_tuple__ = version_tuple = (0, 9, 4, 'a2')
+ __version__ = version = '0.9.5a4'
+ __version_tuple__ = version_tuple = (0, 9, 5, 'a4')
 
  __commit_id__ = commit_id = None
optimum/rbln/configuration_utils.py CHANGED
@@ -24,7 +24,7 @@ import torch
  from packaging.version import Version
 
  from .__version__ import __version__
- from .utils.deprecation import warn_deprecated_npu
+ from .utils.deprecation import deprecate_kwarg, warn_deprecated_npu
  from .utils.logging import get_logger
  from .utils.runtime_utils import ContextRblnConfig
 
@@ -92,7 +92,7 @@ class RBLNCompileConfig:
  and isinstance(item[0], str) # name
  and isinstance(item[1], (tuple, list)) # shape
  and all(isinstance(x, int) for x in item[1])
- and isinstance(item[2], str) # dtype
+ and (isinstance(item[2], str) or isinstance(item[2], torch.dtype)) # dtype
  for item in input_info
  )
 
@@ -524,8 +524,8 @@ class RBLNModelConfig(RBLNSerializableConfigProtocol):
  non_save_attributes = [
  "_frozen",
  "_runtime_options",
- "torch_dtype",
  "npu",
+ "dtype",
  "tensor_parallel_size",
  "create_runtimes",
  "device",
@@ -650,6 +650,14 @@ class RBLNModelConfig(RBLNSerializableConfigProtocol):
 
  super().__setattr__(key, value)
 
+ @deprecate_kwarg(
+ old_name="_torch_dtype",
+ new_name="dtype",
+ version="0.12.0",
+ deprecated_type=torch.dtype,
+ value_replacer=RBLNCompileConfig.normalize_dtype,
+ raise_if_greater_or_equal_version=False,
+ )
  def __init__(
  self,
  cls_name: Optional[str] = None,
@@ -661,7 +669,7 @@ class RBLNModelConfig(RBLNSerializableConfigProtocol):
  tensor_parallel_size: Optional[int] = None,
  timeout: Optional[int] = None,
  optimum_rbln_version: Optional[str] = None,
- _torch_dtype: Optional[str] = None,
+ dtype: Optional[Union[str, torch.dtype]] = None,
  _compile_cfgs: Optional[List[RBLNCompileConfig]] = None,
  *,
  optimize_host_memory: Optional[bool] = None,
@@ -680,7 +688,7 @@ class RBLNModelConfig(RBLNSerializableConfigProtocol):
  tensor_parallel_size (Optional[int]): Size for tensor parallelism to distribute the model across devices.
  timeout (Optional[int]): The timeout for the runtime in seconds. If it isn't provided, it will be set to 60 by default.
  optimum_rbln_version (Optional[str]): The optimum-rbln version used for this configuration.
- _torch_dtype (Optional[str]): The data type to use for the model.
+ dtype (Optional[Union[str, torch.dtype]]): The data type to use for the model.
  _compile_cfgs (List[RBLNCompileConfig]): List of compilation configurations for the model.
  kwargs: Additional keyword arguments.
 
@@ -710,7 +718,9 @@ class RBLNModelConfig(RBLNSerializableConfigProtocol):
  self.npu = npu
  self.tensor_parallel_size = tensor_parallel_size
 
- self._torch_dtype = _torch_dtype or "float32"
+ if dtype is not None and isinstance(dtype, torch.dtype):
+ dtype = RBLNCompileConfig.normalize_dtype(dtype)
+ self._dtype = dtype or "float32"
  self.optimum_rbln_version = optimum_rbln_version
  if self.optimum_rbln_version is None:
  self.optimum_rbln_version = __version__
@@ -743,14 +753,24 @@ class RBLNModelConfig(RBLNSerializableConfigProtocol):
 
  @property
  def torch_dtype(self):
- return getattr(torch, self._torch_dtype)
+ logger.warning_once("`torch_dtype` is deprecated. Use `dtype` instead.")
+ return self.dtype
 
  @torch_dtype.setter
  def torch_dtype(self, torch_dtype: Union[str, torch.dtype]):
- if isinstance(torch_dtype, torch.dtype):
- torch_dtype = RBLNCompileConfig.normalize_dtype(torch_dtype)
+ logger.warning_once("`torch_dtype` is deprecated. Use `dtype` instead.")
+ self.dtype = torch_dtype
 
- self._torch_dtype = torch_dtype
+ @property
+ def dtype(self):
+ return getattr(torch, self._dtype)
+
+ @dtype.setter
+ def dtype(self, dtype: Union[str, torch.dtype]):
+ if isinstance(dtype, torch.dtype):
+ dtype = RBLNCompileConfig.normalize_dtype(dtype)
+
+ self._dtype = dtype
 
  @property
  def rbln_model_cls_name(self) -> str:
@@ -774,10 +794,15 @@ class RBLNModelConfig(RBLNSerializableConfigProtocol):
  if isinstance(value, RBLNSerializableConfigProtocol):
  # Convert nested RBLNModelConfig to its serializable form
  serializable_map[key] = value._prepare_for_serialization()
+ elif key == "_dtype":
+ serializable_map["dtype"] = value
+ elif isinstance(value, list) and all(isinstance(item, RBLNSerializableConfigProtocol) for item in value):
+ serializable_map[key] = [item._prepare_for_serialization() for item in value]
  elif key == "_compile_cfgs":
  serializable_map[key] = [cfg.asdict() for cfg in value]
  else:
  serializable_map[key] = value
+
  return serializable_map
 
  def __repr__(self):
@@ -825,18 +850,12 @@ class RBLNModelConfig(RBLNSerializableConfigProtocol):
  if not isinstance(submodule_config, RBLNModelConfig):
  raise ValueError(f"`{submodule_name}` must be an instance of `RBLNModelConfig` before freezing.")
 
- if not submodule_config.is_frozen():
- raise ValueError(f"`{submodule_name}` config must be frozen before freezing super config.")
-
  self._frozen = True
 
  def is_frozen(self):
  return self._frozen
 
  def save(self, path: str):
- if not self._frozen:
- raise RuntimeError("`RBLNModelConfig` is not frozen. Please call `set_compile_cfgs` first.")
-
  # save as json file without runtime attributes
  path = Path(path)
  if path.is_dir():
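
The hunks above replace the private _torch_dtype constructor argument with a public dtype argument (accepting a string or a torch.dtype) and turn torch_dtype into a deprecated alias that warns and forwards to dtype. A small sketch of the resulting behaviour, assuming a concrete subclass such as RBLNQwen2ForCausalLMConfig forwards its keyword arguments to the base RBLNModelConfig:

    import torch
    from optimum.rbln import RBLNQwen2ForCausalLMConfig

    # torch.dtype values are normalized to their string names before being stored.
    cfg = RBLNQwen2ForCausalLMConfig(dtype=torch.bfloat16)
    print(cfg.dtype)        # torch.bfloat16
    print(cfg.torch_dtype)  # same value, but warns that torch_dtype is deprecated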
optimum/rbln/modeling_base.py CHANGED
@@ -90,7 +90,7 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
 
  self.device = torch.device("cpu")
  self.training = False
- self.dtype = rbln_config.torch_dtype
+ self.dtype = rbln_config.dtype
 
  # FIXME :: model_save_dir is not used after initialized. (This can be used when save/load)
  # This attribute is needed to keep one reference on the temporary directory, since garbage collecting it
@@ -223,8 +223,6 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
  elif rbln_submodules is None:
  rbln_submodules = []
 
- rbln_config.freeze()
-
  if config is None:
  if cls.hf_library_name == "transformers":
  config = AutoConfig.from_pretrained(
@@ -313,6 +311,8 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
  )
  raise rebel.core.exception.RBLNRuntimeError(error_msg) from e
 
+ rbln_config.freeze()
+
  return cls(
  models,
  config,
@@ -451,15 +451,15 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
  model_config: "PretrainedConfig",
  rbln_config: RBLNModelConfig,
  ) -> RBLNModelConfig:
- rbln_config.torch_dtype = model.dtype
- if not cls._supports_non_fp32 and rbln_config.torch_dtype != torch.float32:
+ rbln_config.dtype = model.dtype
+ if not cls._supports_non_fp32 and rbln_config.dtype != torch.float32:
  raise NotImplementedError(
  f"Currently, {cls.__name__} does not support non-fp32 dtype. Please use float32 dtype."
  )
  rbln_config = cls._update_rbln_config(
  preprocessors=preprocessors, model=model, model_config=model_config, rbln_config=rbln_config
  )
- rbln_config.freeze()
+
  if rbln_config.rbln_model_cls_name != cls.__name__:
  raise NameError(
  f"Cannot get the rbln config. {cls.__name__} is not the same as {rbln_config.rbln_model_cls_name}. "
optimum/rbln/ops/__init__.py CHANGED
@@ -16,4 +16,5 @@ from .attn import *
  from .flash_attn import *
  from .kv_cache_update import *
  from .linear import linear
+ from .moe import *
  from .sliding_window_attn import *
optimum/rbln/ops/attn.py CHANGED
@@ -205,6 +205,7 @@ def paged_causal_attn_decode(
  block_table: Tensor,
  block_size: int,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  """Defines the computation pattern for fused attention with KV cache updates.
 
@@ -228,6 +229,7 @@ def paged_causal_attn_decode(
  - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
  - block_size: [] - Number of tokens per block
  - mask: [batch=1, max_seq_len] - attention mask when use position_ids
+ - s_aux: [num_attention_heads, sink_len] - auxiliary states for attention
 
  Returns:
  Tensor: attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
@@ -247,6 +249,7 @@ def paged_causal_attn_decode_fake(
  block_table: Tensor,
  block_size: int,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
 
@@ -267,6 +270,7 @@ def paged_causal_attn_prefill(
  block_size: int,
  is_bidirectional: bool,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  """Defines the computation pattern for prefill phase attention with KV cache updates.
 
@@ -290,6 +294,7 @@ def paged_causal_attn_prefill(
  - block_size: [] - Number of tokens per block
  - is_bidirectional: [] - Whether the attention is bidirectional at current sequence position
  - mask: [batch=1, max_seq_len] - attention mask when use position_ids
+ - s_aux: [num_attention_heads, sink_len] - auxiliary states for attention
 
  Returns:
  Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
@@ -310,6 +315,7 @@ def paged_causal_attn_prefill_fake(
  block_size: int,
  is_bidirectional: bool,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
 
@@ -331,6 +337,7 @@ def paged_causal_attn_decode_kv_fp8(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
 
@@ -349,6 +356,7 @@ def paged_causal_attn_decode_kv_fp8_fake(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
 
@@ -371,6 +379,7 @@ def paged_causal_attn_prefill_kv_fp8(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
 
@@ -390,6 +399,7 @@ def paged_causal_attn_prefill_kv_fp8_fake(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
 
optimum/rbln/ops/flash_attn.py CHANGED
@@ -198,6 +198,7 @@ def paged_flash_causal_attn_decode(
  block_size: int,
  partition: int,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  """Defines the computation pattern for fused causal flash attention with KV cache for decoding.
 
@@ -219,6 +220,7 @@ def paged_flash_causal_attn_decode_fake(
  block_size: int,
  partition: int,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
 
@@ -241,6 +243,7 @@ def paged_flash_causal_attn_decode_kv_fp8(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
 
@@ -260,6 +263,7 @@ def paged_flash_causal_attn_decode_kv_fp8_fake(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
 
@@ -281,6 +285,7 @@ def paged_flash_causal_attn_prefill(
  partition: int,
  is_bidirectional: bool,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  """Defines the computation pattern for fused causal flash attention with KV cache for prefill.
 
@@ -303,6 +308,7 @@ def paged_flash_causal_attn_prefill_fake(
  partition: int,
  is_bidirectional: bool,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
 
@@ -326,6 +332,7 @@ def paged_flash_causal_attn_prefill_kv_fp8(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
 
@@ -346,5 +353,6 @@ def paged_flash_causal_attn_prefill_kv_fp8_fake(
  k_scale: Tensor,
  v_scale: Tensor,
  mask: Optional[Tensor] = None,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
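
Every paged attention op in attn.py and flash_attn.py (and, further below, sliding_window_attn.py) gains a trailing optional s_aux argument documented as [num_attention_heads, sink_len] "auxiliary states for attention", most likely the attention-sink values used by models such as GPT-OSS. A hedged sketch of the tensor these signatures now accept; the values are placeholders, and how a model fills them in is model-specific and not defined by this diff:

    import torch

    num_attention_heads, sink_len = 32, 1
    s_aux = torch.zeros(num_attention_heads, sink_len)

    # Passed as the new trailing keyword, for example:
    #   paged_causal_attn_decode(..., mask=mask, s_aux=s_aux)
    # Omitting it (or passing None) keeps the previous behaviour.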
optimum/rbln/ops/moe.py ADDED
@@ -0,0 +1,180 @@
+ # Copyright 2025 Rebellions Inc. All rights reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at:
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Optional
+
+ import torch
+ from torch import Tensor
+
+
+ @torch.library.custom_op(
+ "rbln_custom_ops::custom_moe_glu",
+ mutates_args=(),
+ )
+ def custom_moe_glu(
+ hidden_states: Tensor,
+ gate_proj_weight: Tensor,
+ up_proj_weight: Tensor,
+ down_proj_weight: Tensor,
+ router_logits: Tensor,
+ topk: int,
+ norm_topk_prob: bool,
+ gate_proj_bias: Optional[Tensor] = None,
+ up_proj_bias: Optional[Tensor] = None,
+ down_proj_bias: Optional[Tensor] = None,
+ ) -> Tensor:
+ """
+ Customized MoE GLU operation.
+
+ Expected tensor shapes:
+ - hidden_states: [batch*seq_len, hidden_size]
+ - gate_proj_weight: [num_experts, hidden_size, intermediate_size]
+ - up_proj_weight: [num_experts, hidden_size, intermediate_size]
+ - down_proj_weight: [num_experts, intermediate_size, hidden_size]
+ - router_logits: [batch*seq_len, num_experts]
+ - topk: top k experts to select
+ - norm_topk_prob: whether to normalize the top k routing weights with softmax
+ - gate_proj_bias: [num_experts, intermediate_size]
+ - up_proj_bias: [num_experts, intermediate_size]
+ - down_proj_bias: [num_experts, hidden_size]
+
+ Returns:
+ Tensor: [batch * seq_len, hidden_size]
+ """
+
+ return torch.empty_like(hidden_states)
+
+
+ @custom_moe_glu.register_fake
+ def custom_moe_glu_fake(
+ hidden_states: Tensor,
+ gate_proj_weight: Tensor,
+ up_proj_weight: Tensor,
+ down_proj_weight: Tensor,
+ router_logits: Tensor,
+ topk: int,
+ norm_topk_prob: bool,
+ gate_proj_bias: Optional[Tensor] = None,
+ up_proj_bias: Optional[Tensor] = None,
+ down_proj_bias: Optional[Tensor] = None,
+ ) -> Tensor:
+ return torch.empty_like(hidden_states)
+
+
+ @torch.library.custom_op(
+ "rbln_custom_ops::custom_moe_ff",
+ mutates_args=(),
+ )
+ def custom_moe_ff(
+ hidden_states: Tensor,
+ gate_proj_weight: Tensor,
+ down_proj_weight: Tensor,
+ masked_routing_weight: Tensor,
+ gate_proj_bias: Optional[Tensor] = None,
+ down_proj_bias: Optional[Tensor] = None,
+ ) -> Tensor:
+ """
+ Customized MoE FF operation.
+
+ Expected tensor shapes:
+ - hidden_states: [batch * seq_len, hidden_size]
+ - gate_proj_weight: [hidden_size, num_experts * intermediate_size]
+ - down_proj_weight: [num_experts * intermediate_size, hidden_size]
+ - masked_routing_weight: [batch * seq_len, num_experts]
+ - gate_proj_bias: [num_experts * intermediate_size]
+ - down_proj_bias: [hidden_size]
+
+ Returns:
+ Tensor: [batch * seq_len, hidden_size]
+ """
+ return torch.empty_like(hidden_states)
+
+
+ @custom_moe_ff.register_fake
+ def custom_moe_ff_fake(
+ hidden_states: Tensor,
+ gate_proj_weight: Tensor,
+ down_proj_weight: Tensor,
+ masked_routing_weight: Tensor,
+ gate_proj_bias: Optional[Tensor] = None,
+ down_proj_bias: Optional[Tensor] = None,
+ ) -> Tensor:
+ return torch.empty_like(hidden_states)
+
+
+ @torch.library.custom_op(
+ "rbln_custom_ops::custom_moe_glu_mxfp4",
+ mutates_args=(),
+ )
+ def custom_moe_glu_mxfp4(
+ hidden_states: Tensor,
+ gate_proj_blocks: Tensor,
+ gate_proj_scales: Tensor,
+ gate_proj_bias: Tensor,
+ up_proj_blocks: Tensor,
+ up_proj_scales: Tensor,
+ up_proj_bias: Tensor,
+ down_proj_blocks: Tensor,
+ down_proj_scales: Tensor,
+ down_proj_bias: Tensor,
+ router_logits: Tensor,
+ alpha: Tensor,
+ limit: Tensor,
+ k: int,
+ post_norm: bool,
+ ) -> Tensor:
+ """
+ Customized MoE GLU operation.
+
+ Expected tensor shapes:
+ - hidden_states: [batch*seq_len, hidden_size]
+ - gate_proj_blocks: [num_experts, intermediate_size, hidden_size // 2]
+ - gate_proj_scales: [num_experts, intermediate_size, hidden_size // 32]
+ - gate_proj_bias: [num_experts, intermediate_size]
+ - up_proj_blocks: [num_experts, intermediate_size, hidden_size // 2]
+ - up_proj_scales: [num_experts, intermediate_size, hidden_size // 32]
+ - up_proj_bias: [num_experts, intermediate_size]
+ - down_proj_blocks: [num_experts, hidden_size, intermediate_size // 2]
+ - down_proj_scales: [num_experts, hidden_size, intermediate_size // 32]
+ - masked_routing_weight: [batch * seq_len, num_experts]
+ - expert_select_count: [num_experts]
+ - alpha: []
+ - limit: []
+
+ Returns:
+ Tensor: [batch * seq_len, hidden_size]
+ """
+
+ return torch.empty_like(hidden_states)
+
+
+ @custom_moe_glu_mxfp4.register_fake
+ def custom_moe_glu_mxfp4_fake(
+ hidden_states: Tensor,
+ gate_proj_blocks: Tensor,
+ gate_proj_scales: Tensor,
+ gate_proj_bias: Tensor,
+ up_proj_blocks: Tensor,
+ up_proj_scales: Tensor,
+ up_proj_bias: Tensor,
+ down_proj_blocks: Tensor,
+ down_proj_scales: Tensor,
+ down_proj_bias: Tensor,
+ router_logits: Tensor,
+ alpha: Tensor,
+ limit: Tensor,
+ k: int,
+ post_norm: bool,
+ ) -> Tensor:
+ return torch.empty_like(hidden_states)
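
The new module registers three custom ops (custom_moe_glu, custom_moe_ff, custom_moe_glu_mxfp4) under the rbln_custom_ops namespace. Their eager implementations are shape stubs that return an uninitialized tensor shaped like hidden_states, so outside the RBLN compiler they are mainly useful for tracing and shape checks. A minimal sketch that calls custom_moe_glu with tensors of the shapes documented above, assuming that importing optimum.rbln.ops registers the op as shown in the ops/__init__.py change:

    import torch
    import optimum.rbln.ops  # noqa: F401  (registers the rbln_custom_ops library)

    tokens, hidden, inter, experts, topk = 8, 64, 128, 4, 2
    hidden_states = torch.randn(tokens, hidden)
    gate_w = torch.randn(experts, hidden, inter)
    up_w = torch.randn(experts, hidden, inter)
    down_w = torch.randn(experts, inter, hidden)
    router_logits = torch.randn(tokens, experts)

    out = torch.ops.rbln_custom_ops.custom_moe_glu(
        hidden_states, gate_w, up_w, down_w, router_logits, topk, True
    )
    print(out.shape)  # torch.Size([8, 64]), same as hidden_states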
optimum/rbln/ops/sliding_window_attn.py CHANGED
@@ -13,6 +13,8 @@
  # limitations under the License.
 
 
+ from typing import Optional
+
  import torch
  from torch import Tensor
 
@@ -33,6 +35,7 @@ def paged_sliding_window_attn_prefill(
  block_table: Tensor,
  block_size: int,
  is_bidirectional: bool,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  """Defines the computation pattern for prefill phase attention with KV cache updates.
 
@@ -53,6 +56,7 @@ def paged_sliding_window_attn_prefill(
  - cache_offset: [] - The valid length in the combined sequence of the KV cache and the current projected key states.
  - scale: [] - Attention scale factor
  - is_bidirectional: [] - Whether the attention is bidirectional
+ - s_aux: [num_attention_heads, sink_len] - auxiliary states for attention
  Returns:
  Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
  """
@@ -72,6 +76,7 @@ def paged_sliding_window_attn_prefill_fake(
  block_table: Tensor,
  block_size: int,
  is_bidirectional: bool,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
 
@@ -91,6 +96,8 @@ def paged_sliding_window_attn_decode(
  scale: Tensor,
  block_table: Tensor,
  block_size: int,
+ attn_mask: Tensor,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)
 
@@ -107,5 +114,7 @@ def paged_sliding_window_attn_decode_fake(
  scale: Tensor,
  block_table: Tensor,
  block_size: int,
+ attn_mask: Tensor,
+ s_aux: Optional[Tensor] = None,
  ) -> Tensor:
  return torch.empty_like(q)