optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.5a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. optimum/rbln/__init__.py +48 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +50 -21
  4. optimum/rbln/diffusers/__init__.py +12 -0
  5. optimum/rbln/diffusers/configurations/__init__.py +3 -0
  6. optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
  7. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
  8. optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
  9. optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
  10. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
  11. optimum/rbln/diffusers/modeling_diffusers.py +1 -1
  12. optimum/rbln/diffusers/models/__init__.py +17 -3
  13. optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
  14. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
  15. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
  16. optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
  17. optimum/rbln/diffusers/models/controlnet.py +17 -2
  18. optimum/rbln/diffusers/models/transformers/prior_transformer.py +16 -2
  19. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +16 -1
  20. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +14 -1
  21. optimum/rbln/diffusers/models/unets/__init__.py +1 -0
  22. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +18 -2
  23. optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
  24. optimum/rbln/diffusers/pipelines/__init__.py +4 -0
  25. optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -2
  26. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
  27. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +13 -4
  28. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +13 -4
  29. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +13 -4
  30. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -4
  31. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +1 -1
  32. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
  33. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -2
  34. optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
  35. optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
  36. optimum/rbln/modeling.py +20 -45
  37. optimum/rbln/modeling_base.py +18 -14
  38. optimum/rbln/ops/__init__.py +1 -0
  39. optimum/rbln/ops/attn.py +10 -0
  40. optimum/rbln/ops/flash_attn.py +8 -0
  41. optimum/rbln/ops/moe.py +180 -0
  42. optimum/rbln/ops/sliding_window_attn.py +9 -0
  43. optimum/rbln/transformers/__init__.py +36 -0
  44. optimum/rbln/transformers/configuration_generic.py +0 -27
  45. optimum/rbln/transformers/modeling_attention_utils.py +156 -127
  46. optimum/rbln/transformers/modeling_generic.py +2 -61
  47. optimum/rbln/transformers/modeling_outputs.py +26 -0
  48. optimum/rbln/transformers/modeling_rope_utils.py +78 -42
  49. optimum/rbln/transformers/models/__init__.py +28 -0
  50. optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
  51. optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
  52. optimum/rbln/transformers/models/auto/auto_factory.py +1 -0
  53. optimum/rbln/transformers/models/bart/bart_architecture.py +24 -24
  54. optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
  55. optimum/rbln/transformers/models/bert/modeling_bert.py +86 -1
  56. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +42 -15
  57. optimum/rbln/transformers/models/clip/modeling_clip.py +40 -2
  58. optimum/rbln/transformers/models/colpali/colpali_architecture.py +14 -20
  59. optimum/rbln/transformers/models/colpali/configuration_colpali.py +12 -17
  60. optimum/rbln/transformers/models/colpali/modeling_colpali.py +66 -221
  61. optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +38 -23
  62. optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +107 -371
  63. optimum/rbln/transformers/models/decoderonly/__init__.py +2 -0
  64. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +128 -17
  65. optimum/rbln/transformers/models/decoderonly/configuration_lora.py +2 -2
  66. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +211 -89
  67. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +205 -64
  68. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +17 -9
  69. optimum/rbln/transformers/models/decoderonly/lora_architecture.py +1 -1
  70. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +194 -132
  71. optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +17 -0
  72. optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
  73. optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
  74. optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -36
  75. optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
  76. optimum/rbln/transformers/models/gemma2/__init__.py +16 -0
  77. optimum/rbln/transformers/models/gemma2/configuration_gemma2.py +45 -0
  78. optimum/rbln/transformers/models/gemma2/gemma2_architecture.py +83 -0
  79. optimum/rbln/transformers/models/gemma2/modeling_gemma2.py +101 -0
  80. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +23 -19
  81. optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +42 -70
  82. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +46 -31
  83. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +8 -34
  84. optimum/rbln/transformers/models/gpt_oss/__init__.py +16 -0
  85. optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +41 -0
  86. optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py +122 -0
  87. optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +165 -0
  88. optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +8 -5
  89. optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +7 -5
  90. optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +24 -9
  91. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -5
  92. optimum/rbln/transformers/models/llava/modeling_llava.py +37 -26
  93. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +3 -5
  94. optimum/rbln/transformers/models/midm/midm_architecture.py +29 -22
  95. optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -22
  96. optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
  97. optimum/rbln/transformers/models/opt/opt_architecture.py +1 -44
  98. optimum/rbln/transformers/models/paligemma/__init__.py +16 -0
  99. optimum/rbln/transformers/models/paligemma/configuration_paligemma.py +129 -0
  100. optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +564 -0
  101. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
  102. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +24 -24
  103. optimum/rbln/transformers/models/phi/phi_architecture.py +13 -21
  104. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +13 -1
  105. optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +2 -2
  106. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -28
  107. optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +6 -1
  108. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +11 -1
  109. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +278 -130
  110. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +43 -39
  111. optimum/rbln/transformers/models/qwen2_moe/__init__.py +16 -0
  112. optimum/rbln/transformers/models/qwen2_moe/configuration_qwen2_moe.py +38 -0
  113. optimum/rbln/transformers/models/qwen2_moe/modeling_qwen2_moe.py +68 -0
  114. optimum/rbln/transformers/models/qwen2_moe/qwen2_moe_architecture.py +94 -0
  115. optimum/rbln/transformers/models/qwen2_vl/__init__.py +6 -1
  116. optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +11 -1
  117. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +268 -111
  118. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +27 -35
  119. optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +0 -20
  120. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +7 -7
  121. optimum/rbln/transformers/models/qwen3_moe/__init__.py +16 -0
  122. optimum/rbln/transformers/models/qwen3_moe/configuration_qwen3_moe.py +38 -0
  123. optimum/rbln/transformers/models/qwen3_moe/modeling_qwen3_moe.py +68 -0
  124. optimum/rbln/transformers/models/qwen3_moe/qwen3_moe_architecture.py +100 -0
  125. optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
  126. optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
  127. optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
  128. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -4
  129. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +36 -12
  130. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +14 -12
  131. optimum/rbln/transformers/models/siglip/modeling_siglip.py +21 -19
  132. optimum/rbln/transformers/models/swin/configuration_swin.py +1 -6
  133. optimum/rbln/transformers/models/swin/modeling_swin.py +17 -4
  134. optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
  135. optimum/rbln/transformers/models/t5/t5_architecture.py +16 -17
  136. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +25 -10
  137. optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -3
  138. optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
  139. optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
  140. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +60 -8
  141. optimum/rbln/transformers/models/whisper/generation_whisper.py +48 -14
  142. optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
  143. optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -3
  144. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +53 -0
  145. optimum/rbln/transformers/utils/rbln_quantization.py +29 -12
  146. optimum/rbln/utils/deprecation.py +213 -0
  147. optimum/rbln/utils/hub.py +14 -3
  148. optimum/rbln/utils/import_utils.py +23 -2
  149. optimum/rbln/utils/runtime_utils.py +42 -6
  150. optimum/rbln/utils/submodule.py +27 -1
  151. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/METADATA +6 -6
  152. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/RECORD +155 -129
  153. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/WHEEL +1 -1
  154. optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +0 -233
  155. optimum/rbln/utils/depreacate_utils.py +0 -16
  156. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/entry_points.txt +0 -0
  157. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/modeling.py CHANGED
@@ -34,49 +34,6 @@ if TYPE_CHECKING:
 logger = get_logger(__name__)
 
 
-def _get_dtype(
-    cls,
-    dtype: Optional[Union[str, torch.dtype, dict]],
-    config: PretrainedConfig,
-) -> tuple[PretrainedConfig, Optional[torch.dtype], Optional[torch.dtype]]:
-    dtype_orig = None
-
-    if dtype is not None:
-        if isinstance(dtype, str):
-            if dtype == "auto":
-                if hasattr(config, "dtype") and config.dtype is not None:
-                    dtype = config.dtype
-                else:
-                    dtype = torch.get_default_dtype()
-            elif hasattr(torch, dtype):
-                dtype = getattr(torch, dtype)
-            config.dtype = dtype
-        elif isinstance(dtype, torch.dtype):
-            config.dtype = dtype
-        elif isinstance(dtype, dict):
-            for key, curr_dtype in dtype.items():
-                if hasattr(config, key):
-                    value = getattr(config, key)
-                    curr_dtype = curr_dtype if not isinstance(curr_dtype, str) else getattr(torch, curr_dtype)
-                    value.dtype = curr_dtype
-            # main torch dtype for modules that aren't part of any sub-config
-            dtype = dtype.get("")
-            dtype = dtype if not isinstance(dtype, str) else getattr(torch, dtype)
-            config.dtype = dtype
-            if dtype is None:
-                dtype = torch.float32
-        else:
-            raise ValueError(f"Invalid dtype: {dtype}")
-
-        dtype_orig = cls._set_default_dtype(dtype)
-    else:
-        # Use default dtype
-        default_dtype = torch.get_default_dtype()
-        config.dtype = default_dtype
-
-    return config, dtype, dtype_orig
-
-
 class RBLNModel(RBLNBaseModel):
     @classmethod
     def update_kwargs(cls, kwargs):
@@ -97,13 +54,16 @@ class RBLNModel(RBLNBaseModel):
         pass
 
     @classmethod
-    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
+    def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
        # Wrap the model if needed.
        return model
 
     @classmethod
     def get_compiled_model(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
-        model = cls.wrap_model_if_needed(model, rbln_config)
+        if rbln_config._allow_no_compile_cfgs:
+            return {}
+
+        model = cls._wrap_model_if_needed(model, rbln_config)
         rbln_compile_config = rbln_config.compile_cfgs[0]
         compiled_model = cls.compile(
             model,
@@ -113,6 +73,18 @@
         )
         return compiled_model
 
+    @classmethod
+    def _update_rbln_config(
+        cls,
+        preprocessors: Optional[Any],
+        model: Optional["PreTrainedModel"] = None,
+        model_config: Optional["PretrainedConfig"] = None,
+        rbln_config: Optional[RBLNModelConfig] = None,
+    ) -> RBLNModelConfig:
+        # Default implementation: return config as-is
+        # Subclasses should override to set compile_cfgs if needed
+        return rbln_config
+
     @classmethod
     def _reconstruct_model_if_needed(cls, model: "PreTrainedModel"):
         return model
@@ -277,6 +249,9 @@
         compiled_models: List[rebel.RBLNCompiledModel],
         rbln_config: RBLNModelConfig,
     ) -> List[rebel.Runtime]:
+        if len(rbln_config.compile_cfgs) == 0:
+            return []
+
         if DEFAULT_COMPILED_MODEL_NAME not in rbln_config.device_map:
             cls._raise_missing_compiled_file_error([DEFAULT_COMPILED_MODEL_NAME])
 
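Note: with the refactor above, get_compiled_model can skip compilation entirely when _allow_no_compile_cfgs is set, and _update_rbln_config becomes the documented override point where a subclass attaches its compile configs. A minimal subclass sketch follows; RBLNCompileConfig lives in optimum/rbln/configuration_utils.py (also touched in this release), but the set_compile_cfgs call and the input_info layout shown here are illustrative assumptions, not taken from this diff.

    # Hypothetical subclass; names marked below are assumptions for illustration.
    from optimum.rbln.configuration_utils import RBLNCompileConfig  # assumed import

    class RBLNMyEncoder(RBLNModel):
        @classmethod
        def _update_rbln_config(cls, preprocessors, model=None, model_config=None, rbln_config=None):
            # Attach one static-shape compile config; if compile_cfgs stays empty,
            # _create_runtimes (above) now returns no runtimes instead of failing.
            input_info = [("input_ids", [1, model_config.max_position_embeddings], "int64")]
            rbln_config.set_compile_cfgs([RBLNCompileConfig(input_info=input_info)])  # assumed setter
            return rbln_config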
optimum/rbln/modeling_base.py CHANGED
@@ -15,7 +15,6 @@
 import importlib
 import os
 import shutil
-from abc import ABC
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
@@ -39,7 +38,7 @@ if TYPE_CHECKING:
 logger = get_logger(__name__)
 
 
-class PreTrainedModel(ABC):  # noqa: F811
+class PreTrainedModel:  # noqa: F811
     pass
 
 
@@ -63,7 +62,7 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
         subfolder: str = "",
         rbln_compiled_models: Optional[rebel.RBLNCompiledModel] = None,
-        rbln_submodules: List["RBLNBaseModel"] = [],
+        rbln_submodules: Optional[List["RBLNBaseModel"]] = None,
         **kwargs,
     ):
         self.model = models
@@ -71,7 +70,6 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
         self.rbln_config = rbln_config
         if not rbln_config.is_frozen():
             raise RuntimeError("`rbln_config` must be frozen. Please call `rbln_config.freeze()` first.")
-
         self.compiled_models = rbln_compiled_models
 
         # Registers the RBLN classes into the transformers AutoModel classes to avoid warnings when creating
@@ -92,7 +90,7 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
 
         self.device = torch.device("cpu")
         self.training = False
-        self.dtype = rbln_config.torch_dtype
+        self.dtype = rbln_config.dtype
 
         # FIXME :: model_save_dir is not used after initialized. (This can be used when save/load)
         # This attribute is needed to keep one reference on the temporary directory, since garbage collecting it
@@ -107,6 +105,8 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
         self.model_save_dir = model_save_dir
         self.subfolder = subfolder
 
+        if rbln_submodules is None:
+            rbln_submodules = []
         self.rbln_submodules = rbln_submodules
         self.__post_init__(**kwargs)
 
@@ -182,7 +182,7 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
         # passed from compile function
         rbln_config: Optional[RBLNModelConfig] = None,
         rbln_compiled_models: Optional[Dict[str, rebel.RBLNCompiledModel]] = None,
-        rbln_submodules: List["RBLNBaseModel"] = [],
+        rbln_submodules: Optional[List["RBLNBaseModel"]] = None,
         **kwargs,
     ) -> "RBLNBaseModel":
         if rbln_compiled_models is None:
@@ -218,12 +218,11 @@
         )
 
         if len(cls._rbln_submodules) > 0:
-            rbln_submodules = cls._load_submodules(model_save_dir=model_id, rbln_config=rbln_config, **kwargs)
-        else:
+            if rbln_submodules is None:
+                rbln_submodules = cls._load_submodules(model_save_dir=model_id, rbln_config=rbln_config, **kwargs)
+        elif rbln_submodules is None:
             rbln_submodules = []
 
-        rbln_config.freeze()
-
         if config is None:
             if cls.hf_library_name == "transformers":
                 config = AutoConfig.from_pretrained(
@@ -280,9 +279,12 @@
         config: "PretrainedConfig",
         model_save_dir: Union[Path, str],
         subfolder: Union[Path, str],
-        rbln_submodules: List["RBLNBaseModel"] = [],
+        rbln_submodules: Optional[List["RBLNBaseModel"]] = None,
         **kwargs,
     ):
+        if rbln_submodules is None:
+            rbln_submodules = []
+
         if isinstance(model_save_dir, str):
             model_save_dir = Path(model_save_dir)
 
@@ -309,6 +311,8 @@
             )
             raise rebel.core.exception.RBLNRuntimeError(error_msg) from e
 
+        rbln_config.freeze()
+
         return cls(
             models,
             config,
@@ -447,15 +451,15 @@
         model_config: "PretrainedConfig",
         rbln_config: RBLNModelConfig,
     ) -> RBLNModelConfig:
-        rbln_config.torch_dtype = model.dtype
-        if not cls._supports_non_fp32 and rbln_config.torch_dtype != torch.float32:
+        rbln_config.dtype = model.dtype
+        if not cls._supports_non_fp32 and rbln_config.dtype != torch.float32:
             raise NotImplementedError(
                 f"Currently, {cls.__name__} does not support non-fp32 dtype. Please use float32 dtype."
             )
         rbln_config = cls._update_rbln_config(
             preprocessors=preprocessors, model=model, model_config=model_config, rbln_config=rbln_config
         )
-        rbln_config.freeze()
+
         if rbln_config.rbln_model_cls_name != cls.__name__:
             raise NameError(
                 f"Cannot get the rbln config. {cls.__name__} is not the same as {rbln_config.rbln_model_cls_name}. "
optimum/rbln/ops/__init__.py CHANGED
@@ -16,4 +16,5 @@ from .attn import *
 from .flash_attn import *
 from .kv_cache_update import *
 from .linear import linear
+from .moe import *
 from .sliding_window_attn import *
optimum/rbln/ops/attn.py CHANGED
@@ -205,6 +205,7 @@ def paged_causal_attn_decode(
     block_table: Tensor,
     block_size: int,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     """Defines the computation pattern for fused attention with KV cache updates.
 
@@ -228,6 +229,7 @@
         - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
         - block_size: [] - Number of tokens per block
         - mask: [batch=1, max_seq_len] - attention mask when use position_ids
+        - s_aux: [num_attention_heads, sink_len] - auxiliary states for attention
 
     Returns:
         Tensor: attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
@@ -247,6 +249,7 @@ def paged_causal_attn_decode_fake(
     block_table: Tensor,
     block_size: int,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
 
@@ -267,6 +270,7 @@ def paged_causal_attn_prefill(
     block_size: int,
     is_bidirectional: bool,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     """Defines the computation pattern for prefill phase attention with KV cache updates.
 
@@ -290,6 +294,7 @@
         - block_size: [] - Number of tokens per block
         - is_bidirectional: [] - Whether the attention is bidirectional at current sequence position
        - mask: [batch=1, max_seq_len] - attention mask when use position_ids
+        - s_aux: [num_attention_heads, sink_len] - auxiliary states for attention
 
     Returns:
         Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
@@ -310,6 +315,7 @@ def paged_causal_attn_prefill_fake(
     block_size: int,
     is_bidirectional: bool,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
 
@@ -331,6 +337,7 @@ def paged_causal_attn_decode_kv_fp8(
     k_scale: Tensor,
     v_scale: Tensor,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
 
@@ -349,6 +356,7 @@ def paged_causal_attn_decode_kv_fp8_fake(
     k_scale: Tensor,
     v_scale: Tensor,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
 
@@ -371,6 +379,7 @@ def paged_causal_attn_prefill_kv_fp8(
     k_scale: Tensor,
     v_scale: Tensor,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
 
@@ -390,6 +399,7 @@ def paged_causal_attn_prefill_kv_fp8_fake(
     k_scale: Tensor,
     v_scale: Tensor,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
 
optimum/rbln/ops/flash_attn.py CHANGED
@@ -198,6 +198,7 @@ def paged_flash_causal_attn_decode(
     block_size: int,
     partition: int,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     """Defines the computation pattern for fused causal flash attention with KV cache for decoding.
 
@@ -219,6 +220,7 @@ def paged_flash_causal_attn_decode_fake(
     block_size: int,
     partition: int,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
 
@@ -241,6 +243,7 @@ def paged_flash_causal_attn_decode_kv_fp8(
     k_scale: Tensor,
     v_scale: Tensor,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
 
@@ -260,6 +263,7 @@ def paged_flash_causal_attn_decode_kv_fp8_fake(
     k_scale: Tensor,
     v_scale: Tensor,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
 
@@ -281,6 +285,7 @@ def paged_flash_causal_attn_prefill(
     partition: int,
     is_bidirectional: bool,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     """Defines the computation pattern for fused causal flash attention with KV cache for prefill.
 
@@ -303,6 +308,7 @@ def paged_flash_causal_attn_prefill_fake(
     partition: int,
     is_bidirectional: bool,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
 
@@ -326,6 +332,7 @@ def paged_flash_causal_attn_prefill_kv_fp8(
     k_scale: Tensor,
     v_scale: Tensor,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
 
@@ -346,5 +353,6 @@ def paged_flash_causal_attn_prefill_kv_fp8_fake(
     k_scale: Tensor,
     v_scale: Tensor,
     mask: Optional[Tensor] = None,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
optimum/rbln/ops/moe.py ADDED
@@ -0,0 +1,180 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+from torch import Tensor
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::custom_moe_glu",
+    mutates_args=(),
+)
+def custom_moe_glu(
+    hidden_states: Tensor,
+    gate_proj_weight: Tensor,
+    up_proj_weight: Tensor,
+    down_proj_weight: Tensor,
+    router_logits: Tensor,
+    topk: int,
+    norm_topk_prob: bool,
+    gate_proj_bias: Optional[Tensor] = None,
+    up_proj_bias: Optional[Tensor] = None,
+    down_proj_bias: Optional[Tensor] = None,
+) -> Tensor:
+    """
+    Customized MoE GLU operation.
+
+    Expected tensor shapes:
+        - hidden_states: [batch*seq_len, hidden_size]
+        - gate_proj_weight: [num_experts, hidden_size, intermediate_size]
+        - up_proj_weight: [num_experts, hidden_size, intermediate_size]
+        - down_proj_weight: [num_experts, intermediate_size, hidden_size]
+        - router_logits: [batch*seq_len, num_experts]
+        - topk: top k experts to select
+        - norm_topk_prob: whether to normalize the top k routing weights with softmax
+        - gate_proj_bias: [num_experts, intermediate_size]
+        - up_proj_bias: [num_experts, intermediate_size]
+        - down_proj_bias: [num_experts, hidden_size]
+
+    Returns:
+        Tensor: [batch * seq_len, hidden_size]
+    """
+
+    return torch.empty_like(hidden_states)
+
+
+@custom_moe_glu.register_fake
+def custom_moe_glu_fake(
+    hidden_states: Tensor,
+    gate_proj_weight: Tensor,
+    up_proj_weight: Tensor,
+    down_proj_weight: Tensor,
+    router_logits: Tensor,
+    topk: int,
+    norm_topk_prob: bool,
+    gate_proj_bias: Optional[Tensor] = None,
+    up_proj_bias: Optional[Tensor] = None,
+    down_proj_bias: Optional[Tensor] = None,
+) -> Tensor:
+    return torch.empty_like(hidden_states)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::custom_moe_ff",
+    mutates_args=(),
+)
+def custom_moe_ff(
+    hidden_states: Tensor,
+    gate_proj_weight: Tensor,
+    down_proj_weight: Tensor,
+    masked_routing_weight: Tensor,
+    gate_proj_bias: Optional[Tensor] = None,
+    down_proj_bias: Optional[Tensor] = None,
+) -> Tensor:
+    """
+    Customized MoE FF operation.
+
+    Expected tensor shapes:
+        - hidden_states: [batch * seq_len, hidden_size]
+        - gate_proj_weight: [hidden_size, num_experts * intermediate_size]
+        - down_proj_weight: [num_experts * intermediate_size, hidden_size]
+        - masked_routing_weight: [batch * seq_len, num_experts]
+        - gate_proj_bias: [num_experts * intermediate_size]
+        - down_proj_bias: [hidden_size]
+
+    Returns:
+        Tensor: [batch * seq_len, hidden_size]
+    """
+    return torch.empty_like(hidden_states)
+
+
+@custom_moe_ff.register_fake
+def custom_moe_ff_fake(
+    hidden_states: Tensor,
+    gate_proj_weight: Tensor,
+    down_proj_weight: Tensor,
+    masked_routing_weight: Tensor,
+    gate_proj_bias: Optional[Tensor] = None,
+    down_proj_bias: Optional[Tensor] = None,
+) -> Tensor:
+    return torch.empty_like(hidden_states)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::custom_moe_glu_mxfp4",
+    mutates_args=(),
+)
+def custom_moe_glu_mxfp4(
+    hidden_states: Tensor,
+    gate_proj_blocks: Tensor,
+    gate_proj_scales: Tensor,
+    gate_proj_bias: Tensor,
+    up_proj_blocks: Tensor,
+    up_proj_scales: Tensor,
+    up_proj_bias: Tensor,
+    down_proj_blocks: Tensor,
+    down_proj_scales: Tensor,
+    down_proj_bias: Tensor,
+    router_logits: Tensor,
+    alpha: Tensor,
+    limit: Tensor,
+    k: int,
+    post_norm: bool,
+) -> Tensor:
+    """
+    Customized MoE GLU operation.
+
+    Expected tensor shapes:
+        - hidden_states: [batch*seq_len, hidden_size]
+        - gate_proj_blocks: [num_experts, intermediate_size, hidden_size // 2]
+        - gate_proj_scales: [num_experts, intermediate_size, hidden_size // 32]
+        - gate_proj_bias: [num_experts, intermediate_size]
+        - up_proj_blocks: [num_experts, intermediate_size, hidden_size // 2]
+        - up_proj_scales: [num_experts, intermediate_size, hidden_size // 32]
+        - up_proj_bias: [num_experts, intermediate_size]
+        - down_proj_blocks: [num_experts, hidden_size, intermediate_size // 2]
+        - down_proj_scales: [num_experts, hidden_size, intermediate_size // 32]
+        - masked_routing_weight: [batch * seq_len, num_experts]
+        - expert_select_count: [num_experts]
+        - alpha: []
+        - limit: []
+
+    Returns:
+        Tensor: [batch * seq_len, hidden_size]
+    """
+
+    return torch.empty_like(hidden_states)
+
+
+@custom_moe_glu_mxfp4.register_fake
+def custom_moe_glu_mxfp4_fake(
+    hidden_states: Tensor,
+    gate_proj_blocks: Tensor,
+    gate_proj_scales: Tensor,
+    gate_proj_bias: Tensor,
+    up_proj_blocks: Tensor,
+    up_proj_scales: Tensor,
+    up_proj_bias: Tensor,
+    down_proj_blocks: Tensor,
+    down_proj_scales: Tensor,
+    down_proj_bias: Tensor,
+    router_logits: Tensor,
+    alpha: Tensor,
+    limit: Tensor,
+    k: int,
+    post_norm: bool,
+) -> Tensor:
+    return torch.empty_like(hidden_states)
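Note: the eager bodies above return torch.empty_like(hidden_states) on purpose; the real computation is lowered to an RBLN kernel at compile time, so only shapes are observable on CPU. A quick shape smoke test (importing optimum.rbln.ops triggers registration via the from .moe import * added above):

    import torch

    import optimum.rbln.ops  # noqa: F401 -- registers rbln_custom_ops::*

    T, H, E, I = 8, 64, 4, 128  # tokens, hidden_size, num_experts, intermediate_size
    out = torch.ops.rbln_custom_ops.custom_moe_glu(
        torch.randn(T, H),     # hidden_states
        torch.randn(E, H, I),  # gate_proj_weight
        torch.randn(E, H, I),  # up_proj_weight
        torch.randn(E, I, H),  # down_proj_weight
        torch.randn(T, E),     # router_logits
        2,                     # topk
        True,                  # norm_topk_prob
    )
    assert out.shape == (T, H)  # stub output; values come from the device kernel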
optimum/rbln/ops/sliding_window_attn.py CHANGED
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 
+from typing import Optional
+
 import torch
 from torch import Tensor
 
@@ -33,6 +35,7 @@ def paged_sliding_window_attn_prefill(
     block_table: Tensor,
     block_size: int,
     is_bidirectional: bool,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     """Defines the computation pattern for prefill phase attention with KV cache updates.
 
@@ -53,6 +56,7 @@
         - cache_offset: [] - The valid length in the combined sequence of the KV cache and the current projected key states.
         - scale: [] - Attention scale factor
         - is_bidirectional: [] - Whether the attention is bidirectional
+        - s_aux: [num_attention_heads, sink_len] - auxiliary states for attention
     Returns:
         Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
     """
@@ -72,6 +76,7 @@ def paged_sliding_window_attn_prefill_fake(
     block_table: Tensor,
     block_size: int,
     is_bidirectional: bool,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
 
@@ -91,6 +96,8 @@ def paged_sliding_window_attn_decode(
     scale: Tensor,
     block_table: Tensor,
     block_size: int,
+    attn_mask: Tensor,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
 
@@ -107,5 +114,7 @@ def paged_sliding_window_attn_decode_fake(
     scale: Tensor,
     block_table: Tensor,
     block_size: int,
+    attn_mask: Tensor,
+    s_aux: Optional[Tensor] = None,
 ) -> Tensor:
     return torch.empty_like(q)
optimum/rbln/transformers/__init__.py CHANGED
@@ -78,6 +78,10 @@ _import_structure = {
         "RBLNExaoneForCausalLMConfig",
         "RBLNGemmaModel",
         "RBLNGemmaModelConfig",
+        "RBLNGemma2ForCausalLM",
+        "RBLNGemma2ForCausalLMConfig",
+        "RBLNGemma2Model",
+        "RBLNGemma2ModelConfig",
         "RBLNGemma3ForCausalLM",
         "RBLNGemma3ForCausalLMConfig",
         "RBLNGemma3ForConditionalGeneration",
@@ -88,6 +92,8 @@ _import_structure = {
         "RBLNGPT2LMHeadModelConfig",
         "RBLNGPT2Model",
         "RBLNGPT2ModelConfig",
+        "RBLNGptOssForCausalLM",
+        "RBLNGptOssForCausalLMConfig",
         "RBLNGroundingDinoDecoder",
         "RBLNGroundingDinoDecoderConfig",
         "RBLNGroundingDinoForObjectDetection",
@@ -110,6 +116,10 @@ _import_structure = {
         "RBLNPegasusForConditionalGenerationConfig",
         "RBLNPegasusModel",
         "RBLNPegasusModelConfig",
+        "RBLNPaliGemmaForConditionalGeneration",
+        "RBLNPaliGemmaForConditionalGenerationConfig",
+        "RBLNPaliGemmaModel",
+        "RBLNPaliGemmaModelConfig",
         "RBLNLlavaNextForConditionalGeneration",
         "RBLNLlavaNextForConditionalGenerationConfig",
         "RBLNLoRAAdapterConfig",
@@ -134,14 +144,22 @@ _import_structure = {
         "RBLNQwen2_5_VisionTransformerPretrainedModelConfig",
         "RBLNQwen2_5_VLForConditionalGeneration",
         "RBLNQwen2_5_VLForConditionalGenerationConfig",
+        "RBLNQwen2_5_VLModel",
+        "RBLNQwen2_5_VLModelConfig",
         "RBLNQwen2VisionTransformerPretrainedModel",
         "RBLNQwen2VisionTransformerPretrainedModelConfig",
         "RBLNQwen2VLForConditionalGeneration",
         "RBLNQwen2VLForConditionalGenerationConfig",
+        "RBLNQwen2VLModel",
+        "RBLNQwen2VLModelConfig",
         "RBLNQwen2Model",
         "RBLNQwen2ModelConfig",
         "RBLNQwen2ForCausalLM",
         "RBLNQwen2ForCausalLMConfig",
+        "RBLNQwen2MoeForCausalLM",
+        "RBLNQwen2MoeForCausalLMConfig",
+        "RBLNQwen3MoeForCausalLM",
+        "RBLNQwen3MoeForCausalLMConfig",
         "RBLNQwen3ForCausalLM",
         "RBLNQwen3ForCausalLMConfig",
         "RBLNQwen3Model",
@@ -234,6 +252,10 @@ if TYPE_CHECKING:
         RBLNDPTForDepthEstimationConfig,
         RBLNExaoneForCausalLM,
         RBLNExaoneForCausalLMConfig,
+        RBLNGemma2ForCausalLM,
+        RBLNGemma2ForCausalLMConfig,
+        RBLNGemma2Model,
+        RBLNGemma2ModelConfig,
         RBLNGemma3ForCausalLM,
         RBLNGemma3ForCausalLMConfig,
         RBLNGemma3ForConditionalGeneration,
@@ -246,6 +268,8 @@ if TYPE_CHECKING:
         RBLNGPT2LMHeadModelConfig,
         RBLNGPT2Model,
         RBLNGPT2ModelConfig,
+        RBLNGptOssForCausalLM,
+        RBLNGptOssForCausalLMConfig,
         RBLNGroundingDinoDecoder,
         RBLNGroundingDinoDecoderConfig,
         RBLNGroundingDinoEncoder,
@@ -276,6 +300,10 @@ if TYPE_CHECKING:
         RBLNOPTForCausalLMConfig,
         RBLNOPTModel,
         RBLNOPTModelConfig,
+        RBLNPaliGemmaForConditionalGeneration,
+        RBLNPaliGemmaForConditionalGenerationConfig,
+        RBLNPaliGemmaModel,
+        RBLNPaliGemmaModelConfig,
         RBLNPegasusForConditionalGeneration,
         RBLNPegasusForConditionalGenerationConfig,
         RBLNPegasusModel,
@@ -290,18 +318,26 @@ if TYPE_CHECKING:
         RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
         RBLNQwen2_5_VLForConditionalGeneration,
         RBLNQwen2_5_VLForConditionalGenerationConfig,
+        RBLNQwen2_5_VLModel,
+        RBLNQwen2_5_VLModelConfig,
         RBLNQwen2ForCausalLM,
         RBLNQwen2ForCausalLMConfig,
         RBLNQwen2Model,
         RBLNQwen2ModelConfig,
+        RBLNQwen2MoeForCausalLM,
+        RBLNQwen2MoeForCausalLMConfig,
         RBLNQwen2VisionTransformerPretrainedModel,
         RBLNQwen2VisionTransformerPretrainedModelConfig,
         RBLNQwen2VLForConditionalGeneration,
        RBLNQwen2VLForConditionalGenerationConfig,
+        RBLNQwen2VLModel,
+        RBLNQwen2VLModelConfig,
         RBLNQwen3ForCausalLM,
         RBLNQwen3ForCausalLMConfig,
         RBLNQwen3Model,
         RBLNQwen3ModelConfig,
+        RBLNQwen3MoeForCausalLM,
+        RBLNQwen3MoeForCausalLMConfig,
         RBLNResNetForImageClassification,
         RBLNResNetForImageClassificationConfig,
         RBLNRobertaForMaskedLM,
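Note: the export additions above make the new model classes in this release importable through the package's lazy-module machinery, for example:

    from optimum.rbln.transformers import (
        RBLNGemma2ForCausalLM,
        RBLNGptOssForCausalLM,
        RBLNPaliGemmaForConditionalGeneration,
        RBLNQwen2MoeForCausalLM,
        RBLNQwen3MoeForCausalLM,
    )

Matching entries are added to the top-level optimum/rbln/__init__.py (+48 in the file list), so the same names are expected to resolve from optimum.rbln as well.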