optimum-rbln 0.1.15__py3-none-any.whl → 0.2.1a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +26 -33
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/diffusers/__init__.py +4 -0
- optimum/rbln/{modeling_diffusers.py → diffusers/modeling_diffusers.py} +66 -24
- optimum/rbln/diffusers/models/__init__.py +2 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +38 -12
- optimum/rbln/diffusers/models/autoencoders/vae.py +0 -1
- optimum/rbln/diffusers/models/controlnet.py +1 -1
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +1 -1
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +5 -7
- optimum/rbln/diffusers/pipelines/__init__.py +1 -0
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +8 -7
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +17 -2
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -2
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +17 -2
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +17 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +23 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +1 -2
- optimum/rbln/modeling.py +13 -347
- optimum/rbln/modeling_base.py +24 -4
- optimum/rbln/modeling_config.py +31 -7
- optimum/rbln/ops/__init__.py +26 -0
- optimum/rbln/ops/attn.py +221 -0
- optimum/rbln/ops/flash_attn.py +70 -0
- optimum/rbln/ops/kv_cache_update.py +69 -0
- optimum/rbln/transformers/__init__.py +20 -0
- optimum/rbln/{modeling_alias.py → transformers/modeling_alias.py} +5 -1
- optimum/rbln/transformers/modeling_generic.py +385 -0
- optimum/rbln/transformers/models/auto/__init__.py +23 -0
- optimum/rbln/transformers/models/auto/modeling_auto.py +0 -1
- optimum/rbln/transformers/models/bart/__init__.py +0 -1
- optimum/rbln/transformers/models/bart/bart_architecture.py +107 -464
- optimum/rbln/transformers/models/bart/modeling_bart.py +8 -4
- optimum/rbln/transformers/models/clip/modeling_clip.py +1 -1
- optimum/rbln/transformers/models/decoderonly/__init__.py +0 -7
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +329 -328
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +92 -107
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +2 -3
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +10 -10
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +1 -1
- optimum/rbln/transformers/models/llama/llama_architecture.py +0 -1
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +1 -0
- optimum/rbln/transformers/models/midm/midm_architecture.py +11 -11
- optimum/rbln/transformers/models/midm/modeling_midm.py +0 -1
- optimum/rbln/transformers/models/mistral/mistral_architecture.py +0 -1
- optimum/rbln/transformers/models/phi/phi_architecture.py +2 -3
- optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +0 -1
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +57 -57
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +498 -0
- optimum/rbln/transformers/models/t5/__init__.py +0 -1
- optimum/rbln/transformers/models/t5/modeling_t5.py +5 -2
- optimum/rbln/transformers/models/t5/t5_architecture.py +106 -448
- optimum/rbln/transformers/models/whisper/generation_whisper.py +42 -0
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +77 -54
- optimum/rbln/transformers/models/whisper/whisper_architecture.py +219 -312
- optimum/rbln/transformers/utils/rbln_quantization.py +1 -2
- optimum/rbln/utils/decorator_utils.py +51 -15
- optimum/rbln/utils/import_utils.py +8 -1
- optimum/rbln/utils/logging.py +38 -1
- optimum/rbln/utils/model_utils.py +0 -1
- optimum/rbln/utils/runtime_utils.py +9 -3
- optimum/rbln/utils/save_utils.py +17 -0
- optimum/rbln/utils/submodule.py +23 -0
- optimum_rbln-0.2.1a0.dist-info/METADATA +121 -0
- {optimum_rbln-0.1.15.dist-info → optimum_rbln-0.2.1a0.dist-info}/RECORD +76 -72
- optimum_rbln-0.2.1a0.dist-info/licenses/LICENSE +288 -0
- optimum/rbln/transformers/cache_utils.py +0 -107
- optimum/rbln/utils/timer_utils.py +0 -43
- optimum_rbln-0.1.15.dist-info/METADATA +0 -106
- optimum_rbln-0.1.15.dist-info/licenses/LICENSE +0 -201
- {optimum_rbln-0.1.15.dist-info → optimum_rbln-0.2.1a0.dist-info}/WHEEL +0 -0
optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py

@@ -20,6 +20,7 @@
 # are the intellectual property of Rebellions Inc. and may not be
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
+
 import inspect
 from dataclasses import dataclass
 from pathlib import Path
@@ -27,28 +28,26 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Un

 import rebel
 import torch
+from rebel.compile_context import CompileContext
 from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
 from transformers.modeling_utils import no_init_weights
 from transformers.utils import ModelOutput

 from ....modeling import RBLNModel
-from ....modeling_config import
+from ....modeling_config import RBLNCompileConfig, RBLNConfig
 from ....utils.logging import get_logger
 from ....utils.runtime_utils import RBLNPytorchRuntime
-from ....utils.timer_utils import rbln_timer
 from ...utils.rbln_quantization import QuantizationManager
-from .decoderonly_architecture import
+from .decoderonly_architecture import (
+    DecoderOnlyWrapper,
+    validate_attention_method,
+)


 logger = get_logger()

 if TYPE_CHECKING:
-    from transformers import (
-        AutoFeatureExtractor,
-        AutoProcessor,
-        AutoTokenizer,
-        PretrainedConfig,
-    )
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PretrainedConfig


 class RBLNRuntimeModel(RBLNPytorchRuntime):
@@ -60,32 +59,21 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         inputs_embeds: torch.Tensor,
         attention_mask: torch.Tensor,
         cache_position: torch.Tensor,
-        batch_position: torch.Tensor,
-        query_idx: torch.Tensor,
         **kwargs,
     ):
         if inputs_embeds is None:
             inp = input_ids
             if self.embed_tokens is not None:
                 inp = self.embed_tokens(inp)
-
-            return super().forward(
-                inp,
-                attention_mask,
-                cache_position,
-                batch_position,
-                query_idx,
-                **kwargs,
-            )
         else:
-
-
-
-
-
-
-
-
+            inp = inputs_embeds
+
+        return super().forward(
+            inp,
+            attention_mask,
+            cache_position,
+            **kwargs,
+        )


 @dataclass
@@ -243,11 +231,8 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
     @classmethod
     def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
         wrapper_cfg = {"max_seq_len": rbln_config.model_cfg["max_seq_len"]}
-
-
-        if "kvcache_partition_len" in inspect.signature(cls._decoder_wrapper_cls.__init__).parameters:
-            wrapper_cfg["kvcache_partition_len"] = rbln_config.model_cfg.get("kvcache_partition_len")
-
+        wrapper_cfg["attn_impl"] = rbln_config.model_cfg.get("attn_impl")
+        wrapper_cfg["kvcache_partition_len"] = rbln_config.model_cfg.get("kvcache_partition_len")
         wrapper_cfg["use_rotary_emb"] = cls._use_rotary_emb

         return cls._decoder_wrapper_cls(model, **wrapper_cfg).eval()
@@ -258,72 +243,46 @@
         wrapped_model = cls.wrap_model_if_needed(model, rbln_config)

         rbln_compile_configs = rbln_config.compile_cfgs
-
-
+        prefill_compile_config = rbln_compile_configs[0]
+        dec_compile_config = rbln_compile_configs[1]

-
-        def get_scripted_model():
-            # This function is nested to dealloc the example inputs before compilation.
-            # FIXME: 3rd dummy_input(batch_idx) should be fill zero to compile flash_attn.
-            prefill_example_inputs = prefill_rbln_compile_config.get_dummy_inputs(fill=0)
-            dec_example_inputs = dec_rbln_compile_config.get_dummy_inputs(fill=0)
+        context = CompileContext(use_weight_sharing=True)

-
-
-
-            )
-            wrapped_model.phase = "decode"
-            dec_scripted_model = torch.jit.trace(
-                wrapped_model, dec_example_inputs, check_trace=False, _store_inputs=False
-            )
-            return prefill_scripted_model, dec_scripted_model
-
-        prefill_scripted_model, dec_scripted_model = get_scripted_model()
+        # Here we use meta tensor, for the memory efficiency.
+        meta_tensor_names = [name for name, _, _ in prefill_compile_config.input_info if "past_key_values" in name]
+        prefill_example_inputs = prefill_compile_config.get_dummy_inputs(fill=0, meta_tensor_names=meta_tensor_names)

-
-
-
-
-
-
-        dec_ir = rebel.torchscript_to_ir(
-            dec_scripted_model,
-            input_names=[v[0] for v in dec_rbln_compile_config.input_info],
-        )
-        return prefill_ir, dec_ir
+        # Mark static tensors (self kv states)
+        static_tensors = {}
+        for (name, _, _), tensor in zip(prefill_compile_config.input_info, prefill_example_inputs):
+            if "past_key_values" in name:
+                static_tensors[name] = tensor
+                context.mark_static_address(tensor)

-
-        # Caching prefill_decoder/decoder I/O
-        cache_index_offset = 5
+        dec_example_inputs = dec_compile_config.get_dummy_inputs(fill=0, static_tensors=static_tensors)

-        connections = [
-            (prefill_ir.outputs[1 + i], prefill_ir.inputs[cache_index_offset + i])
-            for i in range(model.config.num_hidden_layers * 2)
-        ]
-
-        # Extract quantize_config from rbln_config
         quantize_config = rbln_config.model_cfg.get("quantization", None)

         @QuantizationManager.with_quantization_env
         def compile_model(*args, **kwargs):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            wrapped_model.phase = "prefill"
+            compiled_prefill = RBLNModel.compile(
+                wrapped_model,
+                prefill_compile_config,
+                example_inputs=prefill_example_inputs,
+                compile_context=context,
+            )
+
+            wrapped_model.phase = "decode"
+            compiled_decoder = RBLNModel.compile(
+                wrapped_model,
+                dec_compile_config,
+                example_inputs=dec_example_inputs,
+                compile_context=context,
+            )
+            return {"prefill": compiled_prefill, "decoder": compiled_decoder}

-        return
+        return compile_model(quantize_config=quantize_config)

     @classmethod
     def _get_rbln_config(
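
The rewritten compile step above drops the TorchScript/IR plumbing and compiles the same wrapped model twice, once per phase, while CompileContext(use_weight_sharing=True) plus mark_static_address pin the past_key_values dummy tensors so the prefill and decode graphs address one shared KV cache. The snippet below is only a toy, pure-PyTorch sketch of that sharing idea, not optimum-rbln code; all names and sizes are made up.

import torch

# Toy sketch: one statically allocated KV buffer reused by a "prefill" and a
# "decode" step, mirroring what the shared static tensors above arrange for the
# two compiled graphs on-device.
MAX_SEQ_LEN, HEAD_DIM = 8, 4
kv_cache = torch.zeros(MAX_SEQ_LEN, HEAD_DIM)  # single buffer written by both phases

def prefill(keys: torch.Tensor) -> None:
    # write the whole prompt's keys into the shared cache
    kv_cache[: keys.shape[0]] = keys

def decode(new_key: torch.Tensor, pos: int) -> torch.Tensor:
    # append one key, then attend over everything cached so far
    kv_cache[pos] = new_key
    return kv_cache[: pos + 1]

prefill(torch.randn(3, HEAD_DIM))       # prompt of length 3
ctx = decode(torch.randn(HEAD_DIM), 3)  # first generated token sees 4 cached keys
print(ctx.shape)                        # torch.Size([4, 4])
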
@@ -335,6 +294,8 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         rbln_max_seq_len = rbln_kwargs.get("max_seq_len", None)
         rbln_batch_size = rbln_kwargs.get("batch_size", None)
         rbln_use_inputs_embeds = rbln_kwargs.get("use_inputs_embeds", None)
+        rbln_attn_impl = rbln_kwargs.get("attn_impl", None)
+        rbln_kvcache_partition_len = rbln_kwargs.get("kvcache_partition_len", None)
         rbln_quantization = QuantizationManager.validate_quantization_config(rbln_kwargs.get("quantization", None))

         prefill_chunk_size = 128
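
The new attn_impl and kvcache_partition_len entries are ordinary rbln_kwargs. Assuming the usual rbln_-prefixed keyword convention of optimum-rbln still applies, they would be supplied at export time roughly as below; the model id and values are placeholders, not taken from this diff.

from optimum.rbln import RBLNLlamaForCausalLM

# Hypothetical export call: rbln_attn_impl / rbln_kvcache_partition_len feed the
# attn_impl / kvcache_partition_len entries read from rbln_kwargs above.
model = RBLNLlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    export=True,
    rbln_max_seq_len=8192,
    rbln_attn_impl="flash_attn",
    rbln_kvcache_partition_len=4096,
)
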
@@ -344,9 +305,16 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         )
         if rbln_max_seq_len is None:
             raise ValueError("`rbln_max_seq_len` should be specified.")
+
         rbln_batch_size = 1 if rbln_batch_size is None else rbln_batch_size
         rbln_use_inputs_embeds = False if rbln_use_inputs_embeds is None else rbln_use_inputs_embeds

+        rbln_attn_impl, rbln_kvcache_partition_len = validate_attention_method(
+            rbln_attn_impl=rbln_attn_impl,
+            rbln_kvcache_partition_len=rbln_kvcache_partition_len,
+            rbln_max_seq_len=rbln_max_seq_len,
+        )
+
         num_attention_heads = getattr(model_config, "n_head", None) or getattr(model_config, "num_attention_heads")
         num_key_value_heads = getattr(model_config, "num_key_value_heads", None) or num_attention_heads
         num_hidden_layers = getattr(model_config, "n_layer", None) or getattr(model_config, "num_hidden_layers")
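
validate_attention_method itself lives in decoderonly_architecture.py and is not shown in this diff; the sketch below is only a guess at the kind of normalization such a helper might perform, not the actual implementation.

from typing import Optional, Tuple

# Hypothetical sketch only -- the real checks are defined elsewhere in the package.
def validate_attention_method(
    rbln_attn_impl: Optional[str],
    rbln_kvcache_partition_len: Optional[int],
    rbln_max_seq_len: int,
) -> Tuple[str, Optional[int]]:
    if rbln_attn_impl is None and rbln_kvcache_partition_len is not None:
        rbln_attn_impl = "flash_attn"  # a partition length only makes sense with flash attention
    rbln_attn_impl = rbln_attn_impl or "eager"
    if rbln_attn_impl == "flash_attn":
        if rbln_kvcache_partition_len is None:
            rbln_kvcache_partition_len = rbln_max_seq_len  # fall back to a single partition
        if rbln_max_seq_len % rbln_kvcache_partition_len != 0:
            raise ValueError("`rbln_max_seq_len` must be a multiple of `rbln_kvcache_partition_len`.")
    return rbln_attn_impl, rbln_kvcache_partition_len
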
@@ -372,9 +340,14 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                    [batch_size, query_length],
                    "int32",
                ),
-                ("batch_position", [], "int16"),
-                ("query_idx", [], "int16"),
            ]
+            if query_length > 1:
+                input_info.extend(
+                    [
+                        ("batch_position", [], "int16"),
+                        ("query_position", [], "int16"),
+                    ]
+                )

            input_info.extend(
                [
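
A condensed view of the conditional above: the two scalar position inputs now exist only in the prefill graph (query_length > 1), so the single-token decode graph loses them from its signature. The input_info entries are (name, shape, dtype) triples as elsewhere in this file.

def position_inputs(query_length: int):
    # condensed from the hunk above
    input_info = []
    if query_length > 1:
        input_info.extend(
            [
                ("batch_position", [], "int16"),
                ("query_position", [], "int16"),
            ]
        )
    return input_info

print(position_inputs(128))  # prefill graph: two extra scalar int16 inputs
print(position_inputs(1))    # decode graph: []
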
@@ -407,12 +380,12 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
             hidden_size=hidden_size,
         )

-
-
+        prefill_compile_config = RBLNCompileConfig(compiled_model_name="prefill", input_info=prefill_input_info)
+        dec_compile_config = RBLNCompileConfig(compiled_model_name="decoder", input_info=dec_input_info)

         rbln_config = RBLNConfig(
             rbln_cls=cls.__name__,
-            compile_cfgs=[
+            compile_cfgs=[prefill_compile_config, dec_compile_config],
             rbln_kwargs=rbln_kwargs,
         )

@@ -422,6 +395,8 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                 "batch_size": rbln_batch_size,
                 "prefill_chunk_size": prefill_chunk_size,
                 "use_inputs_embeds": rbln_use_inputs_embeds,
+                "kvcache_partition_len": rbln_kvcache_partition_len,
+                "attn_impl": rbln_attn_impl,
             }
         )

@@ -432,12 +407,21 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):

     @classmethod
     def _create_runtimes(
-        cls,
+        cls,
+        compiled_models: List[rebel.RBLNCompiledModel],
+        rbln_device_map: Dict[str, int],
+        activate_profiler: Optional[bool] = None,
     ) -> List[rebel.Runtime]:
-
+        if any(model_name not in rbln_device_map for model_name in ["prefill", "decoder"]):
+            cls._raise_missing_compiled_file_error(["prefill", "decoder"])
+
         return [
-            compiled_models[0].create_runtime(
-
+            compiled_models[0].create_runtime(
+                tensor_type="pt", device=rbln_device_map["prefill"], activate_profiler=activate_profiler
+            ),
+            compiled_models[1].create_runtime(
+                tensor_type="pt", device=rbln_device_map["decoder"], activate_profiler=activate_profiler
+            ),
         ]

     def get_decoder(self):
@@ -569,12 +553,16 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                 ],
                 dtype=torch.float32,
                 device="cpu",
-            )
-            torch.empty(size=[], dtype=torch.int16, device="cpu"),
+            )
         ]

         input_tensors = inputs_embeds if inputs_embeds is not None else input_ids
         query_length = input_tensors.shape[1]
+        if query_length > self.max_seq_len:
+            raise ValueError(
+                f"Input length ({query_length}) exceeds the maximum allowed sequence length ({self.max_seq_len})."
+            )
+
         _attention_mask = self.prefill_attention_mask.clone()

         for step in range(0, query_length, self.prefill_chunk_size):
@@ -607,15 +595,15 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
             _attention_mask[:, :, :, step - self.prefill_chunk_size : step] = 1
             _attention_mask[:, :, :, step : step + self.prefill_chunk_size] = self.causal_mask

-
+            query_position = (query_length - 1) % self.prefill_chunk_size

-            logits
+            logits = self.prefill_decoder(
                 input_ids=_input_tensors.contiguous() if inputs_embeds is None else None,
                 inputs_embeds=_input_tensors.contiguous() if inputs_embeds is not None else None,
                 attention_mask=_attention_mask.contiguous(),
                 cache_position=_cache_position.contiguous(),
                 batch_position=torch.tensor(batch_idx, dtype=torch.int16),
-
+                query_position=torch.tensor(query_position, dtype=torch.int16),
                 out=out_buffers,
             )

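
query_position is the offset of the last real prompt token inside the final prefill chunk, presumably so the prefill graph can pick out that position's logits. A quick check with the prefill_chunk_size = 128 set earlier in this file and an assumed 300-token prompt:

prefill_chunk_size = 128
query_length = 300                                       # assumed prompt length
query_position = (query_length - 1) % prefill_chunk_size
print(query_position)                                    # 43: offset of token 299 within the third chunk
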
@@ -651,14 +639,11 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                     f"Decoding step {decoding_step} out of bounds for attention mask with shape {self.dec_attn_mask.shape}."
                 )
             self.dec_attn_mask[b_idx, :, :, decoding_step] = 1
-
-        logits, _ = self.decoder(
+        logits = self.decoder(
             input_ids=input_tensors.contiguous() if inputs_embeds is None else None,
             inputs_embeds=input_tensors.contiguous() if inputs_embeds is not None else None,
             attention_mask=self.dec_attn_mask.contiguous(),
             cache_position=cache_position.contiguous(),
-            batch_position=torch.tensor(0, dtype=torch.int16),
-            query_idx=torch.tensor(0, dtype=torch.int16),
         )

         return logits

optimum/rbln/transformers/models/exaone/exaone_architecture.py

@@ -20,6 +20,7 @@
 # are the intellectual property of Rebellions Inc. and may not be
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
+
 from typing import TYPE_CHECKING

 import torch.nn as nn
@@ -58,7 +59,7 @@ class ExaoneForCausalLMWrapper(DecoderOnlyWrapper):

             new_layer = ExaoneLayer(layer, new_self_attn)
             new_layers.append(new_layer)
-        new_model = ExaoneModel(causal_lm.transformer, new_layers)
+        new_model = ExaoneModel(causal_lm.transformer, new_layers, partition_len=self.kvcache_partition_len)
         new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
         return new_causal_lm

@@ -85,7 +86,6 @@ class ExaoneAttention(DecoderOnlyAttention):
         self.k_proj = self._original_mod.k_proj
         self.v_proj = self._original_mod.v_proj
         self.o_proj = self._original_mod.out_proj
-        self.num_key_value_heads = self._original_mod.num_key_value_heads


 class ExaoneFlashAttention(DecoderOnlyFlashAttention):
@@ -94,4 +94,3 @@ class ExaoneFlashAttention(DecoderOnlyFlashAttention):
         self.k_proj = self._original_mod.k_proj
         self.v_proj = self._original_mod.v_proj
         self.o_proj = self._original_mod.out_proj
-        self.num_key_value_heads = self._original_mod.num_key_value_heads

optimum/rbln/transformers/models/gemma/gemma_architecture.py

@@ -51,7 +51,7 @@ class GemmaWrapper(DecoderOnlyWrapper):
                raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")
            new_layer = DecoderOnlyLayer(layer, new_self_attn)
            new_layers.append(new_layer)
-        new_model = GemmaModel(causal_lm.model, new_layers)
+        new_model = GemmaModel(causal_lm.model, new_layers, partition_len=self.kvcache_partition_len)
         new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
         return new_causal_lm


optimum/rbln/transformers/models/gpt2/gpt2_architecture.py

@@ -21,6 +21,7 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.

+import math
 from typing import TYPE_CHECKING, Tuple

 import torch
@@ -54,8 +55,6 @@ class GPT2Wrapper(DecoderOnlyWrapper):


 class GPT2Model(DecoderOnlyModel):
-    mask_fmin = torch.finfo(torch.float32).min
-
     def get_last_layernorm(self) -> nn.LayerNorm:
         return self._original_mod.ln_f

@@ -79,16 +78,17 @@ class GPT2Attention(DecoderOnlyAttention):
         self.c_attn = self._original_mod.c_attn
         self.o_proj = self._original_mod.c_proj
         self.split_size = self._original_mod.split_size
-        self.num_key_value_heads = self._original_mod.num_heads

     def projection(self, hidden_states) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
         return query_states, key_states, value_states

-    def
-
-
-
-
-
-
+    def get_attn_scale(self):
+        scale = 1.0
+        if self._original_mod.scale_attn_weights:
+            scale /= math.sqrt(self.head_dim)
+
+        if self._original_mod.scale_attn_by_inverse_layer_idx:
+            scale /= 1 + self.layer_idx
+
+        return scale
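
get_attn_scale reproduces the score scaling of Hugging Face's GPT2Attention: divide by sqrt(head_dim) when scale_attn_weights is set, and additionally by layer_idx + 1 when scale_attn_by_inverse_layer_idx is set. A quick check with assumed values (head_dim=64, layer_idx=3):

import math

head_dim, layer_idx = 64, 3      # assumed values, not from the diff
scale = 1.0
scale /= math.sqrt(head_dim)     # scale_attn_weights              -> 1/8
scale /= 1 + layer_idx           # scale_attn_by_inverse_layer_idx -> a further 1/4 on layer 3
print(scale)                     # 0.03125 == 1/32
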

optimum/rbln/transformers/models/gpt2/modeling_gpt2.py

@@ -23,7 +23,7 @@

 from ....utils import logging
 from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
-from .gpt2_architecture import GPT2Wrapper
+from .gpt2_architecture import GPT2Wrapper


 logger = logging.get_logger(__name__)

optimum/rbln/transformers/models/midm/midm_architecture.py

@@ -21,12 +21,12 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.

+import math
 from typing import TYPE_CHECKING, Tuple

 import torch
 import torch.nn as nn

-from ....transformers.models.decoderonly.decoderonly_architecture import rotate_half
 from ..decoderonly.decoderonly_architecture import (
     DecoderOnlyAttention,
     DecoderOnlyForCausalLM,
@@ -34,6 +34,7 @@ from ..decoderonly.decoderonly_architecture import (
     DecoderOnlyModel,
     DecoderOnlyWrapper,
     apply_rotary_pos_emb_partial,
+    rotate_half,
 )


@@ -77,8 +78,6 @@ class MidmLMHeadModelWrapper(DecoderOnlyWrapper):


 class MidmModel(DecoderOnlyModel):
-    mask_fmin = -10000.0
-
     def get_layernorm1p(self, module: nn.LayerNorm):
         def layernorm1p(input: torch.Tensor):
             """Applies Layer Normalization with a slight modification on the weights."""
@@ -135,14 +134,15 @@ class MidmAttention(DecoderOnlyAttention):
         query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
         return query_states, key_states, value_states

-    def
-
-
-
-
-
-
-
+    def get_attn_scale(self):
+        scale = 1.0
+        if self._original_mod.scale_attn_weights:
+            scale /= math.sqrt(self.head_dim)
+
+        if self._original_mod.scale_attn_by_inverse_layer_idx and not self._original_mod.scale_qk_by_inverse_layer_idx:
+            scale /= 1 + self.layer_idx
+
+        return scale

     def apply_rotary_pos_embed(self, query_states, key_states, cos, sin):
         return apply_rotary_pos_emb_partial(query_states, key_states, cos, sin, ndim=cos.shape[-1])

optimum/rbln/transformers/models/phi/phi_architecture.py

@@ -65,7 +65,6 @@ class PhiAttention(DecoderOnlyAttention):
         self.o_proj = self._original_mod.dense
         self.qk_layernorm = self._original_mod.qk_layernorm
         self.rotary_ndims = self._original_mod.rotary_ndims
-        self.num_key_value_heads = self.num_heads

     def projection(self, hidden_states) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         query_states = self.q_proj(hidden_states)
@@ -90,7 +89,7 @@ class PhiLayer(DecoderOnlyLayer):
         self,
         hidden_states: torch.Tensor,
         attention_mask: torch.Tensor,
-
+        seq_positions: torch.LongTensor,
         batch_position: torch.Tensor,
         past_key_values: Tuple[Tuple[torch.Tensor]],
         cos: Optional[torch.Tensor] = None,
@@ -103,7 +102,7 @@
         attn_outputs, present_key_values = self.self_attn(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
-
+            seq_positions=seq_positions,
             batch_position=batch_position,
             past_key_values=past_key_values,
             cos=cos,