optimum-rbln 0.1.4__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- optimum/rbln/__init__.py +7 -0
- optimum/rbln/__version__.py +1 -1
- optimum/rbln/diffusers/models/autoencoder_kl.py +16 -98
- optimum/rbln/diffusers/models/unet_2d_condition.py +1 -1
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +9 -11
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +8 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +9 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +3 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +8 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +8 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +9 -0
- optimum/rbln/modeling_base.py +172 -100
- optimum/rbln/modeling_seq2seq.py +58 -132
- optimum/rbln/transformers/__init__.py +2 -0
- optimum/rbln/transformers/models/__init__.py +1 -0
- optimum/rbln/transformers/models/clip/modeling_clip.py +0 -1
- optimum/rbln/transformers/models/dpt/__init__.py +24 -0
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +89 -0
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +24 -33
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +52 -124
- optimum/rbln/transformers/models/llama/llama_architecture.py +13 -16
- optimum/rbln/transformers/models/llama/llama_architecture_cb.py +41 -36
- optimum/rbln/transformers/models/llama/modeling_llama.py +94 -120
- optimum/rbln/transformers/models/midm/modeling_midm.py +85 -121
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +53 -123
- optimum/rbln/utils/__init__.py +1 -1
- optimum/rbln/utils/import_utils.py +46 -0
- {optimum_rbln-0.1.4.dist-info → optimum_rbln-0.1.7.dist-info}/METADATA +17 -51
- {optimum_rbln-0.1.4.dist-info → optimum_rbln-0.1.7.dist-info}/RECORD +31 -29
- {optimum_rbln-0.1.4.dist-info → optimum_rbln-0.1.7.dist-info}/WHEEL +1 -1
- {optimum_rbln-0.1.4.dist-info → optimum_rbln-0.1.7.dist-info}/licenses/LICENSE +0 -0
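
The recurring change visible in the hunks below is structural: `RBLNLlamaForCausalLM` and `RBLNMidmLMHeadModel` drop their per-model export plumbing (temporary save directories, `maybe_save_preprocessors`, manual `compiled_model.save`, `_from_pretrained` round-trips) and instead expose a small set of hooks to a shared `RBLNModel` base class: `update_kwargs`, `get_pytorch_model`, `get_compiled_model`, and `_create_runtimes`. The sketch below is illustrative only, assuming the base class drives those hooks roughly in this order; the hook names come from the diff, while everything else (the `load` driver, the toy return values, the `"compiled_model"` device-map key) is made up for the example.

```python
# Illustrative sketch only: the driver ("load") and toy return values are
# assumptions about how a shared base class could sequence the new hooks; only
# the hook names (update_kwargs, get_pytorch_model, get_compiled_model,
# _create_runtimes) are taken from this diff.
from typing import Any, Dict, List


class RBLNModelSketch:
    """Toy stand-in for the shared RBLNModel base class the subclasses now rely on."""

    @classmethod
    def load(cls, model_id: str, **kwargs) -> List[Any]:
        # Split rbln_* options from plain Hugging Face kwargs.
        rbln_kwargs = {k: v for k, v in kwargs.items() if k.startswith("rbln_")}
        hf_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("rbln_")}

        hf_kwargs = cls.update_kwargs(hf_kwargs)  # force torchscript-friendly settings
        model = cls.get_pytorch_model(model_id, rbln_config_kwargs=rbln_kwargs, **hf_kwargs)
        compiled = cls.get_compiled_model(model, rbln_config=rbln_kwargs)
        return cls._create_runtimes([compiled], rbln_device_map={"compiled_model": 0})

    # Hooks a backend (Llama, Midm, ...) overrides as needed.
    @classmethod
    def update_kwargs(cls, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        kwargs.update({"torchscript": True, "return_dict": False})
        return kwargs

    @classmethod
    def get_pytorch_model(cls, model_id: str, **kwargs) -> Any:
        return ("pytorch-model", model_id)  # placeholder for the AutoModel loading step

    @classmethod
    def get_compiled_model(cls, model: Any, rbln_config: Dict[str, Any]) -> Any:
        return ("compiled", model)  # placeholder for torch.jit.trace + rebel.compile

    @classmethod
    def _create_runtimes(cls, compiled_models: List[Any], rbln_device_map: Dict[str, int]) -> List[Any]:
        return [("runtime", 0), ("runtime", 1)]  # prefill / token-by-token decode pair


print(RBLNModelSketch.load("some-llama-checkpoint", rbln_max_seq_len=4096, trust_remote_code=True))
```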
optimum/rbln/transformers/models/llama/modeling_llama.py

@@ -23,22 +23,18 @@
 
 import inspect  # noqa: I001
 import logging
-from pathlib import Path
-from tempfile import TemporaryDirectory
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch  # noqa: F401
 import rebel  # noqa: F401
 
-from
-from transformers import AutoModelForCausalLM, LlamaForCausalLM, PretrainedConfig, AutoConfig
+from transformers import AutoModelForCausalLM, LlamaForCausalLM, PreTrainedModel, PretrainedConfig, AutoConfig
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
 from ...generation.utils import RBLNGenerationMixin
-from ....modeling_base import
+from ....modeling_base import RBLNModel
 from ....modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNConfig, RBLNRuntimeConfig
 from ....utils.runtime_utils import RBLNPytorchRuntime
-from ....utils.save_utils import maybe_save_preprocessors
 
 
 # FIXME:: Merge Two architecture Codes
@@ -72,10 +68,10 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
     mandatory_members = ["main_input_name"]
 
 
-class RBLNLlamaForCausalLM(
+class RBLNLlamaForCausalLM(RBLNModel, RBLNGenerationMixin):
     """
     The Llama Model transformer with a language modeling head (linear layer) on top.
-    This model inherits from [`
+    This model inherits from [`RBLNMultiModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
 
     A class to convert and run pre-trained transformers based LlamaForCausalLM model on RBLN devices.
     It implements the methods to convert a pre-trained transformers LlamaForCausalLM model into a RBLN transformer model by:
@@ -83,7 +79,6 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):
     - compiling the resulting graph using the RBLN compiler.
     """
 
-    model_type = "rbln_model"
     main_input_name = "input_ids"
     auto_model_class = AutoModelForCausalLM
 
@@ -102,17 +97,34 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):
         )
         self.decoder_attention_mask = torch.zeros(self.batch_size, 1, 1, self.max_seq_len, dtype=torch.int64)
 
-        self.prefill_decoder = RBLNRuntimeModel(runtime=self.
-        self.decoder = RBLNRuntimeModel(runtime=self.
+        self.prefill_decoder = RBLNRuntimeModel(runtime=self.model[0], main_input_name="input_ids")
+        self.decoder = RBLNRuntimeModel(runtime=self.model[1], main_input_name="input_ids")
         self.past_cached_length = 0
         self.right_padding = True
 
     @classmethod
-
-
+    def update_kwargs(cls, kwargs):
+        """
+        Update user-given kwargs to get proper pytorch model.
+
+        For example, `torchscript`=True should be set because torch.jit
+        does not support `transformers` output instances as module output;
+        """
+        kwargs.update(
+            {
+                "torchscript": True,
+                "return_dict": False,
+                "use_cache": True,
+                "torch_dtype": torch.float32,
+                "_attn_implementation": "eager",
+            }
+        )
+        return kwargs
+
+    @classmethod
+    def get_pytorch_model(
         cls,
         model_id: str,
-        config: "PretrainedConfig",
         use_auth_token: Optional[Union[bool, str]] = None,
         revision: Optional[str] = None,
         force_download: bool = False,
@@ -120,135 +132,94 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):
         subfolder: str = "",
         local_files_only: bool = False,
         trust_remote_code: bool = False,
-
+        rbln_config_kwargs: Optional[Dict[str, Any]] = None,
+        rbln_constructor_kwargs: Optional[Dict[str, Any]] = None,
         **kwargs,
-    ) ->
-
-
-
-
-
-
-
-
-
-        if isinstance(save_dir, TemporaryDirectory):
-            save_dir_path = Path(model_save_dir.name)
-        else:
-            save_dir_path = Path(model_save_dir)
-            save_dir_path.mkdir(exist_ok=True)
-
-        def update_configs(kwargs):
-            hf_max_position_embeddings = getattr(AutoConfig.from_pretrained(model_id), "max_position_embeddings", None)
-            max_seq_len = kwargs.get("rbln_max_seq_len", None)
-            if max_seq_len is not None:
-                if max_seq_len <= hf_max_position_embeddings:
-                    kwargs.update({"max_position_embeddings": max_seq_len})
-                else:
-                    raise ValueError("`max_seq_len` should be less or equal than max_position_embeddings!")
-
-            kwargs.update(
-                {
-                    "torchscript": True,
-                    "return_dict": False,
-                    "use_cache": True,
-                    "torch_dtype": torch.float32,
-                    "_attn_implementation": "eager",
-                }
-            )
-
-            return kwargs
-
-        kwargs = update_configs(kwargs)
-
-        rbln_config_kwargs, rbln_constructor_kwargs = cls.pop_rbln_kwargs_from_kwargs(kwargs)
+    ) -> PreTrainedModel:
+        if rbln_max_seq_len := rbln_config_kwargs.get("rbln_max_seq_len", None):
+            config = AutoConfig.from_pretrained(model_id)
+            if hf_position_embedding := getattr(config, "max_position_embeddings", None):
+                if hf_position_embedding < rbln_max_seq_len:
+                    logger.warning(
+                        f"`rbln_max_seq_len` is larger than original config({hf_position_embedding})."
+                        "This may lead to incorrect inferences of the model."
+                    )
+            kwargs.update({"max_position_embeddings": rbln_max_seq_len})
 
         # FIXME :: This should be moved when wrapping removed.
         use_continuous_batch = rbln_config_kwargs.get("rbln_batching", "static") == "vllm"
-
+        wrap_llama_cb() if use_continuous_batch else wrap_llama()
 
-        model
-
-
-            subfolder=subfolder,
+        model = super().get_pytorch_model(
+            model_id=model_id,
+            use_auth_token=use_auth_token,
             revision=revision,
-
+            force_download=force_download,
             cache_dir=cache_dir,
-
+            subfolder=subfolder,
             local_files_only=local_files_only,
-            force_download=force_download,
             trust_remote_code=trust_remote_code,
+            rbln_config_kwargs=rbln_config_kwargs,
+            rbln_constructor_kwargs=rbln_constructor_kwargs,
            **kwargs,
        )
 
-
-        config = model.config
+        unwrap_llama()
 
-
-        preprocessors = maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder)
+        return model
 
-
-
-
-
-        )
+    @classmethod
+    @torch.inference_mode()
+    def get_compiled_model(cls, model: "PreTrainedModel", rbln_config: RBLNConfig):
+        use_continuous_batch = rbln_config.meta["rbln_batching"] == "vllm"
 
-
-        wrapped_model = wrapper_cls(model).eval()
+        wrapper_cls = LlamaWrapper_cb if use_continuous_batch else LlamaWrapper
 
-
-        dec_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][1]
+        wrapped_model = wrapper_cls(model).eval()
 
-
-
+        prefill_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][0]
+        dec_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][1]
 
-
-
-        dec_example_inputs[batch_index_index].fill_(-1) # fill batch_position -1 to indicate it is decoder.
+        prefill_example_inputs = prefill_rbln_runtime_config.get_dummy_inputs(fill=0)
+        dec_example_inputs = dec_rbln_runtime_config.get_dummy_inputs(fill=4)
 
-
-
+        if use_continuous_batch:
+            batch_index_index = 3
+            dec_example_inputs[batch_index_index].fill_(-1) # fill batch_position -1 to indicate it is decoder.
 
-
-            prefill_scripted_model,
-            input_names=[v[0] for v in prefill_rbln_runtime_config.input_info],
-        )
-        dec_ir = rebel.torchscript_to_ir(
-            dec_scripted_model,
-            input_names=[v[0] for v in dec_rbln_runtime_config.input_info],
-        )
+        wrap_llama_cb() if use_continuous_batch else wrap_llama()
 
-
-
-        connections = [
-            (prefill_ir.outputs[1 + i], prefill_ir.inputs[cache_index_offset + i])
-            for i in range(model.config.num_hidden_layers * 2)
-        ]
+        prefill_scripted_model = torch.jit.trace(wrapped_model, prefill_example_inputs, check_trace=False)
+        dec_scripted_model = torch.jit.trace(wrapped_model, dec_example_inputs, check_trace=False)
 
-
-            prefill_ir,
-            dec_ir,
-            connections=connections,
-            fusion=prefill_rbln_runtime_config.fusion,
-            npu=prefill_rbln_runtime_config.npu,
-            tensor_parallel_size=prefill_rbln_runtime_config.tensor_parallel_size,
-            use_weight_sharing=True,
-        )
-        compiled_model.save(save_dir_path / f"{DEFAULT_COMPILED_MODEL_NAME}.rbln")
+        unwrap_llama()
 
-
-
-
+        prefill_ir = rebel.torchscript_to_ir(
+            prefill_scripted_model,
+            input_names=[v[0] for v in prefill_rbln_runtime_config.input_info],
+        )
+        dec_ir = rebel.torchscript_to_ir(
+            dec_scripted_model,
+            input_names=[v[0] for v in dec_rbln_runtime_config.input_info],
+        )
 
-
+        # Caching prefill_decoder/decoder I/O
+        cache_index_offset = 4 if use_continuous_batch else 3
+        connections = [
+            (prefill_ir.outputs[1 + i], prefill_ir.inputs[cache_index_offset + i])
+            for i in range(model.config.num_hidden_layers * 2)
+        ]
 
-
-
-
-
-
-
+        compiled_model = rebel.compile(
+            prefill_ir,
+            dec_ir,
+            connections=connections,
+            fusion=prefill_rbln_runtime_config.fusion,
+            npu=prefill_rbln_runtime_config.npu,
+            tensor_parallel_size=prefill_rbln_runtime_config.tensor_parallel_size,
+            use_weight_sharing=True,
         )
+        return compiled_model
 
     @classmethod
     def _get_rbln_config(
@@ -338,11 +309,14 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):
 
         return rbln_config
 
-
+    @classmethod
+    def _create_runtimes(
+        cls, compiled_models: List[rebel.RBLNCompiledModel], rbln_device_map: Dict[str, int]
+    ) -> List[rebel.Runtime]:
         device_val = rbln_device_map[DEFAULT_COMPILED_MODEL_NAME]
         return [
-
-
+            compiled_models[0].create_runtime(input_info_index=0, tensor_type="pt", device=device_val),
+            compiled_models[0].create_runtime(input_info_index=1, tensor_type="pt", device=device_val),
         ]
 
     def get_decoder(self):
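
The `connections` list built in `get_compiled_model` above pairs each KV-cache output of the prefill graph with the matching cache input, so the prefill and decode graphs share one cache buffer per layer (two entries per layer: one key cache and one value cache); output index 0 is presumably the logits, hence the `1 + i` offset. A standalone illustration of the index arithmetic, with a made-up layer count:

```python
# Standalone illustration of the cache-connection index arithmetic used above.
# The layer count is made up; the offset is 3 for static batching and 4 for
# continuous batching ("vllm"), which adds a batch_position input to the graph.
num_hidden_layers = 2
use_continuous_batch = False

cache_index_offset = 4 if use_continuous_batch else 3
connections = [
    (f"prefill.outputs[{1 + i}]", f"prefill.inputs[{cache_index_offset + i}]")
    for i in range(num_hidden_layers * 2)  # one key cache and one value cache per layer
]
for out_name, in_name in connections:
    print(out_name, "->", in_name)
# prefill.outputs[1] -> prefill.inputs[3]
# prefill.outputs[2] -> prefill.inputs[4]
# prefill.outputs[3] -> prefill.inputs[5]
# prefill.outputs[4] -> prefill.inputs[6]
```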
optimum/rbln/transformers/models/midm/modeling_midm.py

@@ -23,20 +23,16 @@
 
 import inspect
 import logging
-from pathlib import Path
-from tempfile import TemporaryDirectory
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import rebel
 import torch
-from
-from transformers import AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
+from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
 
-from ....modeling_base import
+from ....modeling_base import RBLNModel
 from ....modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNConfig, RBLNRuntimeConfig
 from ....utils.runtime_utils import RBLNPytorchRuntime
-from ....utils.save_utils import maybe_save_preprocessors
 from ...generation.utils import RBLNGenerationMixin
 from .hf_hub_cached.modeling_midm import MidmLMHeadModel
 from .midm_architecture import (
@@ -74,7 +70,7 @@ class RBLNRuntimeDecoder(RBLNPytorchRuntime):
         return logits
 
 
-class RBLNMidmLMHeadModel(
+class RBLNMidmLMHeadModel(RBLNModel, RBLNGenerationMixin):
     """
     The Midm Model transformer with a language modeling head on top (linear layer with weights tied to the input
     embeddings).
@@ -122,8 +118,8 @@ class RBLNMidmLMHeadModel(RBLNBaseModel, RBLNGenerationMixin):
             torch.ones(self.batch_size, 1, self.prefill_chunk_size, self.prefill_chunk_size), diagonal=1
         )
 
-        self.prefill_decoder = RBLNRuntimeDecoder(runtime=self.
-        self.decoder = RBLNRuntimeDecoder(runtime=self.
+        self.prefill_decoder = RBLNRuntimeDecoder(runtime=self.model[0], main_input_name="input_ids")
+        self.decoder = RBLNRuntimeDecoder(runtime=self.model[1], main_input_name="input_ids")
         self.past_cached_length = 0
 
     def can_generate(self):
@@ -149,10 +145,63 @@ class RBLNMidmLMHeadModel(RBLNBaseModel, RBLNGenerationMixin):
         raise NotImplementedError
 
     @classmethod
-
+    @torch.inference_mode()
+    def get_compiled_model(cls, model: "PreTrainedModel", rbln_config: RBLNConfig):
+        wrapped_decoder = MidmLMHeadModelWrapper(model).eval()
+        prefill_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][0]
+        dec_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][1]
+
+        prefill_example_inputs = prefill_rbln_runtime_config.get_dummy_inputs(fill=0)
+        dec_example_inputs = dec_rbln_runtime_config.get_dummy_inputs(fill=0)
+
+        prefill_scripted_model = torch.jit.trace(wrapped_decoder, prefill_example_inputs, check_trace=False)
+        dec_scripted_model = torch.jit.trace(wrapped_decoder, dec_example_inputs, check_trace=False)
+
+        prefill_ir = rebel.torchscript_to_ir(
+            prefill_scripted_model,
+            input_names=[v[0] for v in prefill_rbln_runtime_config.input_info],
+        )
+        dec_ir = rebel.torchscript_to_ir(
+            dec_scripted_model,
+            input_names=[v[0] for v in dec_rbln_runtime_config.input_info],
+        )
+
+        connections = [(prefill_ir.outputs[1 + i], prefill_ir.inputs[3 + i]) for i in range(model.config.n_layer * 2)]
+
+        compiled_model = rebel.compile(
+            prefill_ir,
+            dec_ir,
+            connections=connections,
+            fusion=prefill_rbln_runtime_config.fusion,
+            npu=prefill_rbln_runtime_config.npu,
+            tensor_parallel_size=prefill_rbln_runtime_config.tensor_parallel_size,
+            use_weight_sharing=True,
+        )
+        return compiled_model
+
+    @classmethod
+    def update_kwargs(cls, kwargs):
+        """
+        Update user-given kwargs to get proper pytorch model.
+
+        For example, `torchscript`=True should be set because torch.jit
+        does not support `transformers` output instances as module output;
+        """
+        kwargs.update(
+            {
+                "torchscript": True,
+                "return_dict": False,
+                "use_cache": True,
+                "torch_dtype": torch.float32,
+                "_attn_implementation": "eager",
+            }
+        )
+        return kwargs
+
+    @classmethod
+    def get_pytorch_model(
         cls,
         model_id: str,
-        config: "PretrainedConfig",
         use_auth_token: Optional[Union[bool, str]] = None,
         revision: Optional[str] = None,
         force_download: bool = False,
@@ -160,120 +209,35 @@ class RBLNMidmLMHeadModel(RBLNBaseModel, RBLNGenerationMixin):
         subfolder: str = "",
         local_files_only: bool = False,
         trust_remote_code: bool = False,
-
+        rbln_config_kwargs: Optional[Dict[str, Any]] = None,
+        rbln_constructor_kwargs: Optional[Dict[str, Any]] = None,
         **kwargs,
-    ) ->
-
-
-
-
-
-
-
-
-
-        save_dir = model_save_dir
-        if isinstance(save_dir, TemporaryDirectory):
-            save_dir_path = Path(model_save_dir.name)
-        else:
-            save_dir_path = Path(model_save_dir)
-            save_dir_path.mkdir(exist_ok=True)
-
-        def update_configs(kwargs):
-            max_seq_len = kwargs.get("rbln_max_seq_len", None)
-            if max_seq_len is not None:
-                kwargs.update({"max_position_embeddings": max_seq_len})
-
-            kwargs.update(
-                {
-                    "torchscript": True,
-                    "return_dict": False,
-                    "use_cache": True,
-                    "torch_dtype": torch.float32,
-                    "_attn_implementation": "eager",
-                }
-            )
-
-            return kwargs
-
-        kwargs = update_configs(kwargs)
-
-        rbln_config_kwargs, rbln_constructor_kwargs = cls.pop_rbln_kwargs_from_kwargs(kwargs)
+    ) -> PreTrainedModel:
+        if rbln_max_seq_len := rbln_config_kwargs.get("rbln_max_seq_len", None):
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
+            if hf_position_embedding := getattr(config, "max_position_embeddings", None):
+                if hf_position_embedding < rbln_max_seq_len:
+                    logger.warning(
+                        f"`rbln_max_seq_len` is larger than original config({hf_position_embedding})."
+                        "This may lead to incorrect inferences of the model."
+                    )
+            kwargs.update({"max_position_embeddings": rbln_max_seq_len})
 
-
-
-
-            subfolder=subfolder,
+        return super().get_pytorch_model(
+            model_id=model_id,
+            use_auth_token=use_auth_token,
             revision=revision,
-
+            force_download=force_download,
             cache_dir=cache_dir,
-
+            subfolder=subfolder,
             local_files_only=local_files_only,
-            force_download=force_download,
             trust_remote_code=trust_remote_code,
+            rbln_config_kwargs=rbln_config_kwargs,
+            rbln_constructor_kwargs=rbln_constructor_kwargs,
             ignore_mismatched_sizes=True,
             **kwargs,
         )
 
-        if config is None:
-            config = model.config
-
-        config.save_pretrained(save_dir_path)
-        preprocessors = maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder)
-
-        # Get compilation arguments
-        if rbln_config_kwargs.get("rbln_config", None) is None:
-            rbln_config = cls.get_rbln_config(
-                preprocessors=preprocessors, model_config=model.config, **rbln_config_kwargs
-            )
-
-        def compile_midm():
-            wrapped_decoder = MidmLMHeadModelWrapper(model).eval()
-            prefill_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][0]
-            dec_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][1]
-
-            prefill_example_inputs = prefill_rbln_runtime_config.get_dummy_inputs(fill=0)
-            dec_example_inputs = dec_rbln_runtime_config.get_dummy_inputs(fill=0)
-
-            prefill_scripted_model = torch.jit.trace(wrapped_decoder, prefill_example_inputs)
-            dec_scripted_model = torch.jit.trace(wrapped_decoder, dec_example_inputs)
-
-            prefill_ir = rebel.torchscript_to_ir(
-                prefill_scripted_model,
-                input_names=[v[0] for v in prefill_rbln_runtime_config.input_info],
-            )
-            dec_ir = rebel.torchscript_to_ir(
-                dec_scripted_model,
-                input_names=[v[0] for v in dec_rbln_runtime_config.input_info],
-            )
-
-            connections = [
-                (prefill_ir.outputs[1 + i], prefill_ir.inputs[3 + i]) for i in range(model.config.n_layer * 2)
-            ]
-
-            compiled_model = rebel.compile(
-                prefill_ir,
-                dec_ir,
-                connections=connections,
-                fusion=prefill_rbln_runtime_config.fusion,
-                npu=prefill_rbln_runtime_config.npu,
-                tensor_parallel_size=prefill_rbln_runtime_config.tensor_parallel_size,
-                use_weight_sharing=True,
-            )
-            compiled_model.save(save_dir_path / f"{DEFAULT_COMPILED_MODEL_NAME}.rbln")
-
-        compile_midm()
-
-        rbln_config.save(save_dir_path)
-
-        return cls._from_pretrained(
-            model_id=save_dir_path,
-            config=config,
-            model_save_dir=save_dir,
-            **rbln_constructor_kwargs,
-            **kwargs,
-        )
-
     @classmethod
     def _get_rbln_config(
         cls,
@@ -345,11 +309,14 @@ class RBLNMidmLMHeadModel(RBLNBaseModel, RBLNGenerationMixin):
 
         return rbln_config
 
-
+    @classmethod
+    def _create_runtimes(
+        cls, compiled_models: List[rebel.RBLNCompiledModel], rbln_device_map: Dict[str, int]
+    ) -> List[rebel.Runtime]:
         device_val = rbln_device_map[DEFAULT_COMPILED_MODEL_NAME]
         return [
-
-
+            compiled_models[0].create_runtime(input_info_index=0, tensor_type="pt", device=device_val),
+            compiled_models[0].create_runtime(input_info_index=1, tensor_type="pt", device=device_val),
         ]
 
     def prepare_inputs_for_generation(self, input_ids, past_key_values=0, attention_mask=None, **kwargs):
@@ -421,6 +388,3 @@ class RBLNMidmLMHeadModel(RBLNBaseModel, RBLNGenerationMixin):
             cache_position=cache_position,
         )
         return CausalLMOutputWithCrossAttentions(logits=output, past_key_values=past_cached_length)
-
-    def __repr__(self):
-        return repr(self.runtimes[0]) + "\n" + repr(self.runtimes[1])
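
Both backends also gain the same `_create_runtimes` hook: a single compiled artifact exposes two entry points (`input_info_index=0` for the prefill pass, `input_info_index=1` for token-by-token decoding), and `__init__` picks them up as `self.model[0]` / `self.model[1]` when building the runtime wrappers. A toy stand-in for that wiring, with a fake object in place of `rebel.RBLNCompiledModel`:

```python
# Toy stand-in for the two-runtimes-from-one-compiled-model pattern in the diff.
# FakeCompiledModel only mimics the create_runtime call shown above; it is not
# the real rebel API object.
from typing import Dict, List


class FakeCompiledModel:
    def create_runtime(self, input_info_index: int, tensor_type: str = "pt", device: int = 0) -> str:
        return f"<runtime entry={input_info_index} device={device}>"


def create_runtimes(compiled_models: List[FakeCompiledModel], rbln_device_map: Dict[str, int]) -> List[str]:
    device_val = rbln_device_map["compiled_model"]  # stand-in for DEFAULT_COMPILED_MODEL_NAME
    return [
        compiled_models[0].create_runtime(input_info_index=0, tensor_type="pt", device=device_val),
        compiled_models[0].create_runtime(input_info_index=1, tensor_type="pt", device=device_val),
    ]


runtimes = create_runtimes([FakeCompiledModel()], {"compiled_model": 0})
prefill_decoder, decoder = runtimes[0], runtimes[1]  # cf. self.model[0] / self.model[1]
print(prefill_decoder, decoder)
```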