PyPI - optimum-rbln - Versions diffs - 0.1.15__py3-none-any.whl → 0.2.1a0__py3-none-any.whl - Mend

optimum-rbln 0.1.15__py3-none-any.whl → 0.2.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (80) hide show

optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py CHANGED Viewed

@@ -26,13 +26,14 @@ import logging
 from abc import ABC
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
-import rebel  # noqa: F401
-import torch  # noqa: F401
+import rebel
+import torch
+from rebel.compile_context import CompileContext
 from transformers import AutoModelForSeq2SeqLM, GenerationConfig, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
 from ....modeling import RBLNModel
-from ....modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNCompileConfig, RBLNConfig
+from ....modeling_config import RBLNCompileConfig, RBLNConfig
 from ....utils.runtime_utils import RBLNPytorchRuntime
@@ -66,7 +67,7 @@ class RBLNRuntimeDecoder(RBLNPytorchRuntime):
 class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
     """
     This is a generic model class that will be instantiated as one of the model classes of the library (with a sequence-to-sequence language modeling head) when created with the from_pretrained() class method.
-    This model inherits from [`RBLNBaseModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    This model inherits from [`RBLNModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
     A class to convert and run pre-trained transformers based Seq2SeqLM models on RBLN devices.
     It implements the methods to convert a pre-trained transformers Seq2SeqLM model into a RBLN transformer model by:
@@ -88,49 +89,42 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
     def get_compiled_model(cls, model: PreTrainedModel, rbln_config: RBLNConfig):
         wrapped_model = cls.wrap_model_if_needed(model, rbln_config)
-        wrapped_model.encoder.encoder_max_length = rbln_config.model_cfg["enc_max_seq_len"]
-        wrapped_model.encoder.decoder_max_length = rbln_config.model_cfg["dec_max_seq_len"]
+        enc_compile_config = rbln_config.compile_cfgs[0]
+        dec_compile_config = rbln_config.compile_cfgs[1]
-        wrapped_model.decoder.encoder_max_length = rbln_config.model_cfg["enc_max_seq_len"]
-        wrapped_model.decoder.decoder_max_length = rbln_config.model_cfg["dec_max_seq_len"]
+        context = CompileContext(use_weight_sharing=False)
-        enc_rbln_compile_config = rbln_config.compile_cfgs[0]
-        dec_rbln_compile_config = rbln_config.compile_cfgs[1]
+        enc_example_inputs = enc_compile_config.get_dummy_inputs(fill=0)
-        enc_example_inputs = enc_rbln_compile_config.get_dummy_inputs(fill=0)
-        dec_example_inputs = dec_rbln_compile_config.get_dummy_inputs(fill=0)
+        # Mark encoder's static tensors (cross kv states)
+        static_tensors = {}
+        for (name, _, _), tensor in zip(enc_compile_config.input_info, enc_example_inputs):
+            if "key_value_states" in name:
+                static_tensors[name] = tensor
+                context.mark_static_address(tensor)
-        enc_example_inputs[3].fill_(0)
-        dec_example_inputs[4].fill_(-1)
+        dec_example_inputs = dec_compile_config.get_dummy_inputs(fill=0, static_tensors=static_tensors)
-        enc_scripted_model = torch.jit.trace(wrapped_model.encoder, enc_example_inputs, check_trace=False)
-        dec_scripted_model = torch.jit.trace(wrapped_model.decoder, dec_example_inputs, check_trace=False)
+        # Mark decoder's static tensors (self kv states)
+        for (name, _, _), tensor in zip(dec_compile_config.input_info, dec_example_inputs):
+            if "key_value_states" in name:
+                context.mark_static_address(tensor)
-        enc_ir = rebel.torchscript_to_ir(
-            enc_scripted_model,
-            input_names=[v[0] for v in enc_rbln_compile_config.input_info],
-            name=enc_rbln_compile_config.mod_name,
+        compiled_encoder = super().compile(
+            wrapped_model.encoder,
+            enc_compile_config,
+            example_inputs=enc_example_inputs,
+            compile_context=context,
         )
-        dec_ir = rebel.torchscript_to_ir(
-            dec_scripted_model,
-            input_names=[v[0] for v in dec_rbln_compile_config.input_info],
-            name=dec_rbln_compile_config.mod_name,
-        )
-        connections = [
-            (enc_ir.outputs[0], enc_ir.inputs[2], dec_ir.inputs[6]),
-            (dec_ir.outputs[1], dec_ir.inputs[5]),
-        ]
-        compiled_model = rebel.compile(
-            enc_ir,
-            dec_ir,
-            connections=connections,
-            fusion=enc_rbln_compile_config.fusion,
-            npu=enc_rbln_compile_config.npu,
-            tensor_parallel_size=enc_rbln_compile_config.tensor_parallel_size,
+        compiled_decoder = super().compile(
+            wrapped_model.decoder,
+            dec_compile_config,
+            example_inputs=dec_example_inputs,
+            compile_context=context,
         )
-        return compiled_model
+        return {"encoder": compiled_encoder, "decoder": compiled_decoder}
     @classmethod
     def _get_rbln_config(
@@ -204,7 +198,7 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
                 ],
                 "float32",
             ),
-            ("batch_idx", [], "int32"),
+            ("batch_position", [], "int16"),
         ]
         dec_input_info = [
@@ -216,17 +210,16 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
                 [rbln_batch_size, 1],
                 "int32",
             ),
-            ("batch_position", [], "int32"),
         ]
         dec_input_info.extend(
             [
                 (
-                    "self_key_value_states",
+                    "cross_key_value_states",
                     [
                         n_layer * 2,
                         rbln_batch_size,
                         n_head,
-                        rbln_dec_max_seq_len,
+                        rbln_enc_max_seq_len,
                         d_kv,
                     ],
                     "float32",
@@ -236,24 +229,24 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
         dec_input_info.extend(
             [
                 (
-                    "cross_key_value_states",
+                    f"self_key_value_states_{i}",
                     [
-                        n_layer * 2,
                         rbln_batch_size,
                         n_head,
-                        rbln_enc_max_seq_len,
+                        rbln_dec_max_seq_len,
                         d_kv,
                     ],
                     "float32",
                 )
+                for i in range(n_layer * 2)
             ]
         )
-        enc_rbln_compile_config = RBLNCompileConfig(mod_name="encoder", input_info=enc_input_info)
-        dec_rbln_compile_config = RBLNCompileConfig(mod_name="decoder", input_info=dec_input_info)
+        enc_compile_config = RBLNCompileConfig(compiled_model_name="encoder", input_info=enc_input_info)
+        dec_compile_config = RBLNCompileConfig(compiled_model_name="decoder", input_info=dec_input_info)
         rbln_config = RBLNConfig(
             rbln_cls=cls.__name__,
-            compile_cfgs=[enc_rbln_compile_config, dec_rbln_compile_config],
+            compile_cfgs=[enc_compile_config, dec_compile_config],
             rbln_kwargs=rbln_kwargs,
         )
@@ -270,12 +263,21 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
     @classmethod
     def _create_runtimes(
-        cls, compiled_models: List[rebel.RBLNCompiledModel], rbln_device_map: Dict[str, int]
+        cls,
+        compiled_models: List[rebel.RBLNCompiledModel],
+        rbln_device_map: Dict[str, int],
+        activate_profiler: Optional[bool] = None,
     ) -> List[rebel.Runtime]:
-        device_val = rbln_device_map[DEFAULT_COMPILED_MODEL_NAME]
+        if any(model_name not in rbln_device_map for model_name in ["encoder", "decoder"]):
+            cls._raise_missing_compiled_file_error(["encoder", "decoder"])
         return [
-            compiled_models[0].create_runtime("encoder", tensor_type="pt", device=device_val),
-            compiled_models[0].create_runtime("decoder", tensor_type="pt", device=device_val),
+            compiled_models[0].create_runtime(
+                tensor_type="pt", device=rbln_device_map["encoder"], activate_profiler=activate_profiler
+            ),
+            compiled_models[1].create_runtime(
+                tensor_type="pt", device=rbln_device_map["decoder"], activate_profiler=activate_profiler
+            ),
         ]
     def can_generate(self):
@@ -340,9 +342,8 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
             attention_mask=dec_attention_mask,
             encoder_attention_mask=attention_mask,
             cache_position=cache_position,
-            batch_position=torch.tensor(0, dtype=torch.int32),
         )
-        lm_logits = decoder_output.logits[0]
+        lm_logits = decoder_output.logits
         return Seq2SeqLMOutput(logits=lm_logits)
@@ -381,15 +382,14 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
         )
         # 3. make sure that encoder returns `ModelOutput`
-        model_input_name = model_input_name if model_input_name is not None else self.main_input_name
         encoder_kwargs["return_dict"] = True
         encoder_kwargs["output_hidden_states"] = False
         encoder_kwargs["output_attentions"] = False
         for b in range(batch_size):
-            batch_idx = torch.tensor(b, dtype=torch.int32)
+            batch_position = torch.tensor(b, dtype=torch.int16)
             encoder_kwargs["input_ids"] = inputs_tensor[b].unsqueeze(0)
             encoder_kwargs["attention_mask"] = model_kwargs["attention_mask"][b].unsqueeze(0).to(torch.float32)
-            model_kwargs["encoder_outputs"] = encoder(**encoder_kwargs, batch_idx=batch_idx)
+            model_kwargs["encoder_outputs"] = encoder(**encoder_kwargs, batch_position=batch_position)
         return model_kwargs

optimum-rbln 0.1.15__py3-none-any.whl → 0.2.1a0__py3-none-any.whl

optimum-rbln 0.1.15__py3-none-any.whl → 0.2.1a0__py3-none-any.whl