optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.5a4__py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (157)
  1. optimum/rbln/__init__.py +48 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +50 -21
  4. optimum/rbln/diffusers/__init__.py +12 -0
  5. optimum/rbln/diffusers/configurations/__init__.py +3 -0
  6. optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
  7. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
  8. optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
  9. optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
  10. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
  11. optimum/rbln/diffusers/modeling_diffusers.py +1 -1
  12. optimum/rbln/diffusers/models/__init__.py +17 -3
  13. optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
  14. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
  15. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
  16. optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
  17. optimum/rbln/diffusers/models/controlnet.py +17 -2
  18. optimum/rbln/diffusers/models/transformers/prior_transformer.py +16 -2
  19. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +16 -1
  20. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +14 -1
  21. optimum/rbln/diffusers/models/unets/__init__.py +1 -0
  22. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +18 -2
  23. optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
  24. optimum/rbln/diffusers/pipelines/__init__.py +4 -0
  25. optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -2
  26. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
  27. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +13 -4
  28. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +13 -4
  29. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +13 -4
  30. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -4
  31. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +1 -1
  32. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
  33. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -2
  34. optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
  35. optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
  36. optimum/rbln/modeling.py +20 -45
  37. optimum/rbln/modeling_base.py +18 -14
  38. optimum/rbln/ops/__init__.py +1 -0
  39. optimum/rbln/ops/attn.py +10 -0
  40. optimum/rbln/ops/flash_attn.py +8 -0
  41. optimum/rbln/ops/moe.py +180 -0
  42. optimum/rbln/ops/sliding_window_attn.py +9 -0
  43. optimum/rbln/transformers/__init__.py +36 -0
  44. optimum/rbln/transformers/configuration_generic.py +0 -27
  45. optimum/rbln/transformers/modeling_attention_utils.py +156 -127
  46. optimum/rbln/transformers/modeling_generic.py +2 -61
  47. optimum/rbln/transformers/modeling_outputs.py +26 -0
  48. optimum/rbln/transformers/modeling_rope_utils.py +78 -42
  49. optimum/rbln/transformers/models/__init__.py +28 -0
  50. optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
  51. optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
  52. optimum/rbln/transformers/models/auto/auto_factory.py +1 -0
  53. optimum/rbln/transformers/models/bart/bart_architecture.py +24 -24
  54. optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
  55. optimum/rbln/transformers/models/bert/modeling_bert.py +86 -1
  56. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +42 -15
  57. optimum/rbln/transformers/models/clip/modeling_clip.py +40 -2
  58. optimum/rbln/transformers/models/colpali/colpali_architecture.py +14 -20
  59. optimum/rbln/transformers/models/colpali/configuration_colpali.py +12 -17
  60. optimum/rbln/transformers/models/colpali/modeling_colpali.py +66 -221
  61. optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +38 -23
  62. optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +107 -371
  63. optimum/rbln/transformers/models/decoderonly/__init__.py +2 -0
  64. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +128 -17
  65. optimum/rbln/transformers/models/decoderonly/configuration_lora.py +2 -2
  66. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +211 -89
  67. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +205 -64
  68. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +17 -9
  69. optimum/rbln/transformers/models/decoderonly/lora_architecture.py +1 -1
  70. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +194 -132
  71. optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +17 -0
  72. optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
  73. optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
  74. optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -36
  75. optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
  76. optimum/rbln/transformers/models/gemma2/__init__.py +16 -0
  77. optimum/rbln/transformers/models/gemma2/configuration_gemma2.py +45 -0
  78. optimum/rbln/transformers/models/gemma2/gemma2_architecture.py +83 -0
  79. optimum/rbln/transformers/models/gemma2/modeling_gemma2.py +101 -0
  80. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +23 -19
  81. optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +42 -70
  82. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +46 -31
  83. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +8 -34
  84. optimum/rbln/transformers/models/gpt_oss/__init__.py +16 -0
  85. optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +41 -0
  86. optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py +122 -0
  87. optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +165 -0
  88. optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +8 -5
  89. optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +7 -5
  90. optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +24 -9
  91. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -5
  92. optimum/rbln/transformers/models/llava/modeling_llava.py +37 -26
  93. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +3 -5
  94. optimum/rbln/transformers/models/midm/midm_architecture.py +29 -22
  95. optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -22
  96. optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
  97. optimum/rbln/transformers/models/opt/opt_architecture.py +1 -44
  98. optimum/rbln/transformers/models/paligemma/__init__.py +16 -0
  99. optimum/rbln/transformers/models/paligemma/configuration_paligemma.py +129 -0
  100. optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +564 -0
  101. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
  102. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +24 -24
  103. optimum/rbln/transformers/models/phi/phi_architecture.py +13 -21
  104. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +13 -1
  105. optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +2 -2
  106. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -28
  107. optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +6 -1
  108. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +11 -1
  109. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +278 -130
  110. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +43 -39
  111. optimum/rbln/transformers/models/qwen2_moe/__init__.py +16 -0
  112. optimum/rbln/transformers/models/qwen2_moe/configuration_qwen2_moe.py +38 -0
  113. optimum/rbln/transformers/models/qwen2_moe/modeling_qwen2_moe.py +68 -0
  114. optimum/rbln/transformers/models/qwen2_moe/qwen2_moe_architecture.py +94 -0
  115. optimum/rbln/transformers/models/qwen2_vl/__init__.py +6 -1
  116. optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +11 -1
  117. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +268 -111
  118. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +27 -35
  119. optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +0 -20
  120. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +7 -7
  121. optimum/rbln/transformers/models/qwen3_moe/__init__.py +16 -0
  122. optimum/rbln/transformers/models/qwen3_moe/configuration_qwen3_moe.py +38 -0
  123. optimum/rbln/transformers/models/qwen3_moe/modeling_qwen3_moe.py +68 -0
  124. optimum/rbln/transformers/models/qwen3_moe/qwen3_moe_architecture.py +100 -0
  125. optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
  126. optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
  127. optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
  128. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -4
  129. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +36 -12
  130. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +14 -12
  131. optimum/rbln/transformers/models/siglip/modeling_siglip.py +21 -19
  132. optimum/rbln/transformers/models/swin/configuration_swin.py +1 -6
  133. optimum/rbln/transformers/models/swin/modeling_swin.py +17 -4
  134. optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
  135. optimum/rbln/transformers/models/t5/t5_architecture.py +16 -17
  136. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +25 -10
  137. optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -3
  138. optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
  139. optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
  140. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +60 -8
  141. optimum/rbln/transformers/models/whisper/generation_whisper.py +48 -14
  142. optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
  143. optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -3
  144. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +53 -0
  145. optimum/rbln/transformers/utils/rbln_quantization.py +29 -12
  146. optimum/rbln/utils/deprecation.py +213 -0
  147. optimum/rbln/utils/hub.py +14 -3
  148. optimum/rbln/utils/import_utils.py +23 -2
  149. optimum/rbln/utils/runtime_utils.py +42 -6
  150. optimum/rbln/utils/submodule.py +27 -1
  151. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/METADATA +6 -6
  152. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/RECORD +155 -129
  153. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/WHEEL +1 -1
  154. optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +0 -233
  155. optimum/rbln/utils/depreacate_utils.py +0 -16
  156. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/entry_points.txt +0 -0
  157. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py
@@ -27,6 +27,7 @@ from transformers.modeling_utils import no_init_weights
 from transformers.models.qwen2_vl.modeling_qwen2_vl import (
     PatchEmbed,
     Qwen2VisionTransformerPretrainedModel,
+    Qwen2VLConfig,
     Qwen2VLModel,
     Qwen2VLRotaryEmbedding,
     VisionRotaryEmbedding,
@@ -35,7 +36,12 @@ from transformers.models.qwen2_vl.modeling_qwen2_vl import (
 from ....configuration_utils import RBLNCompileConfig
 from ....modeling import RBLNModel
 from ....utils.logging import get_logger
-from ..decoderonly.modeling_decoderonly import RBLNDecoderOnlyModelForCausalLM, RBLNDecoderOnlyOutput
+from ...modeling_outputs import _validate_output_hidden_states
+from ..decoderonly.modeling_decoderonly import (
+    RBLNDecoderOnlyModel,
+    RBLNDecoderOnlyModelForCausalLM,
+    RBLNDecoderOnlyOutput,
+)
 from .configuration_qwen2_vl import (
     RBLNQwen2VisionTransformerPretrainedModelConfig,
     RBLNQwen2VLForConditionalGenerationConfig,
@@ -56,6 +62,7 @@ if TYPE_CHECKING:

 class RBLNQwen2VisionTransformerPretrainedModel(RBLNModel):
     auto_model_class = None
+    _supports_non_fp32 = True

     def __post_init__(self, **kwargs):
         self.transformer = self.model[0]
@@ -89,10 +96,10 @@ class RBLNQwen2VisionTransformerPretrainedModel(RBLNModel):
         torch.save(save_dict, save_dir_path / subfolder / "torch_artifacts.pth")

     @classmethod
-    def wrap_model_if_needed(
+    def _wrap_model_if_needed(
         cls, model: "PreTrainedModel", rbln_config: RBLNQwen2VisionTransformerPretrainedModelConfig
     ):
-        return Qwen2VisionTransformerWrapper(model).eval()
+        return Qwen2VisionTransformerWrapper(model, rbln_config).eval()

     def __getattr__(self, __name: str) -> Any:
         def redirect(func):
@@ -112,24 +119,24 @@ class RBLNQwen2VisionTransformerPretrainedModel(RBLNModel):
         model_config: "PretrainedConfig" = None,
         rbln_config: Optional[RBLNQwen2VisionTransformerPretrainedModelConfig] = None,
     ) -> RBLNQwen2VisionTransformerPretrainedModelConfig:
-        hidden_size = getattr(model_config, "embed_dim")
-        num_heads = getattr(model_config, "num_heads")
+        hidden_size = model_config.embed_dim
+        num_heads = model_config.num_heads
         head_dim = hidden_size // num_heads

         input_infos = []
         for max_seq_len in rbln_config.max_seq_lens:
             input_info = [
-                ("hidden_states", [max_seq_len, hidden_size], "float32"),
-                ("full_attn_masks", [1, 1, max_seq_len, max_seq_len], "float32"),
+                ("hidden_states", [max_seq_len, hidden_size], rbln_config.dtype),
+                ("full_attn_masks", [1, 1, max_seq_len, max_seq_len], rbln_config.dtype),
                 (
                     "cos",
                     [1, 1, max_seq_len, head_dim],
-                    "float32",
+                    rbln_config.dtype,
                 ),
                 (
                     "sin",
                     [1, 1, max_seq_len, head_dim],
-                    "float32",
+                    rbln_config.dtype,
                 ),
             ]
             input_infos.append(input_info)
@@ -166,7 +173,7 @@ class RBLNQwen2VisionTransformerPretrainedModel(RBLNModel):
             1,
             max_seq_len,
             max_seq_len,
-            dtype=torch.float32,
+            dtype=hidden_state.dtype,
         )

         full_attn_masks[:, :, hidden_state.shape[0] : max_seq_len, :] = 0
@@ -177,10 +184,10 @@ class RBLNQwen2VisionTransformerPretrainedModel(RBLNModel):
         # Processes a batch of images (or frames) through the vision transformer.
         # Each image is handled independently for padding and attention mask generation.

-        hidden_states = self.patch_embed(hidden_states)
+        hidden_states = self.patch_embed(hidden_states).to(self.rbln_config.dtype)
         rotary_pos_emb = self.rot_pos_emb(grid_thw)
         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
-        position_embeddings = (emb.cos(), emb.sin())
+        position_embeddings = (emb.cos().to(self.rbln_config.dtype), emb.sin().to(self.rbln_config.dtype))

         cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
             dim=0,
@@ -200,10 +207,10 @@ class RBLNQwen2VisionTransformerPretrainedModel(RBLNModel):
             try:
                 cu_index = torch.searchsorted(self.max_seq_lens, cu_seq_len).item()
                 max_seq_len = self.max_seq_lens[cu_index]
-            except Exception:
+            except Exception as e:
                 raise ValueError(
                     f"Required seq_len({cu_seq_len}) is larger than available max_seq_lens({self.max_seq_lens.tolist()})."
-                )
+                ) from e

             # Padding for Full Attention Layers
             hidden_state_full_padded, cos_full_padded, sin_full_padded, full_attn_masks = (
@@ -230,64 +237,48 @@ class RBLNQwen2VisionTransformerPretrainedModel(RBLNModel):
         return hidden_states


-class RBLNQwen2VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
-    """
-    RBLNQwen2VLForConditionalGeneration is a multi-modal model that integrates vision and language processing capabilities,
-    optimized for RBLN NPUs. It is designed for conditional generation tasks that involve both image and text inputs.
-
-    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
-
-    Important Note:
-        This model includes a Large Language Model (LLM). For optimal performance, it is highly recommended to use
-        tensor parallelism for the language model. This can be achieved by using the `rbln_config` parameter in the
-        `from_pretrained` method. Refer to the `from_pretrained` documentation and the RBLNQwen2VLForConditionalGenerationConfig class for details.
-
-    Examples:
-        ```python
-        from optimum.rbln import RBLNQwen2VLForConditionalGeneration
-
-        model = RBLNQwen2VLForConditionalGeneration.from_pretrained(
-            "Qwen/Qwen2-VL-7B-Instruct",
-            export=True,
-            rbln_config={
-                "visual": {
-                    "max_seq_lens": 6400,
-                    "device": 0,
-                },
-                "tensor_parallel_size": 8,
-                "max_seq_len": 32_768,
-                "device": [0, 1, 2, 3, 4, 5, 6, 7],
-            },
-        )
-
-        model.save_pretrained("compiled-qwen2-vl-7b-instruct")
-        ```
-    """
-
+class RBLNQwen2VLModel(RBLNDecoderOnlyModel):
     auto_model_class = AutoModelForVision2Seq
+    _decoder_wrapper_cls = Qwen2VL_LanguageModelWrapper
+    _supports_non_fp32 = True
+    _use_rotary_emb = False
     _rbln_submodules = [
         {"name": "visual"},
     ]
-    _decoder_wrapper_cls = Qwen2VL_LanguageModelWrapper
-    _use_rotary_emb = False
+    _config_class = Qwen2VLConfig
+    _rotary_emb_class = Qwen2VLRotaryEmbedding
+    _get_rope_index_func = Qwen2VLModel.get_rope_index

     def __post_init__(self, **kwargs):
+        if hasattr(self.config, "embedding_dim"):
+            self.embedding_dim = self.config.embedding_dim
+
+        if not isinstance(self.config.text_config, PretrainedConfig):
+            self.config = self._config_class(
+                text_config=self.config.text_config, vision_config=self.config.vision_config
+            )
+
         super().__post_init__(**kwargs)
         self.visual = self.rbln_submodules[0]
-        self.mrope_section = self.config.rope_scaling["mrope_section"]
-        self.rotary_emb = Qwen2VLRotaryEmbedding(self.config)
-        self.rope_deltas = torch.zeros(self.rbln_config.batch_size)
-
-    def can_generate(self):
-        return True
+        self.rotary_emb = self._rotary_emb_class(self.config)
+        if not self.can_generate():
+            self.block_tables = torch.arange(self.rbln_config.kvcache_num_blocks, dtype=torch.int16)
+
+    @property
+    def logits_last_dim(self):
+        if self.can_generate():
+            return self.config.vocab_size
+        else:
+            return self.embedding_dim if hasattr(self, "embedding_dim") else self.config.hidden_size

-    @classmethod
-    def get_pytorch_model(cls, *args, **kwargs):
-        model = super().get_pytorch_model(*args, **kwargs)
-        model.model.lm_head = model.lm_head
-        model.lm_head = None
-        del model.lm_head
-        return model
+    def _create_embedding_layer(self):
+        with no_init_weights():
+            embed_tokens = torch.nn.Embedding(
+                self.config.text_config.vocab_size,
+                self.config.text_config.hidden_size,
+                self.config.text_config.pad_token_id,
+            )
+        return embed_tokens

     @classmethod
     def get_input_info(
@@ -304,52 +295,25 @@ class RBLNQwen2VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
            (
                "position_emb",
                [2, batch_size, 1, query_length, model_config.hidden_size // model_config.num_attention_heads],
-                "float32",
+                rbln_config.dtype,
            ),
        )

        return input_info

-    def prepare_inputs_for_generation(
-        self,
-        input_ids: torch.LongTensor,
-        generate_idx: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        pixel_values=None,
-        pixel_values_videos=None,
-        image_grid_thw=None,
-        video_grid_thw=None,
-        **kwargs,
-    ):
-        model_inputs = super().prepare_inputs_for_generation(
-            input_ids,
-            generate_idx,
-            attention_mask,
-            inputs_embeds,
-            **kwargs,
-        )
-
-        is_prefill_phase = generate_idx is None
-        if is_prefill_phase:
-            model_inputs.update({"input_ids": input_ids})
-
-        model_inputs.update(
-            {
-                "pixel_values": pixel_values,
-                "pixel_values_videos": pixel_values_videos,
-                "image_grid_thw": image_grid_thw,
-                "video_grid_thw": video_grid_thw,
-            }
-        )
-
-        return model_inputs
-
    def _get_position_embeddings(self, hidden_states, position_ids):
        cos, sin = self.rotary_emb(hidden_states, position_ids)
-        mrope_section = self.mrope_section * 2
-        cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(1)
-        sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(1)
+        mrope_section = self.config.rope_scaling["mrope_section"] * 2
+        cos = (
+            torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1)
+            .unsqueeze(1)
+            .to(self.rbln_config.dtype)
+        )
+        sin = (
+            torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1)
+            .unsqueeze(1)
+            .to(self.rbln_config.dtype)
+        )
        return torch.stack([cos, sin])

    def _preprocess_prefill(
@@ -362,7 +326,7 @@ class RBLNQwen2VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
        video_grid_thw: torch.LongTensor = None,
    ):
        batch_size = input_ids.shape[0]
-        inputs_embeds = self.embed_tokens(input_ids)
+        inputs_embeds = self.embed_tokens(input_ids).to(self.rbln_config.dtype)

        if pixel_values is not None:
            image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
@@ -397,7 +361,7 @@ class RBLNQwen2VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
        max_inputs_len = input_ids.shape[1]

        head_dim = getattr(self.config, "head_dim", None) or self.config.hidden_size // self.config.num_attention_heads
-        all_position_embeds = torch.zeros(2, batch_size, 1, max_inputs_len, head_dim)
+        all_position_embeds = torch.zeros(2, batch_size, 1, max_inputs_len, head_dim, dtype=self.rbln_config.dtype)
        all_rope_deltas = []

        image_token_id = self.config.image_token_id
@@ -411,8 +375,7 @@ class RBLNQwen2VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
            vision_tokens = input_id[0][vision_start_indices + 1]
            image_nums = (vision_tokens == image_token_id).sum()
            video_nums = (vision_tokens == video_token_id).sum()
-            position_ids, rope_deltas = Qwen2VLModel.get_rope_index(
-                self,
+            position_ids, rope_deltas = self._get_rope_index_func(
                input_id,
                image_grid_thw[image_idx : image_idx + image_nums] if image_grid_thw is not None else None,
                video_grid_thw[video_idx : video_idx + video_nums] if video_grid_thw is not None else None,
@@ -429,6 +392,177 @@ class RBLNQwen2VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):

        return inputs_embeds, all_position_embeds, rope_deltas

+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> RBLNDecoderOnlyOutput:
+        inputs_embeds, position_embed, rope_deltas = self._preprocess_prefill(
+            input_ids,
+            attention_mask,
+            pixel_values,
+            pixel_values_videos,
+            image_grid_thw,
+            video_grid_thw,
+        )
+
+        self.rope_deltas = rope_deltas
+        batch_size, seq_len = inputs_embeds.shape[:2]
+
+        output_hidden_states = _validate_output_hidden_states(output_hidden_states, self.rbln_config)
+
+        all_hidden_states = (
+            tuple(
+                torch.zeros(
+                    batch_size,
+                    seq_len,
+                    self.config.hidden_size,
+                    dtype=self.rbln_config.dtype,
+                )
+                for _ in range(self.config.num_hidden_layers + 1)
+            )
+            if output_hidden_states
+            else None
+        )
+
+        logits = []
+        for b_idx in range(batch_size):
+            query_length = attention_mask[b_idx].sum(dim=-1).int().item()
+            cache_position = torch.arange(query_length, dtype=torch.int32).unsqueeze(0)
+
+            outputs = self.prefill_decoder(
+                inputs_embeds=inputs_embeds[b_idx : b_idx + 1],
+                attention_mask=attention_mask[b_idx] if attention_mask is not None else None,
+                cache_position=cache_position,
+                batch_idx=b_idx,
+                position_embed=position_embed[:, b_idx : b_idx + 1],
+                block_tables=self.block_tables,
+            )
+
+            logits.append(outputs.logits)
+            if self.rbln_config.output_hidden_states:
+                for l_idx in range(self.config.num_hidden_layers + 1):
+                    all_hidden_states[l_idx][b_idx].copy_(outputs.hidden_states[l_idx][0])
+
+        logits = torch.cat(logits, dim=0)
+
+        if not return_dict:
+            return_value = logits if not output_hidden_states else (logits, all_hidden_states)
+            return return_value
+        else:
+            return (
+                RBLNDecoderOnlyOutput(logits=logits, hidden_states=all_hidden_states)
+                if output_hidden_states
+                else RBLNDecoderOnlyOutput(logits=logits)
+            )
+
+
+# MRO: RBLNQwen2VLForConditionalGeneration -> RBLNQwen2VLModel -> RBLNDecoderOnlyModelForCausalLM -> RBLNDecoderOnlyModel -> RBLNModel
+class RBLNQwen2VLForConditionalGeneration(RBLNQwen2VLModel, RBLNDecoderOnlyModelForCausalLM):
+    """
+    RBLNQwen2VLForConditionalGeneration is a multi-modal model that integrates vision and language processing capabilities,
+    optimized for RBLN NPUs. It is designed for conditional generation tasks that involve both image and text inputs.
+
+    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
+
+    Important Note:
+        This model includes a Large Language Model (LLM). For optimal performance, it is highly recommended to use
+        tensor parallelism for the language model. This can be achieved by using the `rbln_config` parameter in the
+        `from_pretrained` method. Refer to the `from_pretrained` documentation and the RBLNQwen2VLForConditionalGenerationConfig class for details.
+
+    Examples:
+        ```python
+        from optimum.rbln import RBLNQwen2VLForConditionalGeneration
+
+        model = RBLNQwen2VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2-VL-7B-Instruct",
+            export=True,
+            rbln_config={
+                "visual": {
+                    "max_seq_lens": 6400,
+                    "device": 0,
+                },
+                "tensor_parallel_size": 8,
+                "max_seq_len": 32_768,
+                "device": [0, 1, 2, 3, 4, 5, 6, 7],
+            },
+        )
+
+        model.save_pretrained("compiled-qwen2-vl-7b-instruct")
+        ```
+    """
+
+    auto_model_class = AutoModelForVision2Seq
+    _decoder_wrapper_cls = Qwen2VL_LanguageModelWrapper
+    _supports_non_fp32 = True
+    _use_rotary_emb = False
+    _rbln_submodules = [
+        {"name": "visual"},
+    ]
+
+    def __post_init__(self, **kwargs):
+        super().__post_init__(**kwargs)
+        self.rope_deltas = torch.zeros(self.rbln_config.batch_size)
+
+    def can_generate(self):
+        return True
+
+    @classmethod
+    def _reconstruct_model_if_needed(cls, model: "PreTrainedModel"):
+        model.model.lm_head = model.lm_head
+        return model
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        generate_idx: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        pixel_values=None,
+        pixel_values_videos=None,
+        image_grid_thw=None,
+        video_grid_thw=None,
+        **kwargs,
+    ):
+        model_inputs = {}
+        is_prefill_phase = generate_idx is None
+
+        if is_prefill_phase:
+            generate_idx = attention_mask.sum(dim=-1, keepdim=True).int()
+            cache_position = None
+            model_inputs.update({"input_ids": input_ids})
+        else:
+            if inputs_embeds is not None:
+                raise NotImplementedError("Specifying inputs_embeds in decoder phase is not supported.")
+
+            input_ids = input_ids[:, -1:]
+            cache_position = generate_idx
+            generate_idx = generate_idx + 1
+            model_inputs.update({"input_ids": input_ids})
+
+        model_inputs.update(
+            {
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "generate_idx": generate_idx,
+                "pixel_values": pixel_values,
+                "pixel_values_videos": pixel_values_videos,
+                "image_grid_thw": image_grid_thw,
+                "video_grid_thw": video_grid_thw,
+            }
+        )
+
+        return model_inputs
+
    def _preprocess_decoder(
        self,
        input_ids: torch.LongTensor = None,
@@ -439,14 +573,14 @@ class RBLNQwen2VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
                f"Cache position size mismatch: got {cache_position.shape[0]}, expected {self.rbln_config.batch_size}."
            )

-        inputs_embeds = self.embed_tokens(input_ids)
+        inputs_embeds = self.embed_tokens(input_ids).to(self.rbln_config.dtype)
        position_embeds = []
        for b_idx in range(self.rbln_config.batch_size):
            delta = cache_position[b_idx] + self.rope_deltas[b_idx]
            position_ids = torch.arange(1).view(1, -1)
            position_ids = position_ids.add(delta)
            position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
-            position_embed = self._get_position_embeddings(torch.zeros(1, dtype=torch.float32), position_ids)
+            position_embed = self._get_position_embeddings(torch.zeros(1, dtype=self.rbln_config.dtype), position_ids)
            position_embeds.append(position_embed)

        position_embeds = torch.cat(position_embeds, dim=1)
@@ -465,8 +599,10 @@ class RBLNQwen2VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
        cache_position: Optional[torch.LongTensor] = None,
        generate_idx: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
        **kwargs,
    ) -> RBLNDecoderOnlyOutput:
+        output_hidden_states = _validate_output_hidden_states(output_hidden_states, self.rbln_config)
        # Prefill
        if cache_position is None:
            inputs_embeds, position_embed, rope_deltas = self._preprocess_prefill(
@@ -478,8 +614,21 @@ class RBLNQwen2VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
                video_grid_thw,
            )

+            batch_size, seq_len = inputs_embeds.shape[:2]
+            all_hidden_states = (
+                tuple(
+                    torch.zeros(
+                        batch_size,
+                        seq_len,
+                        self.config.hidden_size,
+                        dtype=self.rbln_config.dtype,
+                    )
+                    for _ in range(self.config.num_hidden_layers + 1)
+                )
+                if output_hidden_states
+                else None
+            )
            self.rope_deltas = rope_deltas
-            batch_size = inputs_embeds.shape[0]

            logits = []
            for b_idx in range(batch_size):
@@ -493,8 +642,10 @@ class RBLNQwen2VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
                    position_embed=position_embed[:, b_idx : b_idx + 1],
                )
                logits.append(output.logits)
+                if self.rbln_config.output_hidden_states:
+                    for l_idx in range(self.config.num_hidden_layers + 1):
+                        all_hidden_states[l_idx][b_idx].copy_(output.hidden_states[l_idx][0])
            logits = torch.cat(logits, dim=0)
-
        # Decoder
        else:
            inputs_embeds, position_embed = self._preprocess_decoder(input_ids, cache_position)
@@ -504,11 +655,17 @@ class RBLNQwen2VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
                position_embed=position_embed,
            )
            logits = output.logits
+            all_hidden_states = output.hidden_states

        if not return_dict:
-            return logits, generate_idx
+            return_value = (
+                logits,
+                generate_idx if not output_hidden_states else (logits, generate_idx, all_hidden_states),
+            )
+            return return_value
        else:
            return RBLNDecoderOnlyOutput(
                logits=logits,
                generate_idx=generate_idx,
+                hidden_states=all_hidden_states,
            )
optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py
@@ -9,19 +9,24 @@ from ..decoderonly.decoderonly_architecture import (
     DecoderOnlyWrapper,
     apply_rotary_pos_emb,
 )
+from .configuration_qwen2_vl import RBLNQwen2VisionTransformerPretrainedModelConfig


 class Qwen2VisionTransformerWrapper(nn.Module):
-    def __init__(self, model: torch.nn.Module):
+    def __init__(self, model: torch.nn.Module, rbln_config: RBLNQwen2VisionTransformerPretrainedModelConfig):
         super().__init__()
-        self._original_mod = model
         self.merger = model.merger
-        self.blocks = self.wrap_vision_blocks(model.blocks)
+        self.rbln_config = rbln_config
+        self.blocks = self.wrap_vision_blocks(model.blocks, rbln_config)

-    def wrap_vision_blocks(self, blocks: torch.nn.ModuleList):
+    def wrap_vision_blocks(
+        self,
+        blocks: torch.nn.ModuleList,
+        rbln_config: RBLNQwen2VisionTransformerPretrainedModelConfig,
+    ):
         wrapped_blocks = []
-        for i, block in enumerate(blocks):
-            wrapped_blocks.append(Qwen2VLVisionBlock(block))
+        for _, block in enumerate(blocks):
+            wrapped_blocks.append(Qwen2VLVisionBlock(block, rbln_config))
         return nn.ModuleList(wrapped_blocks)

     def forward(
31
36
  cos: torch.Tensor,
32
37
  sin: torch.Tensor,
33
38
  ):
34
- full_attn_masks = (1 - full_attn_masks) * torch.finfo(torch.float32).min
39
+ full_attn_masks = (1.0 - full_attn_masks) * torch.finfo(hidden_states.dtype).min
35
40
 
36
41
  for block in self.blocks:
37
42
  hidden_states = block(hidden_states, full_attn_masks, [cos, sin])
@@ -40,13 +45,13 @@


 class Qwen2VLVisionBlock(torch.nn.Module):
-    def __init__(self, model: torch.nn.Module):
+    def __init__(self, model: torch.nn.Module, rbln_config: RBLNQwen2VisionTransformerPretrainedModelConfig):
         super().__init__()
         self._origin_model = model
+        self.rbln_config = rbln_config
         self.norm1 = model.norm1
         self.norm2 = model.norm2
-
-        self.attn = VisionAttention(model.attn)
+        self.attn = VisionAttention(model.attn, rbln_config)
         self.mlp = model.mlp

     def forward(
@@ -65,13 +70,15 @@


 class VisionAttention(nn.Module):
-    def __init__(self, model: nn.Module) -> None:
+    def __init__(self, model: nn.Module, rbln_config: RBLNQwen2VisionTransformerPretrainedModelConfig) -> None:
         super().__init__()
         self._origin_model = model
+        self.rbln_config = rbln_config
         self.num_heads = model.num_heads
         self.head_dim = getattr(model, "head_dim", model.proj.in_features // model.num_heads)
         self.qkv = model.qkv
         self.proj = model.proj
+        self.scale = torch.tensor(1 / math.sqrt(self.head_dim), dtype=rbln_config.dtype)

     def forward(
         self,
@@ -88,9 +95,9 @@ class VisionAttention(nn.Module):
         cos, sin = position_embeddings
         q, k = apply_rotary_pos_emb(q, k, cos, sin)

-        attn_weights = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(self.head_dim)
+        attn_weights = torch.matmul(q, k.transpose(2, 3)) * self.scale
         attn_weights = attn_weights + attn_masks
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32)
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
         attn_output = torch.matmul(attn_weights, v)
         attn_output = attn_output.transpose(1, 2)
         attn_output = attn_output.reshape(1, seq_length, -1)
@@ -100,6 +107,12 @@


 class Qwen2VL_LanguageModelWrapper(DecoderOnlyWrapper):
+    def get_decoder_layers(self, model: PreTrainedModel):
+        return model.model.language_model.layers if hasattr(model, "model") else model.language_model.layers
+
+    def get_model_layer(self, model: PreTrainedModel):
+        return model.model.language_model if hasattr(model, "model") else model.language_model
+
     def prepare_forward_args(self, *args):
         args = list(args)
         input_ids = None if self.rbln_config.use_inputs_embeds else args.pop(0)
@@ -108,7 +121,7 @@ class Qwen2VL_LanguageModelWrapper(DecoderOnlyWrapper):
         global_block_tables = args.pop(0)
         local_block_tables = None
         position_embeds = args.pop(0)
-        query_position = args.pop(0) if self.phase == "prefill" else None
+        query_position = args.pop(0) if self.phase == "prefill" and self.rbln_config.logits_to_keep > 0 else None
         position_ids = None
         attention_mask = args.pop(0) if self.rbln_config.use_attention_mask else None
         lora_int_id = args.pop(0) if self.rbln_config.lora_config else None
@@ -142,24 +155,3 @@
             past_key_values,
             position_embeds,
         )
-
-    def convert_to_rbln_class(self, model: PreTrainedModel, max_seq_len: int):
-        new_layers = []
-
-        for layer_idx, layer in enumerate(model.model.language_model.layers):
-            is_sliding = layer_idx in self.rbln_config.sliding_window_layers
-            new_self_attn = self.get_rbln_attn_class()(
-                self.get_attn_layer(layer), self.rbln_config, is_sliding=is_sliding
-            )
-            new_layer = self.get_rbln_layer_class()(layer, new_self_attn)
-            new_layers.append(new_layer)
-
-        new_model = self.get_rbln_model_class()(
-            model.model.language_model,
-            new_layers,
-            self.rbln_config,
-            use_learned_pos_emb=self.__class__._use_learned_pos_emb,
-        )
-
-        new_model = self.get_rbln_causal_lm_class()(model.model, new_model)
-        return new_model