PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/models/video_llava/modeling_video_llava.py CHANGED Viewed

@@ -23,10 +23,11 @@ from ...activations import ACT2FN
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
-from ...modeling_outputs import ModelOutput
+from ...modeling_outputs import BaseModelOutputWithPooling, ModelOutput
 from ...modeling_utils import PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils import TransformersKwargs, auto_docstring, logging, torch_compilable_check
+from ...utils.generic import check_model_inputs
 from ..auto import AutoModel
 from .configuration_video_llava import VideoLlavaConfig
@@ -161,7 +162,6 @@ class VideoLlavaModel(VideoLlavaPreTrainedModel):
         self.multi_modal_projector = VideoLlavaMultiModalProjector(config)
         self.vocab_size = config.text_config.vocab_size
         self.language_model = AutoModel.from_config(config.text_config)
-        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
         self.post_init()
     def get_input_embeddings(self):
@@ -170,86 +170,82 @@ class VideoLlavaModel(VideoLlavaPreTrainedModel):
     def set_input_embeddings(self, value):
         self.language_model.set_input_embeddings(value)
+    @check_model_inputs(tie_last_hidden_states=False)
+    @auto_docstring(
+        custom_intro="Obtains image last hidden states from the vision tower and apply multimodal projection."
+    )
     def get_image_features(
         self,
         pixel_values_images: torch.FloatTensor,
         vision_feature_layer: int | list[int] | None = None,
         vision_feature_select_strategy: str | None = None,
-    ):
-        """
-        Obtains image last hidden states from the vision tower and apply multimodal projection.
-        Args:
-            pixel_values_images (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
-               The tensors corresponding to the input images.
-            vision_feature_layer (`Union[int, list[int]]`, *optional*):
-                The index of the layer to select the vision feature. If multiple indices are provided,
-                the vision feature of the corresponding indices will be concatenated to form the
-                vision features.
-            vision_feature_select_strategy (`str`, *optional*):
-                The feature selection strategy used to select the vision feature from the vision backbone.
-                Can be one of `"default"` or `"full"`
-        Returns:
-            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        output_hidden_states: bool | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        pixel_values_images (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
+            The tensors corresponding to the input images.
+        vision_feature_layer (`Union[int, list[int]]`, *optional*):
+            The index of the layer to select the vision feature. If multiple indices are provided,
+            the vision feature of the corresponding indices will be concatenated to form the
+            vision features.
+        vision_feature_select_strategy (`str`, *optional*):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`
         """
-        vision_feature_layer = (
-            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
-        )
-        vision_feature_select_strategy = (
-            vision_feature_select_strategy
-            if vision_feature_select_strategy is not None
-            else self.config.vision_feature_select_strategy
+        image_outputs = self.image_tower(
+            pixel_values_images,
+            output_hidden_states=True,  # Ignore arg on purpose
+            return_dict=True,
+            **kwargs,
         )
-        if vision_feature_select_strategy not in ["default", "full"]:
-            raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
-        image_outputs = self.image_tower(pixel_values_images, output_hidden_states=True)
         # If we have one vision feature layer, return the corresponding hidden states,
         # otherwise, select the hidden states of each feature layer and concatenate them
         if isinstance(vision_feature_layer, int):
-            image_outputs = image_outputs.hidden_states[vision_feature_layer]
+            selected_hidden_state = image_outputs.hidden_states[vision_feature_layer]
             if vision_feature_select_strategy == "default":
-                image_outputs = image_outputs[:, 1:]
+                selected_hidden_state = selected_hidden_state[:, 1:]
         else:
             hs_pool = [image_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
             # For default; crop CLS from each hidden state in the hidden state pool
             if vision_feature_select_strategy == "default":
                 hs_pool = [hs[:, 1:] for hs in hs_pool]
-            image_outputs = torch.cat(hs_pool, dim=-1)
+            selected_hidden_state = torch.cat(hs_pool, dim=-1)
-        image_features = self.multi_modal_projector(image_outputs)
+        image_features = self.multi_modal_projector(selected_hidden_state)
+        image_outputs.pooler_output = image_features
-        return image_features
+        return image_outputs
+    @check_model_inputs(tie_last_hidden_states=False)
+    @auto_docstring(
+        custom_intro="Obtains video last hidden states from the vision tower and apply multimodal projection."
+    )
     def get_video_features(
         self,
         pixel_values_videos: torch.FloatTensor,
         vision_feature_layer: int | list[int] | None = None,
-    ):
-        """
-        Obtains video last hidden states from the vision tower and apply multimodal projection.
-        Args:
-            pixel_values_videos (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`)
-               The tensors corresponding to the input videos.
-            vision_feature_layer (`Union[int, list[int]]`, *optional*):
-                The index of the layer to select the vision feature. If multiple indices are provided,
-                the vision feature of the corresponding indices will be concatenated to form the
-                vision features.
-        Returns:
-            video_features (`torch.Tensor`): Video feature tensor of shape `(num_videos * num_frames, image_length, embed_dim)`).
-            frames (`int`): Number of frames the videos have.
+        output_hidden_states: bool | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        pixel_values_videos (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`)
+            The tensors corresponding to the input videos.
+        vision_feature_layer (`Union[int, list[int]]`, *optional*):
+            The index of the layer to select the vision feature. If multiple indices are provided,
+            the vision feature of the corresponding indices will be concatenated to form the
+            vision features.
         """
-        vision_feature_layer = (
-            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
-        )
         batch_size_vid, num_frames, channels, height, width = pixel_values_videos.shape
         pixel_values = pixel_values_videos.reshape(batch_size_vid * num_frames, channels, height, width)
-        video_outputs = self.video_tower(pixel_values, output_hidden_states=True)
+        video_outputs = self.video_tower(
+            pixel_values,
+            output_hidden_states=True,  # Ignore arg on purpose
+            return_dict=True,
+            **kwargs,
+        )
         # If we have one vision feature layer, return the corresponding hidden states,
         # otherwise, select the hidden states of each feature layer and concatenate them
@@ -260,8 +256,9 @@ class VideoLlavaModel(VideoLlavaPreTrainedModel):
             video_features = torch.cat(hs_pool, dim=-1)
         video_features = self.multi_modal_projector(video_features)
+        video_outputs.pooler_output = video_features
-        return video_features, num_frames
+        return video_outputs
     def get_placeholder_mask(
         self,
@@ -289,21 +286,22 @@ class VideoLlavaModel(VideoLlavaPreTrainedModel):
         n_image_tokens = special_image_mask.sum()
         special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
-        if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
-            raise ValueError(
-                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0] * image_features.shape[1]}"
+        if image_features is not None:
+            torch_compilable_check(
+                inputs_embeds[special_image_mask].numel() == image_features.numel(),
+                f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {image_features.shape[0] * image_features.shape[1]}",
             )
         n_video_tokens = special_video_mask.sum()
         special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
-        if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel():
-            raise ValueError(
-                f"Videos features and image tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0] * video_features.shape[1]}"
+        if video_features is not None:
+            torch_compilable_check(
+                inputs_embeds[special_video_mask].numel() == video_features.numel(),
+                f"Video features and video tokens do not match, tokens: {n_video_tokens}, features: {video_features.shape[0] * video_features.shape[1]}",
             )
         return special_image_mask, special_video_mask
-    @can_return_tuple
+    @check_model_inputs(tie_last_hidden_states=False)
     @auto_docstring
     def forward(
         self,
@@ -334,14 +332,6 @@ class VideoLlavaModel(VideoLlavaPreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        vision_feature_layer = (
-            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
-        )
-        vision_feature_select_strategy = (
-            vision_feature_select_strategy
-            if vision_feature_select_strategy is not None
-            else self.config.vision_feature_select_strategy
-        )
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
@@ -354,7 +344,8 @@ class VideoLlavaModel(VideoLlavaPreTrainedModel):
                 pixel_values_images,
                 vision_feature_layer=vision_feature_layer,
                 vision_feature_select_strategy=vision_feature_select_strategy,
-            )
+                return_dict=True,
+            ).pooler_output
             image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
             special_image_mask, _ = self.get_placeholder_mask(
                 input_ids, inputs_embeds=inputs_embeds, image_features=image_features
@@ -362,9 +353,9 @@ class VideoLlavaModel(VideoLlavaPreTrainedModel):
             inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
         if pixel_values_videos is not None:
-            video_features, num_frames = self.get_video_features(
-                pixel_values_videos=pixel_values_videos, vision_feature_layer=vision_feature_layer
-            )
+            video_features = self.get_video_features(
+                pixel_values_videos=pixel_values_videos, vision_feature_layer=vision_feature_layer, return_dict=True
+            ).pooler_output
             video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
             _, special_video_mask = self.get_placeholder_mask(
                 input_ids, inputs_embeds=inputs_embeds, video_features=video_features
@@ -424,19 +415,33 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi
     def get_output_embeddings(self) -> nn.Module:
         return self.lm_head
+    @auto_docstring
     def get_image_features(
         self,
         pixel_values_images: torch.FloatTensor,
         vision_feature_layer: int | list[int] | None = None,
         vision_feature_select_strategy: str | None = None,
-    ):
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        pixel_values_images (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
+            The tensors corresponding to the input images.
+        vision_feature_layer (`Union[int, list[int]]`, *optional*):
+            The index of the layer to select the vision feature. If multiple indices are provided,
+            the vision feature of the corresponding indices will be concatenated to form the
+            vision features.
+        vision_feature_select_strategy (`str`, *optional*):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`
+        """
         return self.model.get_image_features(
             pixel_values_images=pixel_values_images,
             vision_feature_layer=vision_feature_layer,
             vision_feature_select_strategy=vision_feature_select_strategy,
+            **kwargs,
         )
-    @can_return_tuple
+    @check_model_inputs(tie_last_hidden_states=False)
     @auto_docstring
     def forward(
         self,
@@ -472,13 +477,13 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi
         ```python
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> import numpy as np
         >>> import av
         >>> from huggingface_hub import hf_hub_download
         >>> from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration
         >>> def read_video_pyav(container, indices):
         ...     '''
         ...     Decode the video with PyAV decoder.
@@ -520,7 +525,8 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi
         >>> # to generate from image and video mix
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> prompt = [
         ...     "USER: <image>\nHow many cats do you see? ASSISTANT:",
         ...     "USER: <video>\nWhy is this video funny? ASSISTANT:"
@@ -539,14 +545,6 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        vision_feature_layer = (
-            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
-        )
-        vision_feature_select_strategy = (
-            vision_feature_select_strategy
-            if vision_feature_select_strategy is not None
-            else self.config.vision_feature_select_strategy
-        )
         outputs = self.model(
             input_ids=input_ids,

transformers/models/videomae/modeling_videomae.py CHANGED Viewed

@@ -245,9 +245,9 @@ class VideoMAESelfAttention(nn.Module):
         value_layer = values.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
         query_layer = queries.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         context_layer, attention_probs = attention_interface(
             self,
@@ -490,8 +490,7 @@ class VideoMAEDecoder(nn.Module):
         for layer_module in self.decoder_layers:
             hidden_states = layer_module(hidden_states)
-        if return_token_num > 0:
-            hidden_states = hidden_states[:, -return_token_num:]
+        hidden_states = hidden_states[:, -return_token_num:]
         # predictor projection
         hidden_states = self.norm(hidden_states)

transformers/models/vilt/configuration_vilt.py CHANGED Viewed

@@ -116,10 +116,13 @@ class ViltConfig(PreTrainedConfig):
         max_image_length=-1,
         tie_word_embeddings=True,
         num_images=-1,
+        pad_token_id=None,
         **kwargs,
     ):
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+        super().__init__(**kwargs)
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
         self.vocab_size = vocab_size
         self.type_vocab_size = type_vocab_size
         self.modality_type_vocab_size = modality_type_vocab_size

transformers/models/vilt/image_processing_vilt_fast.py CHANGED Viewed

@@ -16,7 +16,7 @@
 from typing import Optional
 import torch
-from torchvision.transforms.v2 import functional as F
+import torchvision.transforms.v2.functional as tvF
 from ...image_processing_utils import BatchFeature
 from ...image_processing_utils_fast import (
@@ -58,7 +58,7 @@ class ViltImageProcessorFast(BaseImageProcessorFast):
         images: list["torch.Tensor"],
         do_resize: bool,
         size: SizeDict,
-        interpolation: Optional["F.InterpolationMode"],
+        interpolation: Optional["tvF.InterpolationMode"],
         size_divisor: int | None,
         do_pad: bool,
         do_rescale: bool,
@@ -117,7 +117,7 @@ class ViltImageProcessorFast(BaseImageProcessorFast):
         self,
         images: "torch.Tensor",
         size: SizeDict,
-        interpolation: Optional["F.InterpolationMode"] = None,
+        interpolation: Optional["tvF.InterpolationMode"] = None,
         size_divisor: int | None = None,
     ) -> "torch.Tensor":
         """
@@ -126,7 +126,7 @@ class ViltImageProcessorFast(BaseImageProcessorFast):
         Args:
             images (`torch.Tensor`): Image or batch of images to resize.
             size (`dict[str, int]`): Size dictionary with shortest_edge key.
-            interpolation (`F.InterpolationMode`, *optional*): Interpolation method to use.
+            interpolation (`tvF.InterpolationMode`, *optional*): Interpolation method to use.
             size_divisor (`int`, *optional*): Value to ensure height/width are divisible by.
         Returns:
@@ -165,7 +165,7 @@ class ViltImageProcessorFast(BaseImageProcessorFast):
             new_widths = new_widths // size_divisor * size_divisor
         # Resize the image
-        return F.resize(images, [new_heights, new_widths], interpolation=interpolation)
+        return tvF.resize(images, [new_heights, new_widths], interpolation=interpolation)
     def _pad_batch(
         self,
@@ -205,7 +205,7 @@ class ViltImageProcessorFast(BaseImageProcessorFast):
                 padding_right = max_size[1] - original_size[1]
                 padding = [0, 0, padding_right, padding_bottom]
-                padded_images = F.pad(stacked_images, padding, fill=0)
+                padded_images = tvF.pad(stacked_images, padding, fill=0)
                 pixel_mask = mask_template.clone()
                 pixel_mask[: original_size[0], : original_size[1]].fill_(1)
                 pixel_masks = pixel_mask.unsqueeze(0).repeat(stacked_images.shape[0], 1, 1)

transformers/models/vilt/modeling_vilt.py CHANGED Viewed

@@ -575,11 +575,13 @@ class ViltModel(ViltPreTrainedModel):
         ```python
         >>> from transformers import ViltProcessor, ViltModel
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> # prepare image and text
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> text = "hello world"
         >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
@@ -729,13 +731,15 @@ class ViltForMaskedLM(ViltPreTrainedModel):
         ```python
         >>> from transformers import ViltProcessor, ViltForMaskedLM
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from PIL import Image
         >>> import re
         >>> import torch
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> text = "a bunch of [MASK] laying on a [MASK]."
         >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
@@ -897,11 +901,13 @@ class ViltForQuestionAnswering(ViltPreTrainedModel):
         ```python
         >>> from transformers import ViltProcessor, ViltForQuestionAnswering
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from PIL import Image
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> text = "How many cats are there?"
         >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
@@ -1000,11 +1006,13 @@ class ViltForImageAndTextRetrieval(ViltPreTrainedModel):
         ```python
         >>> from transformers import ViltProcessor, ViltForImageAndTextRetrieval
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from PIL import Image
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]
         >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-coco")
@@ -1104,18 +1112,25 @@ class ViltForImagesAndTextClassification(ViltPreTrainedModel):
         ```python
         >>> from transformers import ViltProcessor, ViltForImagesAndTextClassification
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from PIL import Image
-        >>> image1 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg", stream=True).raw)
-        >>> image2 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg", stream=True).raw)
+        >>> url_1 = "https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg"
+        >>> with httpx.stream("GET", url_1) as response:
+        ...     image_1 = Image.open(BytesIO(response.read()))
+        >>> url_2 = "https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg"
+        >>> with httpx.stream("GET", url_2) as response:
+        ...     image_2 = Image.open(BytesIO(response.read()))
         >>> text = "The left image contains twice the number of dogs as the right image."
         >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")
         >>> model = ViltForImagesAndTextClassification.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")
         >>> # prepare inputs
-        >>> encoding = processor([image1, image2], text, return_tensors="pt")
+        >>> encoding = processor([image_1, image_2], text, return_tensors="pt")
         >>> # forward pass
         >>> outputs = model(input_ids=encoding.input_ids, pixel_values=encoding.pixel_values.unsqueeze(0))

transformers/models/vipllava/configuration_vipllava.py CHANGED Viewed

@@ -46,6 +46,8 @@ class VipLlavaConfig(PreTrainedConfig):
             The vision feature layer, or list of layers to select the vision features from.
         image_seq_length (`int`, *optional*, defaults to 576):
             Sequence length of one image embedding.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
     Example:
@@ -83,6 +85,7 @@ class VipLlavaConfig(PreTrainedConfig):
         projector_layernorm_eps=1e-5,
         vision_feature_layers=[-2, -5, -8, -11, 6],
         image_seq_length=576,
+        tie_word_embeddings=False,
         **kwargs,
     ):
         self.image_token_index = image_token_index
@@ -91,6 +94,7 @@ class VipLlavaConfig(PreTrainedConfig):
         self.vision_feature_layers = vision_feature_layers
         self.image_seq_length = image_seq_length
         self.vision_config = vision_config
+        self.tie_word_embeddings = tie_word_embeddings
         if isinstance(self.vision_config, dict):
             vision_config["model_type"] = vision_config.get("model_type", "clip_vision_model")

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl