optimum-rbln 0.8.4a6__py3-none-any.whl → 0.8.4a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of optimum-rbln might be problematic.
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +41 -3
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +1 -1
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +1 -1
- optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +7 -2
- optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +7 -2
- optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +1 -1
- optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +1 -1
- optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +1 -1
- optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +3 -3
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +1 -1
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +1 -1
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +1 -1
- optimum/rbln/diffusers/modeling_diffusers.py +7 -3
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +31 -3
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +28 -3
- optimum/rbln/diffusers/models/autoencoders/vq_model.py +31 -3
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +1 -1
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +9 -1
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +9 -1
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +6 -3
- optimum/rbln/diffusers/pipelines/auto_pipeline.py +45 -8
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -1
- optimum/rbln/modeling.py +17 -13
- optimum/rbln/modeling_base.py +11 -9
- optimum/rbln/transformers/configuration_generic.py +3 -3
- optimum/rbln/transformers/modeling_generic.py +1 -0
- optimum/rbln/transformers/models/auto/auto_factory.py +67 -7
- optimum/rbln/transformers/models/auto/modeling_auto.py +31 -0
- optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +5 -6
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +1 -1
- optimum/rbln/transformers/models/clip/configuration_clip.py +7 -4
- optimum/rbln/transformers/models/clip/modeling_clip.py +23 -4
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +2 -2
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +38 -6
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +1 -1
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +23 -0
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +42 -4
- optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +17 -2
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +7 -8
- optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +12 -6
- optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +6 -2
- optimum/rbln/transformers/models/llava/configuration_llava.py +6 -2
- optimum/rbln/transformers/models/llava/modeling_llava.py +1 -0
- optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +2 -2
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +57 -78
- optimum/rbln/transformers/models/midm/modeling_midm.py +42 -4
- optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +1 -1
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +18 -3
- optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +2 -2
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +1 -1
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +2 -2
- optimum/rbln/transformers/models/siglip/configuration_siglip.py +1 -1
- optimum/rbln/transformers/models/swin/configuration_swin.py +1 -1
- optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +1 -1
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -0
- optimum/rbln/transformers/models/whisper/configuration_whisper.py +1 -1
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +1 -0
- {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a8.dist-info}/METADATA +1 -1
- {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a8.dist-info}/RECORD +66 -66
- {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a8.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a8.dist-info}/licenses/LICENSE +0 -0
@@ -14,7 +14,8 @@
 
 import bisect
 from pathlib import Path
-from
+from tempfile import TemporaryDirectory
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
 
 import torch
 from transformers import PretrainedConfig, PreTrainedModel
@@ -126,8 +127,8 @@ class RBLNColPaliForRetrieval(RBLNModel):
     The ColPali Model transformer for document retrieval using vision-language models.
     This model inherits from [`RBLNModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
 
-    A class to convert and run pre-trained transformers based ColPaliForRetrieval model on RBLN devices.
-    It implements the methods to convert a pre-trained transformers ColPaliForRetrieval model into a RBLN transformer model by:
+    A class to convert and run pre-trained transformers based `ColPaliForRetrieval` model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers `ColPaliForRetrieval` model into a RBLN transformer model by:
 
     - transferring the checkpoint weights of the original into an optimized RBLN graph,
     - compiling the resulting graph using the RBLN compiler.
@@ -263,11 +264,42 @@ class RBLNColPaliForRetrieval(RBLNModel):
         return rbln_config
 
     @classmethod
-    def from_model(
+    def from_model(
+        cls,
+        model: "PreTrainedModel",
+        config: Optional[PretrainedConfig] = None,
+        rbln_config: Optional[Union[RBLNModelConfig, Dict]] = None,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        subfolder: str = "",
+        **kwargs: Any,
+    ) -> "RBLNModel":
+        """
+        Converts and compiles a pre-trained HuggingFace library model into a RBLN model.
+        This method performs the actual model conversion and compilation process.
+
+        Args:
+            model (PreTrainedModel): The PyTorch model to be compiled.
+                The object must be an instance of the HuggingFace transformers PreTrainedModel class.
+            config (Optional[PretrainedConfig]): The configuration object associated with the model.
+            rbln_config (Optional[Union[RBLNModelConfig, Dict]]): Configuration for RBLN model compilation and runtime.
+                This can be provided as a dictionary or an instance of the model's configuration class (e.g., `RBLNLlamaForCausalLMConfig` for Llama models).
+                For detailed configuration options, see the specific model's configuration class documentation.
+            kwargs: Additional keyword arguments. Arguments with the prefix `rbln_` are passed to rbln_config, while the remaining arguments are passed to the HuggingFace library.
+
+        The method performs the following steps:
+
+        1. Compiles the PyTorch model into an optimized RBLN graph
+        2. Configures the model for the specified NPU device
+        3. Creates the necessary runtime objects if requested
+        4. Saves the compiled model and configurations
+
+        Returns:
+            (RBLNModel): A RBLN model instance ready for inference on RBLN NPU devices.
+        """
         if not hasattr(model, "vision_tower"):
             model.vision_tower = model.vlm.vision_tower
             del model.vlm.vision_tower
-        model = super().from_model(model,
+        model = super().from_model(model, config, rbln_config, model_save_dir, subfolder, **kwargs)
         return model
 
     @classmethod
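Per the file list above, this hunk appears to come from modeling_colpali.py. A minimal usage sketch of the new `from_model` signature follows; it is not part of the diff, and it assumes that `RBLNColPaliForRetrieval` is importable from `optimum.rbln` and that the `vidore/colpali-v1.2-hf` checkpoint id is merely an example.

# Hypothetical usage sketch, not library documentation.
from transformers import ColPaliForRetrieval
from optimum.rbln import RBLNColPaliForRetrieval

torch_model = ColPaliForRetrieval.from_pretrained("vidore/colpali-v1.2-hf")

# Convert and compile the eager PyTorch model into an RBLN model, then save the artifacts.
rbln_model = RBLNColPaliForRetrieval.from_model(torch_model)
rbln_model.save_pretrained("colpali-rbln")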
@@ -334,7 +366,7 @@ class RBLNColPaliForRetrieval(RBLNModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         **kwargs,
-    ) -> ColPaliForRetrievalOutput:
+    ) -> Union[Tuple, ColPaliForRetrievalOutput]:
         if pixel_values is not None:
             pixel_values = pixel_values.to(dtype=self.dtype)
 
@@ -104,7 +104,7 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
                 ["prefill", "decode"] if DecoderOnlyModelForCausalLM is used.
             logits_to_keep (Optional[int]): The number of logits to keep for the decoder. If set to 0, the decoder will keep all logits.
                 Defaults to 0 if DecoderOnlyModel is used, 1 if DecoderOnlyModelForCausalLM is used.
-
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
         Raises:
             ValueError: If `batch_size` is not a positive integer.
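As a quick illustration of how the documented options are passed in practice, here is a hedged sketch; the import path and the `RBLNLlamaForCausalLMConfig` class name are assumptions (any decoder-only config class from the library would do), and the values are illustrative only.

# Illustrative sketch, assuming RBLNLlamaForCausalLMConfig is exported from optimum.rbln.
from optimum.rbln import RBLNLlamaForCausalLMConfig

cfg = RBLNLlamaForCausalLMConfig(
    batch_size=1,        # must be a positive integer, else ValueError
    max_seq_len=8192,
    logits_to_keep=1,    # keep only the last-token logits for causal LM decoding
)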
@@ -86,3 +86,26 @@ class RBLNDecoderOnlyGenerationMixin(GenerationMixin):
         model_kwargs["generate_idx"] = outputs.generate_idx
         model_kwargs["padded_cache_lengths"] = outputs.padded_cache_lengths
         return model_kwargs
+
+    def generate(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        max_length: Optional[int] = None,
+        **kwargs,
+    ):
+        """
+        The generate function is utilized in its standard form as in the HuggingFace transformers library. User can use this function to generate text from the model.
+
+        Args:
+            input_ids: The input ids to the model.
+            attention_mask: The attention mask to the model.
+            max_length: The maximum length of the generated text.
+            kwargs: Additional arguments passed to the generate function. See the HuggingFace transformers documentation for more details.
+        """
+        if max_length is not None:
+            kwargs["max_length"] = max_length
+        if attention_mask is not None:
+            kwargs["attention_mask"] = attention_mask
+
+        return super().generate(input_ids, **kwargs)
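This hunk matches the +23/-0 entry for generation_decoderonly.py in the file list. A minimal, hedged usage sketch follows (not part of the diff); it assumes `RBLNLlamaForCausalLM` is exported from `optimum.rbln`, that a compiled model directory `./llama-rbln` already exists, and that the tokenizer id is only an example.

from transformers import AutoTokenizer
from optimum.rbln import RBLNLlamaForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = RBLNLlamaForCausalLM.from_pretrained("./llama-rbln", export=False)

inputs = tokenizer("Hello, my name is", return_tensors="pt")
# attention_mask and max_length are now explicit parameters of generate();
# internally both are folded back into kwargs before delegating to GenerationMixin.
output_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_length=64,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))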
@@ -14,11 +14,13 @@
 
 
 import inspect
-from
+from pathlib import Path
+from typing import Any, Callable, Dict, Optional, Union
 
 from transformers import AutoModelForCausalLM
 from transformers.generation.utils import GenerationMixin
 
+from ....configuration_utils import RBLNModelConfig
 from ....utils import logging
 from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
 from .exaone_architecture import ExaoneForCausalLMWrapper
@@ -92,9 +94,45 @@ class RBLNExaoneForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     _supports_cache_class = True
 
     @classmethod
-    def from_pretrained(
-
-
+    def from_pretrained(
+        cls,
+        model_id: Union[str, Path],
+        *,
+        export: Optional[bool] = None,
+        rbln_config: Optional[Union[Dict, RBLNModelConfig]] = None,
+        trust_remote_code: Optional[bool] = None,
+        **kwargs: Any,
+    ):
+        """
+        The `from_pretrained()` function is utilized in its standard form as in the HuggingFace transformers library.
+        User can use this function to load a pre-trained model from the HuggingFace library and convert it to a RBLN model to be run on RBLN NPUs.
+
+        Args:
+            model_id (Union[str, Path]): The model id of the pre-trained model to be loaded.
+                It can be downloaded from the HuggingFace model hub or a local path, or a model id of a compiled model using the RBLN Compiler.
+            export (Optional[bool]): A boolean flag to indicate whether the model should be compiled.
+                If None, it will be determined based on the existence of the compiled model files in the model_id.
+            rbln_config (Optional[Union[Dict, RBLNModelConfig]]): Configuration for RBLN model compilation and runtime.
+                This can be provided as a dictionary or an instance of the model's configuration class (e.g., `RBLNExaoneForCausalLMConfig` for EXAONE models).
+                For detailed configuration options, see the specific model's configuration class documentation.
+            trust_remote_code (bool): Whether or not to trust the remote code when loading a model from the Hub.
+            kwargs: Additional keyword arguments. Arguments with the prefix `rbln_` are passed to rbln_config, while the remaining arguments are passed to the HuggingFace library.
+
+        Returns:
+            (RBLNModel): A RBLN model instance ready for inference on RBLN NPU devices.
+        """
+
+        if trust_remote_code is not None:
+            kwargs["trust_remote_code"] = trust_remote_code
+        elif "trust_remote_code" not in kwargs:
+            kwargs["trust_remote_code"] = True
+
+        return super().from_pretrained(
+            model_id=model_id,
+            export=export,
+            rbln_config=rbln_config,
+            **kwargs,
+        )
 
     def __getattr__(self, __name: str) -> Any:
         def redirect(func):
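The practical effect of this overload is that `trust_remote_code` is pinned to True when the caller does not set it, which the EXAONE custom modeling code requires. A hedged sketch of the new signature in use (the checkpoint id and the rbln_config keys are illustrative assumptions):

from optimum.rbln import RBLNExaoneForCausalLM

# trust_remote_code is injected as True here because it is not supplied,
# per the elif branch in the hunk above.
model = RBLNExaoneForCausalLM.from_pretrained(
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
    export=True,
    rbln_config={"max_seq_len": 4096, "tensor_parallel_size": 4},  # illustrative keys
)
model.save_pretrained("exaone-rbln")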
@@ -27,6 +27,21 @@ class RBLNGemma3ForCausalLMConfig(RBLNDecoderOnlyModelForCausalLMConfig):
         image_prefill_chunk_size: Optional[int] = None,
         **kwargs: Any,
     ):
+        """
+        Args:
+            use_position_ids (Optional[bool]): Whether or not to use `position_ids`, which is indices of positions of each input sequence tokens in the position embeddings.
+            use_attention_mask (Optional[bool]): Whether or not to use `attention_mask` to to avoid performing attention on padding token indices.
+            prefill_chunk_size (Optional[int]): The chunk size used during the prefill phase for
+                processing input sequences. Defaults to 256. Must be a positive integer
+                divisible by 64. Affects prefill performance and memory usage.
+            image_prefill_chunk_size (Optional[int]): The chunk size used during the prefill phase for
+                processing images. This config is used when `use_image_prefill` is True.
+                Currently, the `prefill_chunk_size` and `image_prefill_chunk_size` should be the same value.
+            kwargs: Additional arguments passed to the parent `RBLNDecoderOnlyModelForCausalLMConfig`.
+
+        Raises:
+            ValueError: If `use_attention_mask` or `use_position_ids` are False.
+        """
         # use_attention_mask and use_position_ids are always True for Gemma3
         use_attention_mask = use_attention_mask or True
         use_position_ids = use_position_ids or True
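A hedged construction sketch of the config documented above (the import path is an assumption; the keyword names follow the documented parameters):

from optimum.rbln import RBLNGemma3ForCausalLMConfig

cfg = RBLNGemma3ForCausalLMConfig(
    prefill_chunk_size=256,         # positive integer divisible by 64
    image_prefill_chunk_size=256,   # currently must match prefill_chunk_size
)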
@@ -64,10 +79,10 @@ class RBLNGemma3ForConditionalGenerationConfig(RBLNModelConfig):
             batch_size (Optional[int]): The batch size for inference. Defaults to 1.
             vision_tower (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
             language_model (Optional[RBLNModelConfig]): Configuration for the language model component.
-
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
         Raises:
-            ValueError: If batch_size is not a positive integer.
+            ValueError: If `batch_size` is not a positive integer.
         """
         super().__init__(**kwargs)
         self.batch_size = batch_size or 1
@@ -201,16 +201,15 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel):
         return model_kwargs
 
     def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
-
-        Projects the last hidden state from the vision model into language model space.
+        # Projects the last hidden state from the vision model into language model space.
 
-        Args:
-
-
+        # Args:
+        # pixel_values: (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
+        # The tensors corresponding to the input images.
+
+        # Returns:
+        # Image feature tensor of shape `(num_images, image_length, embed_dim)`.
 
-        Returns:
-            Image feature tensor of shape `(num_images, image_length, embed_dim)`.
-        """
         vision_outputs = self.vision_tower(pixel_values).last_hidden_state
         image_features = self.multi_modal_projector(vision_outputs)
         return image_features
@@ -32,14 +32,20 @@ class RBLNGroundingDinoForObjectDetectionConfig(RBLNImageModelConfig):
         decoder: Optional["RBLNGroundingDinoDecoderConfig"] = None,
         text_backbone: Optional["RBLNModelConfig"] = None,
         backbone: Optional["RBLNModelConfig"] = None,
-        output_attentions: Optional[bool] =
-        output_hidden_states: Optional[bool] =
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
         **kwargs: Any,
     ):
         """
         Args:
-            batch_size (Optional[int]): The batch size for text processing. Defaults to 1.
-
+            batch_size (Optional[int]): The batch size for image and text processing. Defaults to 1.
+            encoder (Optional["RBLNModelConfig"]): The encoder configuration. Defaults to None.
+            decoder (Optional["RBLNModelConfig"]): The decoder configuration. Defaults to None.
+            text_backbone (Optional["RBLNModelConfig"]): The text backbone configuration. Defaults to None.
+            backbone (Optional["RBLNModelConfig"]): The backbone configuration. Defaults to None.
+            output_attentions (Optional[bool]): Whether to output attentions. Defaults to None.
+            output_hidden_states (Optional[bool]): Whether to output hidden states. Defaults to None.
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
         Raises:
             ValueError: If batch_size is not a positive integer.
@@ -49,8 +55,8 @@ class RBLNGroundingDinoForObjectDetectionConfig(RBLNImageModelConfig):
         self.decoder = decoder
         self.text_backbone = text_backbone
         self.backbone = backbone
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
+        self.output_attentions = output_attentions if output_attentions is not None else False
+        self.output_hidden_states = output_hidden_states if output_hidden_states is not None else False
 
         if not isinstance(self.batch_size, int) or self.batch_size < 0:
             raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
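The net effect of this pair of hunks is that the two flags now default to None in the signature and are normalized to False during initialization. A generic sketch of the pattern, for readers skimming the diff (this is not the library code):

def resolve_flag(value):
    # An unset flag (None) now resolves to False instead of being stored as None.
    return value if value is not None else False

assert resolve_flag(None) is False
assert resolve_flag(True) is True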
@@ -45,11 +45,15 @@ class RBLNIdefics3ForConditionalGenerationConfig(RBLNModelConfig):
         Args:
             batch_size (Optional[int]): The batch size for inference. Defaults to 1.
             vision_model (Optional[RBLNModelConfig]): Configuration for the vision transformer component.
+                This can include settings specific to the vision encoder, such as input resolution or other vision-related parameters.
+                If not provided, default settings will be used.
             text_model (Optional[RBLNModelConfig]): Configuration for the text model component.
-
+                This can include settings specific to the language model, such as tensor parallelism or other text-related parameters.
+                If not provided, default settings will be used.
+            kwargs: Additional arguments passed to the parent `RBLNModelConfig`.
 
         Raises:
-            ValueError: If batch_size is not a positive integer.
+            ValueError: If `batch_size` is not a positive integer.
         """
 
         super().__init__(**kwargs)
@@ -39,11 +39,15 @@ class RBLNLlavaForConditionalGenerationConfig(RBLNModelConfig):
         Args:
             batch_size (Optional[int]): The batch size for inference. Defaults to 1.
             vision_tower (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
+                This can include settings specific to the vision encoder, such as input resolution or other vision-related parameters.
+                If not provided, default settings will be used.
             language_model (Optional[RBLNModelConfig]): Configuration for the language model component.
-
+                This can include settings specific to the language model, such as tensor parallelism or other text-related parameters.
+                If not provided, default settings will be used.
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
         Raises:
-            ValueError: If batch_size is not a positive integer.
+            ValueError: If `batch_size` is not a positive integer.
         """
         super().__init__(**kwargs)
         self.batch_size = batch_size or 1
@@ -105,6 +105,7 @@ class RBLNLlavaForConditionalGeneration(RBLNModel):
     RBLNLlavaForConditionalGeneration is a multi-modal model that combines vision and language processing capabilities,
     optimized for RBLN NPUs. It is designed for conditional generation tasks that involve both image and text inputs.
     This model inherits from [`RBLNModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+
     Important Note:
         This model includes a Large Language Model (LLM) as a submodule. For optimal performance, it is highly recommended to use
         tensor parallelism for the language model. This can be achieved by using the `rbln_config` parameter in the
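Since the class docstring points at `rbln_config` for enabling tensor parallelism on the language submodule, here is a hedged sketch of what that might look like; the import path, the checkpoint id, and the nested keys are assumptions based on the config classes shown elsewhere in this diff, not verified API.

from optimum.rbln import RBLNLlavaForConditionalGeneration

model = RBLNLlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",   # example checkpoint id
    export=True,
    rbln_config={
        "language_model": {"tensor_parallel_size": 4},  # illustrative nested setting
    },
)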
@@ -45,10 +45,10 @@ class RBLNLlavaNextForConditionalGenerationConfig(RBLNModelConfig):
             batch_size (Optional[int]): The batch size for inference. Defaults to 1.
             vision_tower (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
             language_model (Optional[RBLNModelConfig]): Configuration for the language model component.
-
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
         Raises:
-            ValueError: If batch_size is not a positive integer.
+            ValueError: If `batch_size` is not a positive integer.
         """
         super().__init__(**kwargs)
         self.batch_size = batch_size or 1
@@ -287,18 +287,15 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
         Obtains image last hidden states from the vision tower and apply multimodal projection.
 
         Args:
-            pixel_values (
-
-            image_sizes (
-
-
-                The index of the layer to select the vision feature.
-            vision_feature_select_strategy (`str`):
-                The feature selection strategy used to select the vision feature from the vision backbone.
+            pixel_values (torch.FloatTensor): The tensors corresponding to the input images
+                whose shape is `(batch_size, num_patches, channels, height, width)`.
+            image_sizes (torch.Tensor): Actual image size of each images (H, W).
+            vision_feature_layer (int): The index of the layer to select the vision feature.
+            vision_feature_select_strategy (str): The feature selection strategy used to select the vision feature from the vision backbone.
                 Can be one of `"default"` or `"full"`
         Returns:
-            image_features (List[
-
+            image_features (List[torch.Tensor]): List of image feature tensor, each contains all the visual feature of all patches
+                and are of shape `(num_patches, image_length, embed_dim)`).
         """
         # ! infer image_num_patches from image_sizes
         image_num_patches = [
@@ -412,23 +409,19 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
 
     # Almost copied from : https://github.com/huggingface/transformers/blob/6b550462139655d488d4c663086a63e98713c6b9/src/transformers/models/llava_next/modeling_llava_next.py
     def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
-
-
+        # Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
+
+        # Args:
+        # image_features (List[torch.Tensor]): List of image feature tensor, each contains all the visual feature of all patches.
+        # Its length is num_images, and each of shape is `(num_patches, image_length, embed_dim)`
+        # image_sizes (torch.Tensor): Actual image size of each images (H, W).
+        # vision_feature_select_strategy (str): The feature selection strategy used to select the vision feature from the vision backbone.
+        # image_newline (torch.Tensor): New line embedding vector whose shape is `embed_dim`.
+
+        # Returns:
+        # image_features (torch.Tensor): A torch.Tensor of shape `(all_feat_len, embed_dim)`)
+        # feature_lens (List[int]): A token length of each image in image_features
 
-        Args:
-            image_features (`List[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
-                List of image feature tensor, each contains all the visual feature of all patches.
-            image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
-                Actual image size of each images (H, W).
-            vision_feature_select_strategy (`str`)
-                The feature selection strategy used to select the vision feature from the vision backbone.
-            image_newline (`torch.Tensor` of shape `(embed_dim)`)
-                New line embedding vector.
-        Returns:
-            image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
-            feature_lens (`List[int]`)
-                token length of each image in image_features
-        """
         new_image_features = []
         feature_lens = []
         for image_idx, image_feature in enumerate(image_features):
@@ -478,21 +471,17 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
 
 # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
 def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
-
-
-
-
-
-
-
-
-
-
-
-
-    Returns:
-        tuple: The shape of the image patch grid in the format (width, height).
-    """
+    # Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
+
+    # Args:
+    # image_size (tuple): The size of the input image in the format (width, height).
+    # grid_pinpoints (list): A list containing possible resolutions.
+    # Each item in the list should be a tuple or list of the form `(height, width)`.
+    # patch_size (int): The size of each image patch.
+
+    # Returns:
+    # tuple: The shape of the image patch grid in the format (width, height).
+
     if not isinstance(grid_pinpoints, list):
         raise TypeError("grid_pinpoints should be a list of tuples or lists")
 
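For intuition, a tiny worked example of the grid-shape arithmetic these comments describe (this is not the library code): with a best-fit resolution of 672 x 672 and a patch size of 336, the patch grid is 2 x 2.

patch_size = 336
best_resolution = (672, 672)   # as returned by select_best_resolution
grid = tuple(side // patch_size for side in best_resolution)
assert grid == (2, 2)          # two patches along each axis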
@@ -510,18 +499,15 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
 
 # Almost copied from : https://github.com/huggingface/transformers/blob/1feebb5b4150882deabddd190a541f336f3be817/src/transformers/models/llava_next/modeling_llava_next.py#L115C1-L152C1
 def unpad_image(tensor, original_size):
-
-    Unpads a PyTorch tensor of a padded and resized image.
+    # Unpads a PyTorch tensor of a padded and resized image.
 
-    Args:
-
-
-
-
+    # Args:
+    # tensor (torch.Tensor): The image tensor, assumed to be of shape (num_channels, height, width).
+    # original_size (tuple): The original size of the image (height, width).
+
+    # Returns:
+    # (torch.Tensor): The unpadded image tensor.
 
-    Returns:
-        `torch.Tensor`: The unpadded image tensor.
-    """
     if not isinstance(original_size, (list, tuple)):
         if not isinstance(original_size, (torch.Tensor, np.ndarray)):
             raise TypeError(
@@ -550,22 +536,19 @@ def unpad_image(tensor, original_size):
 
 # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
 def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
-
-    Selects the best resolution from a list of possible resolutions based on the original size.
+    # Selects the best resolution from a list of possible resolutions based on the original size.
 
-    This is done by calculating the effective and wasted resolution for each possible resolution.
+    # This is done by calculating the effective and wasted resolution for each possible resolution.
 
-    The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
+    # The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
 
-    Args:
-
-
-
-
+    # Args:
+    # original_size (tuple): The original size of the image in the format (height, width).
+    # possible_resolutions (List(tuple)): A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
+
+    # Returns:
+    # (tuple): The best fit resolution in the format (height, width).
 
-    Returns:
-        tuple: The best fit resolution in the format (height, width).
-    """
     original_height, original_width = original_size
     best_fit = None
     max_effective_resolution = 0
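To make the "effective vs. wasted resolution" rule concrete, here is a simplified restatement of the selection logic with a worked example (for intuition only; it mirrors the upstream transformers helper rather than quoting this package's code):

def best_fit(original_size, possible_resolutions):
    # original_size and the candidates are (height, width) pairs.
    orig_h, orig_w = original_size
    best, best_eff, best_waste = None, 0, float("inf")
    for h, w in possible_resolutions:
        scale = min(w / orig_w, h / orig_h)
        down_w, down_h = int(orig_w * scale), int(orig_h * scale)
        effective = min(down_w * down_h, orig_w * orig_h)   # pixels actually useful
        wasted = h * w - effective                           # padding pixels
        if effective > best_eff or (effective == best_eff and wasted < best_waste):
            best, best_eff, best_waste = (h, w), effective, wasted
    return best

# A 400x300 image fits (672, 672) best among these candidates:
print(best_fit((400, 300), [(336, 336), (672, 672), (336, 672)]))  # -> (672, 672)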
@@ -589,21 +572,17 @@ def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
 
 # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
 def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
-
-
-
-
-
-
-
-
-
-
-
-
-    Returns:
-        int: the number of patches
-    """
+    # Calculate the number of patches after the preprocessing for images of any resolution.
+
+    # Args:
+    # image_size (Union[torch.LongTensor, np.ndarray, Tuple[int, int]): The size of the input image in the format (height, width).
+    # grid_pinpoints (list): A list containing possible resolutions.
+    # Each item in the list should be a tuple or list of the form `(height, width)`.
+    # patch_size (int): The size of each image patch.
+
+    # Returns:
+    # (int): the number of patches.
+
     if not isinstance(grid_pinpoints, list):
         raise TypeError("grid_pinpoints should be a list of tuples or lists")
 
@@ -13,11 +13,13 @@
 # limitations under the License.
 
 import inspect
-from
+from pathlib import Path
+from typing import Any, Callable, Dict, Optional, Union
 
 from transformers import AutoModelForCausalLM
 from transformers.generation.utils import GenerationMixin
 
+from ....configuration_utils import RBLNModelConfig
 from ....utils import logging
 from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
 from .midm_architecture import MidmLMHeadModelWrapper
@@ -91,9 +93,45 @@ class RBLNMidmLMHeadModel(RBLNDecoderOnlyModelForCausalLM):
     _supports_cache_class = True
 
     @classmethod
-    def from_pretrained(
-
-
+    def from_pretrained(
+        cls,
+        model_id: Union[str, Path],
+        *,
+        export: Optional[bool] = None,
+        rbln_config: Optional[Union[Dict, RBLNModelConfig]] = None,
+        trust_remote_code: Optional[bool] = None,
+        **kwargs: Any,
+    ):
+        """
+        The `from_pretrained()` function is utilized in its standard form as in the HuggingFace transformers library.
+        User can use this function to load a pre-trained model from the HuggingFace library and convert it to a RBLN model to be run on RBLN NPUs.
+
+        Args:
+            model_id (Union[str, Path]): The model id of the pre-trained model to be loaded.
+                It can be downloaded from the HuggingFace model hub or a local path, or a model id of a compiled model using the RBLN Compiler.
+            export (Optional[bool]): A boolean flag to indicate whether the model should be compiled.
+                If None, it will be determined based on the existence of the compiled model files in the model_id.
+            rbln_config (Optional[Union[Dict, RBLNModelConfig]]): Configuration for RBLN model compilation and runtime.
+                This can be provided as a dictionary or an instance of the model's configuration class (e.g., `RBLNMidmLMHeadModelConfig` for Mi:dm models).
+                For detailed configuration options, see the specific model's configuration class documentation.
+            trust_remote_code (bool): Whether or not to trust the remote code when loading a model from the Hub.
+            kwargs: Additional keyword arguments. Arguments with the prefix `rbln_` are passed to rbln_config, while the remaining arguments are passed to the HuggingFace library.
+
+        Returns:
+            (RBLNModel): A RBLN model instance ready for inference on RBLN NPU devices.
+        """
+
+        if trust_remote_code is not None:
+            kwargs["trust_remote_code"] = trust_remote_code
+        elif "trust_remote_code" not in kwargs:
+            kwargs["trust_remote_code"] = True
+
+        return super().from_pretrained(
+            model_id=model_id,
+            export=export,
+            rbln_config=rbln_config,
+            **kwargs,
+        )
 
     def __getattr__(self, __name: str) -> Any:
         def redirect(func):
@@ -29,7 +29,7 @@ class RBLNPixtralVisionModelConfig(RBLNModelConfig):
         Args:
             max_image_size (Tuple): The size of max input images. A tuple (max_height, max_width)
             batch_size (Optional[int]): The batch size for image processing. Defaults to 1.
-
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
         Raises:
             ValueError: If batch_size is not a positive integer.
@@ -31,10 +31,22 @@ class RBLNQwen2_5_VLForConditionalGenerationConfig(RBLNDecoderOnlyModelForCausal
 
     def __init__(
         self,
-        visual: Optional[RBLNModelConfig] = None,
         use_inputs_embeds: bool = True,
+        visual: Optional[RBLNModelConfig] = None,
         **kwargs: Any,
     ):
+        """
+        Args:
+            use_inputs_embeds (bool): Whether or not to use `inputs_embeds` as input. Defaults to `True`.
+            visual (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
+            kwargs: Additional arguments passed to the parent `RBLNDecoderOnlyModelForCausalLMConfig`.
+
+        Raises:
+            ValueError: If `use_inputs_embeds` is False.
+            ValueError: If the visual configuration is provided but contains invalid settings, such as an invalid max_seq_lens (e.g., not a positive integer, not a multiple of the window-based attention unit, or insufficient for the expected resolution).
+            ValueError: If visual is None and no default vision configuration can be inferred for the model architecture.
+            ValueError: If any inherited parameters violate constraints defined in the parent class, such as batch_size not being a positive integer, prefill_chunk_size not being divisible by 64, or max_seq_len not meeting requirements for Flash Attention.
+        """
         super().__init__(use_inputs_embeds=use_inputs_embeds, **kwargs)
         if not self.use_inputs_embeds:
             raise ValueError(
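A hedged construction sketch tying the two Qwen2.5-VL config classes in this file together (the import paths and the specific values are assumptions; 6144 is simply a multiple of the 64-token window unit mentioned in the next hunk):

from optimum.rbln import (
    RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
    RBLNQwen2_5_VLForConditionalGenerationConfig,
)

visual_cfg = RBLNQwen2_5_VisionTransformerPretrainedModelConfig(max_seq_lens=6144)
cfg = RBLNQwen2_5_VLForConditionalGenerationConfig(
    use_inputs_embeds=True,   # must remain True, otherwise ValueError
    visual=visual_cfg,
)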
@@ -66,10 +78,13 @@ class RBLNQwen2_5_VisionTransformerPretrainedModelConfig(RBLNModelConfig):
                 making 256 (64 * 4) valid. RBLN optimization runs inference per image or video
                 frame, so set `max_seq_len` to match the maximum expected resolution to reduce
                 computation. If not provided, a `ValueError` is raised.
-
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
         Raises:
-            ValueError: If
+            ValueError: If `max_seq_lens` is None or not provided.
+            ValueError: If `max_seq_lens` (or any value in the list) is not a positive integer.
+            ValueError: If `max_seq_lens` is not a multiple of (window_size / patch_size)^2 for window-based attention, or is insufficient for the expected image/video resolution.
+            ValueError: If `batch_size` (inherited from RBLNModelConfig) is not a positive integer.
 
         Max Seq Lens:
             Since `Qwen2_5_VLForConditionalGeneration` performs inference on a per-image or per-frame basis,