optimum-rbln 0.8.4a6__py3-none-any.whl → 0.8.4a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (66)
  1. optimum/rbln/__version__.py +2 -2
  2. optimum/rbln/configuration_utils.py +41 -3
  3. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +1 -1
  4. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +3 -3
  5. optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +1 -1
  6. optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +2 -2
  7. optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +7 -2
  8. optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +7 -2
  9. optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +1 -1
  10. optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +1 -1
  11. optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +2 -2
  12. optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +1 -1
  13. optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +3 -3
  14. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +1 -1
  15. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +1 -1
  16. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +1 -1
  17. optimum/rbln/diffusers/modeling_diffusers.py +7 -3
  18. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +31 -3
  19. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +28 -3
  20. optimum/rbln/diffusers/models/autoencoders/vq_model.py +31 -3
  21. optimum/rbln/diffusers/models/transformers/prior_transformer.py +1 -1
  22. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +9 -1
  23. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +9 -1
  24. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +6 -3
  25. optimum/rbln/diffusers/pipelines/auto_pipeline.py +45 -8
  26. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
  27. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -1
  28. optimum/rbln/modeling.py +17 -13
  29. optimum/rbln/modeling_base.py +11 -9
  30. optimum/rbln/transformers/configuration_generic.py +3 -3
  31. optimum/rbln/transformers/modeling_generic.py +1 -0
  32. optimum/rbln/transformers/models/auto/auto_factory.py +67 -7
  33. optimum/rbln/transformers/models/auto/modeling_auto.py +31 -0
  34. optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +5 -6
  35. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +1 -1
  36. optimum/rbln/transformers/models/clip/configuration_clip.py +7 -4
  37. optimum/rbln/transformers/models/clip/modeling_clip.py +23 -4
  38. optimum/rbln/transformers/models/colpali/configuration_colpali.py +2 -2
  39. optimum/rbln/transformers/models/colpali/modeling_colpali.py +38 -6
  40. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +1 -1
  41. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +23 -0
  42. optimum/rbln/transformers/models/exaone/modeling_exaone.py +42 -4
  43. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +17 -2
  44. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +7 -8
  45. optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +12 -6
  46. optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +6 -2
  47. optimum/rbln/transformers/models/llava/configuration_llava.py +6 -2
  48. optimum/rbln/transformers/models/llava/modeling_llava.py +1 -0
  49. optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +2 -2
  50. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +57 -78
  51. optimum/rbln/transformers/models/midm/modeling_midm.py +42 -4
  52. optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +1 -1
  53. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +18 -3
  54. optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +2 -2
  55. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +1 -1
  56. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +2 -2
  57. optimum/rbln/transformers/models/siglip/configuration_siglip.py +1 -1
  58. optimum/rbln/transformers/models/swin/configuration_swin.py +1 -1
  59. optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +1 -1
  60. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -0
  61. optimum/rbln/transformers/models/whisper/configuration_whisper.py +1 -1
  62. optimum/rbln/transformers/models/whisper/modeling_whisper.py +1 -0
  63. {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a8.dist-info}/METADATA +1 -1
  64. {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a8.dist-info}/RECORD +66 -66
  65. {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a8.dist-info}/WHEEL +0 -0
  66. {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a8.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/colpali/modeling_colpali.py
@@ -14,7 +14,8 @@
 
  import bisect
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Optional, Union
+ from tempfile import TemporaryDirectory
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
 
  import torch
  from transformers import PretrainedConfig, PreTrainedModel
@@ -126,8 +127,8 @@ class RBLNColPaliForRetrieval(RBLNModel):
  The ColPali Model transformer for document retrieval using vision-language models.
  This model inherits from [`RBLNModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
 
- A class to convert and run pre-trained transformers based ColPaliForRetrieval model on RBLN devices.
- It implements the methods to convert a pre-trained transformers ColPaliForRetrieval model into a RBLN transformer model by:
+ A class to convert and run pre-trained transformers based `ColPaliForRetrieval` model on RBLN devices.
+ It implements the methods to convert a pre-trained transformers `ColPaliForRetrieval` model into a RBLN transformer model by:
 
  - transferring the checkpoint weights of the original into an optimized RBLN graph,
  - compiling the resulting graph using the RBLN compiler.
@@ -263,11 +264,42 @@ class RBLNColPaliForRetrieval(RBLNModel):
  return rbln_config
 
  @classmethod
- def from_model(cls, model: "PreTrainedModel", *args, **kwargs):
+ def from_model(
+ cls,
+ model: "PreTrainedModel",
+ config: Optional[PretrainedConfig] = None,
+ rbln_config: Optional[Union[RBLNModelConfig, Dict]] = None,
+ model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+ subfolder: str = "",
+ **kwargs: Any,
+ ) -> "RBLNModel":
+ """
+ Converts and compiles a pre-trained HuggingFace library model into a RBLN model.
+ This method performs the actual model conversion and compilation process.
+
+ Args:
+ model (PreTrainedModel): The PyTorch model to be compiled.
+ The object must be an instance of the HuggingFace transformers PreTrainedModel class.
+ config (Optional[PretrainedConfig]): The configuration object associated with the model.
+ rbln_config (Optional[Union[RBLNModelConfig, Dict]]): Configuration for RBLN model compilation and runtime.
+ This can be provided as a dictionary or an instance of the model's configuration class (e.g., `RBLNLlamaForCausalLMConfig` for Llama models).
+ For detailed configuration options, see the specific model's configuration class documentation.
+ kwargs: Additional keyword arguments. Arguments with the prefix `rbln_` are passed to rbln_config, while the remaining arguments are passed to the HuggingFace library.
+
+ The method performs the following steps:
+
+ 1. Compiles the PyTorch model into an optimized RBLN graph
+ 2. Configures the model for the specified NPU device
+ 3. Creates the necessary runtime objects if requested
+ 4. Saves the compiled model and configurations
+
+ Returns:
+ (RBLNModel): A RBLN model instance ready for inference on RBLN NPU devices.
+ """
  if not hasattr(model, "vision_tower"):
  model.vision_tower = model.vlm.vision_tower
  del model.vlm.vision_tower
- model = super().from_model(model, *args, **kwargs)
+ model = super().from_model(model, config, rbln_config, model_save_dir, subfolder, **kwargs)
  return model
 
  @classmethod
@@ -334,7 +366,7 @@ class RBLNColPaliForRetrieval(RBLNModel):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  **kwargs,
- ) -> ColPaliForRetrievalOutput:
+ ) -> Union[Tuple, ColPaliForRetrievalOutput]:
  if pixel_values is not None:
  pixel_values = pixel_values.to(dtype=self.dtype)
 
optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py
@@ -104,7 +104,7 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
  ["prefill", "decode"] if DecoderOnlyModelForCausalLM is used.
  logits_to_keep (Optional[int]): The number of logits to keep for the decoder. If set to 0, the decoder will keep all logits.
  Defaults to 0 if DecoderOnlyModel is used, 1 if DecoderOnlyModelForCausalLM is used.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
  Raises:
  ValueError: If `batch_size` is not a positive integer.
optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py
@@ -86,3 +86,26 @@ class RBLNDecoderOnlyGenerationMixin(GenerationMixin):
  model_kwargs["generate_idx"] = outputs.generate_idx
  model_kwargs["padded_cache_lengths"] = outputs.padded_cache_lengths
  return model_kwargs
+
+ def generate(
+ self,
+ input_ids: torch.LongTensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ max_length: Optional[int] = None,
+ **kwargs,
+ ):
+ """
+ The generate function is utilized in its standard form as in the HuggingFace transformers library. User can use this function to generate text from the model.
+
+ Args:
+ input_ids: The input ids to the model.
+ attention_mask: The attention mask to the model.
+ max_length: The maximum length of the generated text.
+ kwargs: Additional arguments passed to the generate function. See the HuggingFace transformers documentation for more details.
+ """
+ if max_length is not None:
+ kwargs["max_length"] = max_length
+ if attention_mask is not None:
+ kwargs["attention_mask"] = attention_mask
+
+ return super().generate(input_ids, **kwargs)
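For reference, the new `generate()` override simply folds `attention_mask` and `max_length` into `kwargs` before delegating to the standard HuggingFace `GenerationMixin.generate()`. A minimal usage sketch follows; the model id, prompt, and generation settings are illustrative and not part of this diff (`RBLNLlamaForCausalLM` is just one decoder-only model that uses this mixin):

```python
from transformers import AutoTokenizer
from optimum.rbln import RBLNLlamaForCausalLM  # any RBLN decoder-only causal LM

# Assumes a model that was already compiled and saved for RBLN NPUs.
model = RBLNLlamaForCausalLM.from_pretrained("./llama-rbln", export=False)
tokenizer = AutoTokenizer.from_pretrained("./llama-rbln")

inputs = tokenizer("RBLN NPUs are", return_tensors="pt")
output_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,  # forwarded into kwargs by the new override
    max_length=64,                         # likewise forwarded as kwargs["max_length"]
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```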
optimum/rbln/transformers/models/exaone/modeling_exaone.py
@@ -14,11 +14,13 @@
 
 
  import inspect
- from typing import Any, Callable
+ from pathlib import Path
+ from typing import Any, Callable, Dict, Optional, Union
 
  from transformers import AutoModelForCausalLM
  from transformers.generation.utils import GenerationMixin
 
+ from ....configuration_utils import RBLNModelConfig
  from ....utils import logging
  from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
  from .exaone_architecture import ExaoneForCausalLMWrapper
@@ -92,9 +94,45 @@ class RBLNExaoneForCausalLM(RBLNDecoderOnlyModelForCausalLM):
  _supports_cache_class = True
 
  @classmethod
- def from_pretrained(cls, *args, **kwargs):
- kwargs.setdefault("trust_remote_code", True)
- return super().from_pretrained(*args, **kwargs)
+ def from_pretrained(
+ cls,
+ model_id: Union[str, Path],
+ *,
+ export: Optional[bool] = None,
+ rbln_config: Optional[Union[Dict, RBLNModelConfig]] = None,
+ trust_remote_code: Optional[bool] = None,
+ **kwargs: Any,
+ ):
+ """
+ The `from_pretrained()` function is utilized in its standard form as in the HuggingFace transformers library.
+ User can use this function to load a pre-trained model from the HuggingFace library and convert it to a RBLN model to be run on RBLN NPUs.
+
+ Args:
+ model_id (Union[str, Path]): The model id of the pre-trained model to be loaded.
+ It can be downloaded from the HuggingFace model hub or a local path, or a model id of a compiled model using the RBLN Compiler.
+ export (Optional[bool]): A boolean flag to indicate whether the model should be compiled.
+ If None, it will be determined based on the existence of the compiled model files in the model_id.
+ rbln_config (Optional[Union[Dict, RBLNModelConfig]]): Configuration for RBLN model compilation and runtime.
+ This can be provided as a dictionary or an instance of the model's configuration class (e.g., `RBLNExaoneForCausalLMConfig` for EXAONE models).
+ For detailed configuration options, see the specific model's configuration class documentation.
+ trust_remote_code (bool): Whether or not to trust the remote code when loading a model from the Hub.
+ kwargs: Additional keyword arguments. Arguments with the prefix `rbln_` are passed to rbln_config, while the remaining arguments are passed to the HuggingFace library.
+
+ Returns:
+ (RBLNModel): A RBLN model instance ready for inference on RBLN NPU devices.
+ """
+
+ if trust_remote_code is not None:
+ kwargs["trust_remote_code"] = trust_remote_code
+ elif "trust_remote_code" not in kwargs:
+ kwargs["trust_remote_code"] = True
+
+ return super().from_pretrained(
+ model_id=model_id,
+ export=export,
+ rbln_config=rbln_config,
+ **kwargs,
+ )
 
  def __getattr__(self, __name: str) -> Any:
  def redirect(func):
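The expanded `from_pretrained()` signature keeps the old behavior (`trust_remote_code` still defaults to True for EXAONE) while exposing `export` and `rbln_config` explicitly. A hedged compile-and-save sketch; the model id and rbln_config values below are illustrative, not taken from this diff:

```python
from optimum.rbln import RBLNExaoneForCausalLM

# Compile the HuggingFace checkpoint for RBLN NPUs (trust_remote_code is enabled by default).
model = RBLNExaoneForCausalLM.from_pretrained(
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",  # illustrative model id
    export=True,                              # force compilation from the PyTorch checkpoint
    rbln_config={"max_seq_len": 4096, "tensor_parallel_size": 4},  # illustrative options
)
model.save_pretrained("./exaone-rbln")
```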
optimum/rbln/transformers/models/gemma3/configuration_gemma3.py
@@ -27,6 +27,21 @@ class RBLNGemma3ForCausalLMConfig(RBLNDecoderOnlyModelForCausalLMConfig):
  image_prefill_chunk_size: Optional[int] = None,
  **kwargs: Any,
  ):
+ """
+ Args:
+ use_position_ids (Optional[bool]): Whether or not to use `position_ids`, which is indices of positions of each input sequence tokens in the position embeddings.
+ use_attention_mask (Optional[bool]): Whether or not to use `attention_mask` to to avoid performing attention on padding token indices.
+ prefill_chunk_size (Optional[int]): The chunk size used during the prefill phase for
+ processing input sequences. Defaults to 256. Must be a positive integer
+ divisible by 64. Affects prefill performance and memory usage.
+ image_prefill_chunk_size (Optional[int]): The chunk size used during the prefill phase for
+ processing images. This config is used when `use_image_prefill` is True.
+ Currently, the `prefill_chunk_size` and `image_prefill_chunk_size` should be the same value.
+ kwargs: Additional arguments passed to the parent `RBLNDecoderOnlyModelForCausalLMConfig`.
+
+ Raises:
+ ValueError: If `use_attention_mask` or `use_position_ids` are False.
+ """
  # use_attention_mask and use_position_ids are always True for Gemma3
  use_attention_mask = use_attention_mask or True
  use_position_ids = use_position_ids or True
@@ -64,10 +79,10 @@ class RBLNGemma3ForConditionalGenerationConfig(RBLNModelConfig):
  batch_size (Optional[int]): The batch size for inference. Defaults to 1.
  vision_tower (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
  language_model (Optional[RBLNModelConfig]): Configuration for the language model component.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `batch_size` is not a positive integer.
  """
  super().__init__(**kwargs)
  self.batch_size = batch_size or 1
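Per the new docstring above, `prefill_chunk_size` defaults to 256, must be a positive multiple of 64, and `image_prefill_chunk_size` is currently expected to match it. A minimal sketch, assuming the config class is exported from `optimum.rbln` like the other RBLN configuration classes:

```python
from optimum.rbln import RBLNGemma3ForCausalLMConfig

# prefill_chunk_size must be a positive integer divisible by 64 (default 256);
# attention mask and position ids are always enabled for Gemma3.
cfg = RBLNGemma3ForCausalLMConfig(
    prefill_chunk_size=256,
    image_prefill_chunk_size=256,  # currently expected to equal prefill_chunk_size
)
```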
optimum/rbln/transformers/models/gemma3/modeling_gemma3.py
@@ -201,16 +201,15 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel):
  return model_kwargs
 
  def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
- """
- Projects the last hidden state from the vision model into language model space.
+ # Projects the last hidden state from the vision model into language model space.
 
- Args:
- pixel_values: (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
- The tensors corresponding to the input images.
+ # Args:
+ # pixel_values: (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
+ # The tensors corresponding to the input images.
+
+ # Returns:
+ # Image feature tensor of shape `(num_images, image_length, embed_dim)`.
 
- Returns:
- Image feature tensor of shape `(num_images, image_length, embed_dim)`.
- """
  vision_outputs = self.vision_tower(pixel_values).last_hidden_state
  image_features = self.multi_modal_projector(vision_outputs)
  return image_features
optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py
@@ -32,14 +32,20 @@ class RBLNGroundingDinoForObjectDetectionConfig(RBLNImageModelConfig):
  decoder: Optional["RBLNGroundingDinoDecoderConfig"] = None,
  text_backbone: Optional["RBLNModelConfig"] = None,
  backbone: Optional["RBLNModelConfig"] = None,
- output_attentions: Optional[bool] = False,
- output_hidden_states: Optional[bool] = False,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
  **kwargs: Any,
  ):
  """
  Args:
- batch_size (Optional[int]): The batch size for text processing. Defaults to 1.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ batch_size (Optional[int]): The batch size for image and text processing. Defaults to 1.
+ encoder (Optional["RBLNModelConfig"]): The encoder configuration. Defaults to None.
+ decoder (Optional["RBLNModelConfig"]): The decoder configuration. Defaults to None.
+ text_backbone (Optional["RBLNModelConfig"]): The text backbone configuration. Defaults to None.
+ backbone (Optional["RBLNModelConfig"]): The backbone configuration. Defaults to None.
+ output_attentions (Optional[bool]): Whether to output attentions. Defaults to None.
+ output_hidden_states (Optional[bool]): Whether to output hidden states. Defaults to None.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
  Raises:
  ValueError: If batch_size is not a positive integer.
@@ -49,8 +55,8 @@ class RBLNGroundingDinoForObjectDetectionConfig(RBLNImageModelConfig):
  self.decoder = decoder
  self.text_backbone = text_backbone
  self.backbone = backbone
- self.output_attentions = output_attentions
- self.output_hidden_states = output_hidden_states
+ self.output_attentions = output_attentions if output_attentions is not None else False
+ self.output_hidden_states = output_hidden_states if output_hidden_states is not None else False
 
  if not isinstance(self.batch_size, int) or self.batch_size < 0:
  raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
optimum/rbln/transformers/models/idefics3/configuration_idefics3.py
@@ -45,11 +45,15 @@ class RBLNIdefics3ForConditionalGenerationConfig(RBLNModelConfig):
  Args:
  batch_size (Optional[int]): The batch size for inference. Defaults to 1.
  vision_model (Optional[RBLNModelConfig]): Configuration for the vision transformer component.
+ This can include settings specific to the vision encoder, such as input resolution or other vision-related parameters.
+ If not provided, default settings will be used.
  text_model (Optional[RBLNModelConfig]): Configuration for the text model component.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ This can include settings specific to the language model, such as tensor parallelism or other text-related parameters.
+ If not provided, default settings will be used.
+ kwargs: Additional arguments passed to the parent `RBLNModelConfig`.
 
  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `batch_size` is not a positive integer.
  """
 
  super().__init__(**kwargs)
optimum/rbln/transformers/models/llava/configuration_llava.py
@@ -39,11 +39,15 @@ class RBLNLlavaForConditionalGenerationConfig(RBLNModelConfig):
  Args:
  batch_size (Optional[int]): The batch size for inference. Defaults to 1.
  vision_tower (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
+ This can include settings specific to the vision encoder, such as input resolution or other vision-related parameters.
+ If not provided, default settings will be used.
  language_model (Optional[RBLNModelConfig]): Configuration for the language model component.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ This can include settings specific to the language model, such as tensor parallelism or other text-related parameters.
+ If not provided, default settings will be used.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `batch_size` is not a positive integer.
  """
  super().__init__(**kwargs)
  self.batch_size = batch_size or 1
optimum/rbln/transformers/models/llava/modeling_llava.py
@@ -105,6 +105,7 @@ class RBLNLlavaForConditionalGeneration(RBLNModel):
  RBLNLlavaForConditionalGeneration is a multi-modal model that combines vision and language processing capabilities,
  optimized for RBLN NPUs. It is designed for conditional generation tasks that involve both image and text inputs.
  This model inherits from [`RBLNModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+
  Important Note:
  This model includes a Large Language Model (LLM) as a submodule. For optimal performance, it is highly recommended to use
  tensor parallelism for the language model. This can be achieved by using the `rbln_config` parameter in the
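As the note above recommends, tensor parallelism for the LLM submodule is requested through the nested `language_model` entry of `rbln_config`. A hedged sketch; the model id and parallelism degree are illustrative, not part of this diff:

```python
from optimum.rbln import RBLNLlavaForConditionalGeneration

model = RBLNLlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",                        # illustrative model id
    export=True,
    rbln_config={
        "language_model": {"tensor_parallel_size": 4}, # illustrative: run the LLM across 4 NPUs
    },
)
```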
optimum/rbln/transformers/models/llava_next/configuration_llava_next.py
@@ -45,10 +45,10 @@ class RBLNLlavaNextForConditionalGenerationConfig(RBLNModelConfig):
  batch_size (Optional[int]): The batch size for inference. Defaults to 1.
  vision_tower (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
  language_model (Optional[RBLNModelConfig]): Configuration for the language model component.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `batch_size` is not a positive integer.
  """
  super().__init__(**kwargs)
  self.batch_size = batch_size or 1
optimum/rbln/transformers/models/llava_next/modeling_llava_next.py
@@ -287,18 +287,15 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
  Obtains image last hidden states from the vision tower and apply multimodal projection.
 
  Args:
- pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`)
- The tensors corresponding to the input images.
- image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
- Actual image size of each images (H, W).
- vision_feature_layer (`int`):
- The index of the layer to select the vision feature.
- vision_feature_select_strategy (`str`):
- The feature selection strategy used to select the vision feature from the vision backbone.
+ pixel_values (torch.FloatTensor): The tensors corresponding to the input images
+ whose shape is `(batch_size, num_patches, channels, height, width)`.
+ image_sizes (torch.Tensor): Actual image size of each images (H, W).
+ vision_feature_layer (int): The index of the layer to select the vision feature.
+ vision_feature_select_strategy (str): The feature selection strategy used to select the vision feature from the vision backbone.
  Can be one of `"default"` or `"full"`
  Returns:
- image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches
- and are of shape `(num_patches, image_length, embed_dim)`).
+ image_features (List[torch.Tensor]): List of image feature tensor, each contains all the visual feature of all patches
+ and are of shape `(num_patches, image_length, embed_dim)`).
  """
  # ! infer image_num_patches from image_sizes
  image_num_patches = [
@@ -412,23 +409,19 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
 
  # Almost copied from : https://github.com/huggingface/transformers/blob/6b550462139655d488d4c663086a63e98713c6b9/src/transformers/models/llava_next/modeling_llava_next.py
  def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
- """
- Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
+ # Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
+
+ # Args:
+ # image_features (List[torch.Tensor]): List of image feature tensor, each contains all the visual feature of all patches.
+ # Its length is num_images, and each of shape is `(num_patches, image_length, embed_dim)`
+ # image_sizes (torch.Tensor): Actual image size of each images (H, W).
+ # vision_feature_select_strategy (str): The feature selection strategy used to select the vision feature from the vision backbone.
+ # image_newline (torch.Tensor): New line embedding vector whose shape is `embed_dim`.
+
+ # Returns:
+ # image_features (torch.Tensor): A torch.Tensor of shape `(all_feat_len, embed_dim)`)
+ # feature_lens (List[int]): A token length of each image in image_features
 
- Args:
- image_features (`List[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
- List of image feature tensor, each contains all the visual feature of all patches.
- image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
- Actual image size of each images (H, W).
- vision_feature_select_strategy (`str`)
- The feature selection strategy used to select the vision feature from the vision backbone.
- image_newline (`torch.Tensor` of shape `(embed_dim)`)
- New line embedding vector.
- Returns:
- image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
- feature_lens (`List[int]`)
- token length of each image in image_features
- """
  new_image_features = []
  feature_lens = []
  for image_idx, image_feature in enumerate(image_features):
@@ -478,21 +471,17 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
 
 
  # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
  def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
- """
- Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
-
- Args:
- image_size (`tuple`):
- The size of the input image in the format (width, height).
- grid_pinpoints (`List`):
- A list containing possible resolutions. Each item in the list should be a tuple or list
- of the form `(height, width)`.
- patch_size (`int`):
- The size of each image patch.
-
- Returns:
- tuple: The shape of the image patch grid in the format (width, height).
- """
+ # Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
+
+ # Args:
+ # image_size (tuple): The size of the input image in the format (width, height).
+ # grid_pinpoints (list): A list containing possible resolutions.
+ # Each item in the list should be a tuple or list of the form `(height, width)`.
+ # patch_size (int): The size of each image patch.
+
+ # Returns:
+ # tuple: The shape of the image patch grid in the format (width, height).
+
  if not isinstance(grid_pinpoints, list):
  raise TypeError("grid_pinpoints should be a list of tuples or lists")
@@ -510,18 +499,15 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
 
  # Almost copied from : https://github.com/huggingface/transformers/blob/1feebb5b4150882deabddd190a541f336f3be817/src/transformers/models/llava_next/modeling_llava_next.py#L115C1-L152C1
  def unpad_image(tensor, original_size):
- """
- Unpads a PyTorch tensor of a padded and resized image.
+ # Unpads a PyTorch tensor of a padded and resized image.
 
- Args:
- tensor (`torch.Tensor`):
- The image tensor, assumed to be of shape (num_channels, height, width).
- original_size (`tuple`):
- The original size of the image (height, width).
+ # Args:
+ # tensor (torch.Tensor): The image tensor, assumed to be of shape (num_channels, height, width).
+ # original_size (tuple): The original size of the image (height, width).
+
+ # Returns:
+ # (torch.Tensor): The unpadded image tensor.
 
- Returns:
- `torch.Tensor`: The unpadded image tensor.
- """
  if not isinstance(original_size, (list, tuple)):
  if not isinstance(original_size, (torch.Tensor, np.ndarray)):
  raise TypeError(
@@ -550,22 +536,19 @@ def unpad_image(tensor, original_size):
 
  # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
  def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
- """
- Selects the best resolution from a list of possible resolutions based on the original size.
+ # Selects the best resolution from a list of possible resolutions based on the original size.
 
- This is done by calculating the effective and wasted resolution for each possible resolution.
+ # This is done by calculating the effective and wasted resolution for each possible resolution.
 
- The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
+ # The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
 
- Args:
- original_size (tuple):
- The original size of the image in the format (height, width).
- possible_resolutions (list):
- A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
+ # Args:
+ # original_size (tuple): The original size of the image in the format (height, width).
+ # possible_resolutions (List(tuple)): A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
+
+ # Returns:
+ # (tuple): The best fit resolution in the format (height, width).
 
- Returns:
- tuple: The best fit resolution in the format (height, width).
- """
  original_height, original_width = original_size
  best_fit = None
  max_effective_resolution = 0
@@ -589,21 +572,17 @@ def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
 
 
  # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
  def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
- """
- Calculate the number of patches after the preprocessing for images of any resolution.
-
- Args:
- image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
- The size of the input image in the format (height, width). ?
- grid_pinpoints (`List`):
- A list containing possible resolutions. Each item in the list should be a tuple or list
- of the form `(height, width)`.
- patch_size (`int`):
- The size of each image patch.
-
- Returns:
- int: the number of patches
- """
+ # Calculate the number of patches after the preprocessing for images of any resolution.
+
+ # Args:
+ # image_size (Union[torch.LongTensor, np.ndarray, Tuple[int, int]): The size of the input image in the format (height, width).
+ # grid_pinpoints (list): A list containing possible resolutions.
+ # Each item in the list should be a tuple or list of the form `(height, width)`.
+ # patch_size (int): The size of each image patch.
+
+ # Returns:
+ # (int): the number of patches.
+
  if not isinstance(grid_pinpoints, list):
  raise TypeError("grid_pinpoints should be a list of tuples or lists")
optimum/rbln/transformers/models/midm/modeling_midm.py
@@ -13,11 +13,13 @@
  # limitations under the License.
 
  import inspect
- from typing import Any, Callable
+ from pathlib import Path
+ from typing import Any, Callable, Dict, Optional, Union
 
  from transformers import AutoModelForCausalLM
  from transformers.generation.utils import GenerationMixin
 
+ from ....configuration_utils import RBLNModelConfig
  from ....utils import logging
  from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
  from .midm_architecture import MidmLMHeadModelWrapper
@@ -91,9 +93,45 @@ class RBLNMidmLMHeadModel(RBLNDecoderOnlyModelForCausalLM):
  _supports_cache_class = True
 
  @classmethod
- def from_pretrained(cls, *args, **kwargs):
- kwargs.setdefault("trust_remote_code", True)
- return super().from_pretrained(*args, **kwargs)
+ def from_pretrained(
+ cls,
+ model_id: Union[str, Path],
+ *,
+ export: Optional[bool] = None,
+ rbln_config: Optional[Union[Dict, RBLNModelConfig]] = None,
+ trust_remote_code: Optional[bool] = None,
+ **kwargs: Any,
+ ):
+ """
+ The `from_pretrained()` function is utilized in its standard form as in the HuggingFace transformers library.
+ User can use this function to load a pre-trained model from the HuggingFace library and convert it to a RBLN model to be run on RBLN NPUs.
+
+ Args:
+ model_id (Union[str, Path]): The model id of the pre-trained model to be loaded.
+ It can be downloaded from the HuggingFace model hub or a local path, or a model id of a compiled model using the RBLN Compiler.
+ export (Optional[bool]): A boolean flag to indicate whether the model should be compiled.
+ If None, it will be determined based on the existence of the compiled model files in the model_id.
+ rbln_config (Optional[Union[Dict, RBLNModelConfig]]): Configuration for RBLN model compilation and runtime.
+ This can be provided as a dictionary or an instance of the model's configuration class (e.g., `RBLNMidmLMHeadModelConfig` for Mi:dm models).
+ For detailed configuration options, see the specific model's configuration class documentation.
+ trust_remote_code (bool): Whether or not to trust the remote code when loading a model from the Hub.
+ kwargs: Additional keyword arguments. Arguments with the prefix `rbln_` are passed to rbln_config, while the remaining arguments are passed to the HuggingFace library.
+
+ Returns:
+ (RBLNModel): A RBLN model instance ready for inference on RBLN NPU devices.
+ """
+
+ if trust_remote_code is not None:
+ kwargs["trust_remote_code"] = trust_remote_code
+ elif "trust_remote_code" not in kwargs:
+ kwargs["trust_remote_code"] = True
+
+ return super().from_pretrained(
+ model_id=model_id,
+ export=export,
+ rbln_config=rbln_config,
+ **kwargs,
+ )
 
  def __getattr__(self, __name: str) -> Any:
  def redirect(func):
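`RBLNMidmLMHeadModel.from_pretrained()` gains the same explicit signature as the EXAONE variant above. A short sketch of reloading an already-compiled model; the local path is hypothetical:

```python
from optimum.rbln import RBLNMidmLMHeadModel

# Load a previously compiled model directory; export=False skips recompilation,
# and trust_remote_code still defaults to True when not supplied.
model = RBLNMidmLMHeadModel.from_pretrained("./midm-rbln", export=False)
```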
optimum/rbln/transformers/models/pixtral/configuration_pixtral.py
@@ -29,7 +29,7 @@ class RBLNPixtralVisionModelConfig(RBLNModelConfig):
  Args:
  max_image_size (Tuple): The size of max input images. A tuple (max_height, max_width)
  batch_size (Optional[int]): The batch size for image processing. Defaults to 1.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
  Raises:
  ValueError: If batch_size is not a positive integer.
optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py
@@ -31,10 +31,22 @@ class RBLNQwen2_5_VLForConditionalGenerationConfig(RBLNDecoderOnlyModelForCausal
 
  def __init__(
  self,
- visual: Optional[RBLNModelConfig] = None,
  use_inputs_embeds: bool = True,
+ visual: Optional[RBLNModelConfig] = None,
  **kwargs: Any,
  ):
+ """
+ Args:
+ use_inputs_embeds (bool): Whether or not to use `inputs_embeds` as input. Defaults to `True`.
+ visual (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
+ kwargs: Additional arguments passed to the parent `RBLNDecoderOnlyModelForCausalLMConfig`.
+
+ Raises:
+ ValueError: If `use_inputs_embeds` is False.
+ ValueError: If the visual configuration is provided but contains invalid settings, such as an invalid max_seq_lens (e.g., not a positive integer, not a multiple of the window-based attention unit, or insufficient for the expected resolution).
+ ValueError: If visual is None and no default vision configuration can be inferred for the model architecture.
+ ValueError: If any inherited parameters violate constraints defined in the parent class, such as batch_size not being a positive integer, prefill_chunk_size not being divisible by 64, or max_seq_len not meeting requirements for Flash Attention.
+ """
  super().__init__(use_inputs_embeds=use_inputs_embeds, **kwargs)
  if not self.use_inputs_embeds:
  raise ValueError(
@@ -66,10 +78,13 @@ class RBLNQwen2_5_VisionTransformerPretrainedModelConfig(RBLNModelConfig):
  making 256 (64 * 4) valid. RBLN optimization runs inference per image or video
  frame, so set `max_seq_len` to match the maximum expected resolution to reduce
  computation. If not provided, a `ValueError` is raised.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `max_seq_lens` is None or not provided.
+ ValueError: If `max_seq_lens` (or any value in the list) is not a positive integer.
+ ValueError: If `max_seq_lens` is not a multiple of (window_size / patch_size)^2 for window-based attention, or is insufficient for the expected image/video resolution.
+ ValueError: If `batch_size` (inherited from RBLNModelConfig) is not a positive integer.
 
  Max Seq Lens:
  Since `Qwen2_5_VLForConditionalGeneration` performs inference on a per-image or per-frame basis,
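Tying the two Qwen2.5-VL hunks together: `use_inputs_embeds` must stay `True`, and the vision encoder's `max_seq_lens` must be a positive multiple of `(window_size / patch_size) ** 2` that covers the largest expected image or video frame. A hedged sketch; the model id, `max_seq_lens`, and parallelism values are illustrative and not taken from this diff:

```python
from optimum.rbln import RBLNQwen2_5_VLForConditionalGeneration

model = RBLNQwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",         # illustrative model id
    export=True,
    rbln_config={
        "visual": {"max_seq_lens": 6400},  # illustrative: multiple of (window_size / patch_size) ** 2
        "tensor_parallel_size": 4,         # illustrative
        "max_seq_len": 32768,              # illustrative
    },
)
```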