optimum-rbln 0.8.4a6__py3-none-any.whl → 0.8.4a7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of optimum-rbln has been flagged as possibly problematic.



Files changed (64)
  1. optimum/rbln/__version__.py +2 -2
  2. optimum/rbln/configuration_utils.py +41 -3
  3. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +1 -1
  4. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +3 -3
  5. optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +1 -1
  6. optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +2 -2
  7. optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +7 -2
  8. optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +7 -2
  9. optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +1 -1
  10. optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +1 -1
  11. optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +2 -2
  12. optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +1 -1
  13. optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +3 -3
  14. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +1 -1
  15. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +1 -1
  16. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +1 -1
  17. optimum/rbln/diffusers/modeling_diffusers.py +7 -3
  18. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +31 -3
  19. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +28 -3
  20. optimum/rbln/diffusers/models/autoencoders/vq_model.py +31 -3
  21. optimum/rbln/diffusers/models/transformers/prior_transformer.py +1 -1
  22. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +9 -1
  23. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +9 -1
  24. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +6 -3
  25. optimum/rbln/diffusers/pipelines/auto_pipeline.py +45 -8
  26. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
  27. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -1
  28. optimum/rbln/modeling.py +17 -13
  29. optimum/rbln/modeling_base.py +11 -9
  30. optimum/rbln/transformers/configuration_generic.py +3 -3
  31. optimum/rbln/transformers/modeling_generic.py +1 -0
  32. optimum/rbln/transformers/models/auto/auto_factory.py +67 -7
  33. optimum/rbln/transformers/models/auto/modeling_auto.py +31 -0
  34. optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +5 -6
  35. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +1 -1
  36. optimum/rbln/transformers/models/clip/configuration_clip.py +7 -4
  37. optimum/rbln/transformers/models/clip/modeling_clip.py +23 -4
  38. optimum/rbln/transformers/models/colpali/configuration_colpali.py +2 -2
  39. optimum/rbln/transformers/models/colpali/modeling_colpali.py +38 -6
  40. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +1 -1
  41. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +23 -0
  42. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +17 -2
  43. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +7 -8
  44. optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +12 -6
  45. optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +6 -2
  46. optimum/rbln/transformers/models/llava/configuration_llava.py +6 -2
  47. optimum/rbln/transformers/models/llava/modeling_llava.py +1 -0
  48. optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +2 -2
  49. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +57 -78
  50. optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +1 -1
  51. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +18 -3
  52. optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +2 -2
  53. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +1 -1
  54. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +2 -2
  55. optimum/rbln/transformers/models/siglip/configuration_siglip.py +1 -1
  56. optimum/rbln/transformers/models/swin/configuration_swin.py +1 -1
  57. optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +1 -1
  58. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -0
  59. optimum/rbln/transformers/models/whisper/configuration_whisper.py +1 -1
  60. optimum/rbln/transformers/models/whisper/modeling_whisper.py +1 -0
  61. {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a7.dist-info}/METADATA +1 -1
  62. {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a7.dist-info}/RECORD +64 -64
  63. {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a7.dist-info}/WHEEL +0 -0
  64. {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a7.dist-info}/licenses/LICENSE +0 -0
@@ -14,7 +14,8 @@

  import bisect
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Optional, Union
+ from tempfile import TemporaryDirectory
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union

  import torch
  from transformers import PretrainedConfig, PreTrainedModel
@@ -126,8 +127,8 @@ class RBLNColPaliForRetrieval(RBLNModel):
  The ColPali Model transformer for document retrieval using vision-language models.
  This model inherits from [`RBLNModel`]. Check the superclass documentation for the generic methods the library implements for all its models.

- A class to convert and run pre-trained transformers based ColPaliForRetrieval model on RBLN devices.
- It implements the methods to convert a pre-trained transformers ColPaliForRetrieval model into a RBLN transformer model by:
+ A class to convert and run pre-trained transformers based `ColPaliForRetrieval` model on RBLN devices.
+ It implements the methods to convert a pre-trained transformers `ColPaliForRetrieval` model into a RBLN transformer model by:

  - transferring the checkpoint weights of the original into an optimized RBLN graph,
  - compiling the resulting graph using the RBLN compiler.
@@ -263,11 +264,42 @@ class RBLNColPaliForRetrieval(RBLNModel):
  return rbln_config

  @classmethod
- def from_model(cls, model: "PreTrainedModel", *args, **kwargs):
+ def from_model(
+ cls,
+ model: "PreTrainedModel",
+ config: Optional[PretrainedConfig] = None,
+ rbln_config: Optional[Union[RBLNModelConfig, Dict]] = None,
+ model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+ subfolder: str = "",
+ **kwargs: Any,
+ ) -> "RBLNModel":
+ """
+ Converts and compiles a pre-trained HuggingFace library model into a RBLN model.
+ This method performs the actual model conversion and compilation process.
+
+ Args:
+ model (PreTrainedModel): The PyTorch model to be compiled.
+ The object must be an instance of the HuggingFace transformers PreTrainedModel class.
+ config (Optional[PretrainedConfig]): The configuration object associated with the model.
+ rbln_config (Optional[Union[RBLNModelConfig, Dict]]): Configuration for RBLN model compilation and runtime.
+ This can be provided as a dictionary or an instance of the model's configuration class (e.g., `RBLNLlamaForCausalLMConfig` for Llama models).
+ For detailed configuration options, see the specific model's configuration class documentation.
+ kwargs: Additional keyword arguments. Arguments with the prefix `rbln_` are passed to rbln_config, while the remaining arguments are passed to the HuggingFace library.
+
+ The method performs the following steps:
+
+ 1. Compiles the PyTorch model into an optimized RBLN graph
+ 2. Configures the model for the specified NPU device
+ 3. Creates the necessary runtime objects if requested
+ 4. Saves the compiled model and configurations
+
+ Returns:
+ (RBLNModel): A RBLN model instance ready for inference on RBLN NPU devices.
+ """
  if not hasattr(model, "vision_tower"):
  model.vision_tower = model.vlm.vision_tower
  del model.vlm.vision_tower
- model = super().from_model(model, *args, **kwargs)
+ model = super().from_model(model, config, rbln_config, model_save_dir, subfolder, **kwargs)
  return model

  @classmethod
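
For context, the expanded `from_model` signature above corresponds to usage along these lines. This is only a sketch: the checkpoint id, save directory, and `batch_size` value are illustrative, and it assumes `RBLNColPaliForRetrieval` is importable from the package root like the other RBLN model classes.

    from transformers import ColPaliForRetrieval

    from optimum.rbln import RBLNColPaliForRetrieval

    # Load the original PyTorch checkpoint (illustrative model id).
    pt_model = ColPaliForRetrieval.from_pretrained("vidore/colpali-v1.2-hf")

    # Convert and compile for RBLN NPUs; rbln_config may be a dict or a config instance.
    rbln_model = RBLNColPaliForRetrieval.from_model(
        pt_model,
        rbln_config={"batch_size": 1},
        model_save_dir="./colpali-rbln",
    )
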
@@ -334,7 +366,7 @@ class RBLNColPaliForRetrieval(RBLNModel):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  **kwargs,
- ) -> ColPaliForRetrievalOutput:
+ ) -> Union[Tuple, ColPaliForRetrievalOutput]:
  if pixel_values is not None:
  pixel_values = pixel_values.to(dtype=self.dtype)

@@ -104,7 +104,7 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
  ["prefill", "decode"] if DecoderOnlyModelForCausalLM is used.
  logits_to_keep (Optional[int]): The number of logits to keep for the decoder. If set to 0, the decoder will keep all logits.
  Defaults to 0 if DecoderOnlyModel is used, 1 if DecoderOnlyModelForCausalLM is used.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
  ValueError: If `batch_size` is not a positive integer.
@@ -86,3 +86,26 @@ class RBLNDecoderOnlyGenerationMixin(GenerationMixin):
  model_kwargs["generate_idx"] = outputs.generate_idx
  model_kwargs["padded_cache_lengths"] = outputs.padded_cache_lengths
  return model_kwargs
+
+ def generate(
+ self,
+ input_ids: torch.LongTensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ max_length: Optional[int] = None,
+ **kwargs,
+ ):
+ """
+ The generate function is utilized in its standard form as in the HuggingFace transformers library. User can use this function to generate text from the model.
+
+ Args:
+ input_ids: The input ids to the model.
+ attention_mask: The attention mask to the model.
+ max_length: The maximum length of the generated text.
+ kwargs: Additional arguments passed to the generate function. See the HuggingFace transformers documentation for more details.
+ """
+ if max_length is not None:
+ kwargs["max_length"] = max_length
+ if attention_mask is not None:
+ kwargs["attention_mask"] = attention_mask
+
+ return super().generate(input_ids, **kwargs)
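
The new `generate` override only folds `attention_mask` and `max_length` back into `kwargs` before delegating to the standard HuggingFace `GenerationMixin.generate`, so it is called like the upstream API. A hedged sketch (the model path, tokenizer id, and choice of `RBLNLlamaForCausalLM` are illustrative; any decoder-only RBLN model that uses this mixin should behave the same way):

    from transformers import AutoTokenizer

    from optimum.rbln import RBLNLlamaForCausalLM

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
    model = RBLNLlamaForCausalLM.from_pretrained("./llama-rbln")  # a previously compiled model directory

    inputs = tokenizer("Hello, my name is", return_tensors="pt")
    output_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=64,
    )
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
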
@@ -27,6 +27,21 @@ class RBLNGemma3ForCausalLMConfig(RBLNDecoderOnlyModelForCausalLMConfig):
  image_prefill_chunk_size: Optional[int] = None,
  **kwargs: Any,
  ):
+ """
+ Args:
+ use_position_ids (Optional[bool]): Whether or not to use `position_ids`, which is indices of positions of each input sequence tokens in the position embeddings.
+ use_attention_mask (Optional[bool]): Whether or not to use `attention_mask` to to avoid performing attention on padding token indices.
+ prefill_chunk_size (Optional[int]): The chunk size used during the prefill phase for
+ processing input sequences. Defaults to 256. Must be a positive integer
+ divisible by 64. Affects prefill performance and memory usage.
+ image_prefill_chunk_size (Optional[int]): The chunk size used during the prefill phase for
+ processing images. This config is used when `use_image_prefill` is True.
+ Currently, the `prefill_chunk_size` and `image_prefill_chunk_size` should be the same value.
+ kwargs: Additional arguments passed to the parent `RBLNDecoderOnlyModelForCausalLMConfig`.
+
+ Raises:
+ ValueError: If `use_attention_mask` or `use_position_ids` are False.
+ """
  # use_attention_mask and use_position_ids are always True for Gemma3
  use_attention_mask = use_attention_mask or True
  use_position_ids = use_position_ids or True
@@ -64,10 +79,10 @@ class RBLNGemma3ForConditionalGenerationConfig(RBLNModelConfig):
  batch_size (Optional[int]): The batch size for inference. Defaults to 1.
  vision_tower (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
  language_model (Optional[RBLNModelConfig]): Configuration for the language model component.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `batch_size` is not a positive integer.
  """
  super().__init__(**kwargs)
  self.batch_size = batch_size or 1
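
Because `vision_tower` and `language_model` each accept their own `RBLNModelConfig`, compilation options for the submodules can be nested inside the top-level config. A sketch only, under the assumption that both config classes are importable from the package root and that `tensor_parallel_size` is the relevant field name; the numeric values are illustrative, not recommendations:

    from optimum.rbln import (
        RBLNGemma3ForCausalLMConfig,
        RBLNGemma3ForConditionalGenerationConfig,
    )

    rbln_config = RBLNGemma3ForConditionalGenerationConfig(
        batch_size=1,
        language_model=RBLNGemma3ForCausalLMConfig(
            prefill_chunk_size=256,  # documented constraint: positive and divisible by 64
            tensor_parallel_size=4,  # assumed field; tensor parallelism is recommended for the LLM submodule
        ),
    )
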
@@ -201,16 +201,15 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel):
  return model_kwargs

  def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
- """
- Projects the last hidden state from the vision model into language model space.
+ # Projects the last hidden state from the vision model into language model space.

- Args:
- pixel_values: (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
- The tensors corresponding to the input images.
+ # Args:
+ # pixel_values: (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
+ # The tensors corresponding to the input images.
+
+ # Returns:
+ # Image feature tensor of shape `(num_images, image_length, embed_dim)`.

- Returns:
- Image feature tensor of shape `(num_images, image_length, embed_dim)`.
- """
  vision_outputs = self.vision_tower(pixel_values).last_hidden_state
  image_features = self.multi_modal_projector(vision_outputs)
  return image_features
@@ -32,14 +32,20 @@ class RBLNGroundingDinoForObjectDetectionConfig(RBLNImageModelConfig):
  decoder: Optional["RBLNGroundingDinoDecoderConfig"] = None,
  text_backbone: Optional["RBLNModelConfig"] = None,
  backbone: Optional["RBLNModelConfig"] = None,
- output_attentions: Optional[bool] = False,
- output_hidden_states: Optional[bool] = False,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
  **kwargs: Any,
  ):
  """
  Args:
- batch_size (Optional[int]): The batch size for text processing. Defaults to 1.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ batch_size (Optional[int]): The batch size for image and text processing. Defaults to 1.
+ encoder (Optional["RBLNModelConfig"]): The encoder configuration. Defaults to None.
+ decoder (Optional["RBLNModelConfig"]): The decoder configuration. Defaults to None.
+ text_backbone (Optional["RBLNModelConfig"]): The text backbone configuration. Defaults to None.
+ backbone (Optional["RBLNModelConfig"]): The backbone configuration. Defaults to None.
+ output_attentions (Optional[bool]): Whether to output attentions. Defaults to None.
+ output_hidden_states (Optional[bool]): Whether to output hidden states. Defaults to None.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
  ValueError: If batch_size is not a positive integer.
@@ -49,8 +55,8 @@ class RBLNGroundingDinoForObjectDetectionConfig(RBLNImageModelConfig):
  self.decoder = decoder
  self.text_backbone = text_backbone
  self.backbone = backbone
- self.output_attentions = output_attentions
- self.output_hidden_states = output_hidden_states
+ self.output_attentions = output_attentions if output_attentions is not None else False
+ self.output_hidden_states = output_hidden_states if output_hidden_states is not None else False

  if not isinstance(self.batch_size, int) or self.batch_size < 0:
  raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
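
The switch from `False` defaults to `None` sentinels in the signature above, resolved in the constructor here, is the usual idiom for telling "not specified" apart from "explicitly disabled". A minimal illustration of the idiom, not library code:

    from typing import Optional

    def resolve_flag(flag: Optional[bool], default: bool = False) -> bool:
        # None means the caller did not pass a value, so fall back to the default;
        # an explicit True or False from the caller is kept as-is.
        return flag if flag is not None else default
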
@@ -45,11 +45,15 @@ class RBLNIdefics3ForConditionalGenerationConfig(RBLNModelConfig):
  Args:
  batch_size (Optional[int]): The batch size for inference. Defaults to 1.
  vision_model (Optional[RBLNModelConfig]): Configuration for the vision transformer component.
+ This can include settings specific to the vision encoder, such as input resolution or other vision-related parameters.
+ If not provided, default settings will be used.
  text_model (Optional[RBLNModelConfig]): Configuration for the text model component.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ This can include settings specific to the language model, such as tensor parallelism or other text-related parameters.
+ If not provided, default settings will be used.
+ kwargs: Additional arguments passed to the parent `RBLNModelConfig`.

  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `batch_size` is not a positive integer.
  """

  super().__init__(**kwargs)
@@ -39,11 +39,15 @@ class RBLNLlavaForConditionalGenerationConfig(RBLNModelConfig):
  Args:
  batch_size (Optional[int]): The batch size for inference. Defaults to 1.
  vision_tower (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
+ This can include settings specific to the vision encoder, such as input resolution or other vision-related parameters.
+ If not provided, default settings will be used.
  language_model (Optional[RBLNModelConfig]): Configuration for the language model component.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ This can include settings specific to the language model, such as tensor parallelism or other text-related parameters.
+ If not provided, default settings will be used.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `batch_size` is not a positive integer.
  """
  super().__init__(**kwargs)
  self.batch_size = batch_size or 1
@@ -105,6 +105,7 @@ class RBLNLlavaForConditionalGeneration(RBLNModel):
  RBLNLlavaForConditionalGeneration is a multi-modal model that combines vision and language processing capabilities,
  optimized for RBLN NPUs. It is designed for conditional generation tasks that involve both image and text inputs.
  This model inherits from [`RBLNModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+
  Important Note:
  This model includes a Large Language Model (LLM) as a submodule. For optimal performance, it is highly recommended to use
  tensor parallelism for the language model. This can be achieved by using the `rbln_config` parameter in the
@@ -45,10 +45,10 @@ class RBLNLlavaNextForConditionalGenerationConfig(RBLNModelConfig):
  batch_size (Optional[int]): The batch size for inference. Defaults to 1.
  vision_tower (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
  language_model (Optional[RBLNModelConfig]): Configuration for the language model component.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `batch_size` is not a positive integer.
  """
  super().__init__(**kwargs)
  self.batch_size = batch_size or 1
@@ -287,18 +287,15 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
  Obtains image last hidden states from the vision tower and apply multimodal projection.

  Args:
- pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`)
- The tensors corresponding to the input images.
- image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
- Actual image size of each images (H, W).
- vision_feature_layer (`int`):
- The index of the layer to select the vision feature.
- vision_feature_select_strategy (`str`):
- The feature selection strategy used to select the vision feature from the vision backbone.
+ pixel_values (torch.FloatTensor): The tensors corresponding to the input images
+ whose shape is `(batch_size, num_patches, channels, height, width)`.
+ image_sizes (torch.Tensor): Actual image size of each images (H, W).
+ vision_feature_layer (int): The index of the layer to select the vision feature.
+ vision_feature_select_strategy (str): The feature selection strategy used to select the vision feature from the vision backbone.
  Can be one of `"default"` or `"full"`
  Returns:
- image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches
- and are of shape `(num_patches, image_length, embed_dim)`).
+ image_features (List[torch.Tensor]): List of image feature tensor, each contains all the visual feature of all patches
+ and are of shape `(num_patches, image_length, embed_dim)`).
  """
  # ! infer image_num_patches from image_sizes
  image_num_patches = [
@@ -412,23 +409,19 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):

  # Almost copied from : https://github.com/huggingface/transformers/blob/6b550462139655d488d4c663086a63e98713c6b9/src/transformers/models/llava_next/modeling_llava_next.py
  def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
- """
- Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
+ # Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
+
+ # Args:
+ # image_features (List[torch.Tensor]): List of image feature tensor, each contains all the visual feature of all patches.
+ # Its length is num_images, and each of shape is `(num_patches, image_length, embed_dim)`
+ # image_sizes (torch.Tensor): Actual image size of each images (H, W).
+ # vision_feature_select_strategy (str): The feature selection strategy used to select the vision feature from the vision backbone.
+ # image_newline (torch.Tensor): New line embedding vector whose shape is `embed_dim`.
+
+ # Returns:
+ # image_features (torch.Tensor): A torch.Tensor of shape `(all_feat_len, embed_dim)`)
+ # feature_lens (List[int]): A token length of each image in image_features

- Args:
- image_features (`List[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
- List of image feature tensor, each contains all the visual feature of all patches.
- image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
- Actual image size of each images (H, W).
- vision_feature_select_strategy (`str`)
- The feature selection strategy used to select the vision feature from the vision backbone.
- image_newline (`torch.Tensor` of shape `(embed_dim)`)
- New line embedding vector.
- Returns:
- image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
- feature_lens (`List[int]`)
- token length of each image in image_features
- """
  new_image_features = []
  feature_lens = []
  for image_idx, image_feature in enumerate(image_features):
@@ -478,21 +471,17 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):

  # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
  def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
- """
- Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
-
- Args:
- image_size (`tuple`):
- The size of the input image in the format (width, height).
- grid_pinpoints (`List`):
- A list containing possible resolutions. Each item in the list should be a tuple or list
- of the form `(height, width)`.
- patch_size (`int`):
- The size of each image patch.
-
- Returns:
- tuple: The shape of the image patch grid in the format (width, height).
- """
+ # Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
+
+ # Args:
+ # image_size (tuple): The size of the input image in the format (width, height).
+ # grid_pinpoints (list): A list containing possible resolutions.
+ # Each item in the list should be a tuple or list of the form `(height, width)`.
+ # patch_size (int): The size of each image patch.
+
+ # Returns:
+ # tuple: The shape of the image patch grid in the format (width, height).
+
  if not isinstance(grid_pinpoints, list):
  raise TypeError("grid_pinpoints should be a list of tuples or lists")

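The comment block above describes the same computation as the upstream transformers helper this function is "almost copied" from: pick the best supported resolution for the image, then count patches per side. Roughly, and relying on the `select_best_resolution` helper shown further down:

    def anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
        # Choose the candidate resolution that fits the image best, then
        # divide each side by the patch size to get the patch-grid dimensions.
        best_height, best_width = select_best_resolution(image_size, grid_pinpoints)
        return best_height // patch_size, best_width // patch_size
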
@@ -510,18 +499,15 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):

  # Almost copied from : https://github.com/huggingface/transformers/blob/1feebb5b4150882deabddd190a541f336f3be817/src/transformers/models/llava_next/modeling_llava_next.py#L115C1-L152C1
  def unpad_image(tensor, original_size):
- """
- Unpads a PyTorch tensor of a padded and resized image.
+ # Unpads a PyTorch tensor of a padded and resized image.

- Args:
- tensor (`torch.Tensor`):
- The image tensor, assumed to be of shape (num_channels, height, width).
- original_size (`tuple`):
- The original size of the image (height, width).
+ # Args:
+ # tensor (torch.Tensor): The image tensor, assumed to be of shape (num_channels, height, width).
+ # original_size (tuple): The original size of the image (height, width).
+
+ # Returns:
+ # (torch.Tensor): The unpadded image tensor.

- Returns:
- `torch.Tensor`: The unpadded image tensor.
- """
  if not isinstance(original_size, (list, tuple)):
  if not isinstance(original_size, (torch.Tensor, np.ndarray)):
  raise TypeError(
@@ -550,22 +536,19 @@ def unpad_image(tensor, original_size):

  # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
  def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
- """
- Selects the best resolution from a list of possible resolutions based on the original size.
+ # Selects the best resolution from a list of possible resolutions based on the original size.

- This is done by calculating the effective and wasted resolution for each possible resolution.
+ # This is done by calculating the effective and wasted resolution for each possible resolution.

- The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
+ # The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.

- Args:
- original_size (tuple):
- The original size of the image in the format (height, width).
- possible_resolutions (list):
- A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
+ # Args:
+ # original_size (tuple): The original size of the image in the format (height, width).
+ # possible_resolutions (List(tuple)): A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
+
+ # Returns:
+ # (tuple): The best fit resolution in the format (height, width).

- Returns:
- tuple: The best fit resolution in the format (height, width).
- """
  original_height, original_width = original_size
  best_fit = None
  max_effective_resolution = 0
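
The criterion described above ("maximize effective resolution, minimize wasted resolution") corresponds to the following selection loop, sketched after the upstream transformers helper this code is copied from:

    def select_best_resolution_sketch(original_size, possible_resolutions):
        original_height, original_width = original_size
        best_fit, max_effective, min_wasted = None, 0, float("inf")
        for height, width in possible_resolutions:
            # Downscale the original image so it fits inside the candidate resolution.
            scale = min(width / original_width, height / original_height)
            downscaled = int(original_width * scale) * int(original_height * scale)
            effective = min(downscaled, original_width * original_height)
            wasted = (width * height) - effective
            if effective > max_effective or (effective == max_effective and wasted < min_wasted):
                best_fit, max_effective, min_wasted = (height, width), effective, wasted
        return best_fit
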
@@ -589,21 +572,17 @@ def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:

  # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
  def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
- """
- Calculate the number of patches after the preprocessing for images of any resolution.
-
- Args:
- image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
- The size of the input image in the format (height, width). ?
- grid_pinpoints (`List`):
- A list containing possible resolutions. Each item in the list should be a tuple or list
- of the form `(height, width)`.
- patch_size (`int`):
- The size of each image patch.
-
- Returns:
- int: the number of patches
- """
+ # Calculate the number of patches after the preprocessing for images of any resolution.
+
+ # Args:
+ # image_size (Union[torch.LongTensor, np.ndarray, Tuple[int, int]): The size of the input image in the format (height, width).
+ # grid_pinpoints (list): A list containing possible resolutions.
+ # Each item in the list should be a tuple or list of the form `(height, width)`.
+ # patch_size (int): The size of each image patch.
+
+ # Returns:
+ # (int): the number of patches.
+
  if not isinstance(grid_pinpoints, list):
  raise TypeError("grid_pinpoints should be a list of tuples or lists")

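When the candidate resolutions are multiples of the patch size, as they are for the usual grid_pinpoints, the patch count described above reduces to a short sketch (again leaning on `select_best_resolution`; this mirrors the upstream logic rather than the compiled RBLN path):

    def image_size_to_num_patches_sketch(image_size, grid_pinpoints, patch_size):
        best_height, best_width = select_best_resolution(image_size, grid_pinpoints)
        # One patch per (patch_size x patch_size) tile of the selected resolution,
        # plus one for the base, downscaled whole-image patch.
        return (best_height // patch_size) * (best_width // patch_size) + 1
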
@@ -29,7 +29,7 @@ class RBLNPixtralVisionModelConfig(RBLNModelConfig):
  Args:
  max_image_size (Tuple): The size of max input images. A tuple (max_height, max_width)
  batch_size (Optional[int]): The batch size for image processing. Defaults to 1.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
  ValueError: If batch_size is not a positive integer.
@@ -31,10 +31,22 @@ class RBLNQwen2_5_VLForConditionalGenerationConfig(RBLNDecoderOnlyModelForCausalLMConfig):

  def __init__(
  self,
- visual: Optional[RBLNModelConfig] = None,
  use_inputs_embeds: bool = True,
+ visual: Optional[RBLNModelConfig] = None,
  **kwargs: Any,
  ):
+ """
+ Args:
+ use_inputs_embeds (bool): Whether or not to use `inputs_embeds` as input. Defaults to `True`.
+ visual (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
+ kwargs: Additional arguments passed to the parent `RBLNDecoderOnlyModelForCausalLMConfig`.
+
+ Raises:
+ ValueError: If `use_inputs_embeds` is False.
+ ValueError: If the visual configuration is provided but contains invalid settings, such as an invalid max_seq_lens (e.g., not a positive integer, not a multiple of the window-based attention unit, or insufficient for the expected resolution).
+ ValueError: If visual is None and no default vision configuration can be inferred for the model architecture.
+ ValueError: If any inherited parameters violate constraints defined in the parent class, such as batch_size not being a positive integer, prefill_chunk_size not being divisible by 64, or max_seq_len not meeting requirements for Flash Attention.
+ """
  super().__init__(use_inputs_embeds=use_inputs_embeds, **kwargs)
  if not self.use_inputs_embeds:
  raise ValueError(
@@ -66,10 +78,13 @@ class RBLNQwen2_5_VisionTransformerPretrainedModelConfig(RBLNModelConfig):
  making 256 (64 * 4) valid. RBLN optimization runs inference per image or video
  frame, so set `max_seq_len` to match the maximum expected resolution to reduce
  computation. If not provided, a `ValueError` is raised.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `max_seq_lens` is None or not provided.
+ ValueError: If `max_seq_lens` (or any value in the list) is not a positive integer.
+ ValueError: If `max_seq_lens` is not a multiple of (window_size / patch_size)^2 for window-based attention, or is insufficient for the expected image/video resolution.
+ ValueError: If `batch_size` (inherited from RBLNModelConfig) is not a positive integer.

  Max Seq Lens:
  Since `Qwen2_5_VLForConditionalGeneration` performs inference on a per-image or per-frame basis,
@@ -31,7 +31,7 @@ class RBLNQwen2VLForConditionalGenerationConfig(RBLNDecoderOnlyModelForCausalLMConfig):
  Args:
  use_inputs_embeds (bool): Whether or not to use `inputs_embeds` as input. Defaults to `True`.
  visual (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
- **kwargs: Additional arguments passed to the parent `RBLNDecoderOnlyModelForCausalLMConfig`.
+ kwargs: Additional arguments passed to the parent `RBLNDecoderOnlyModelForCausalLMConfig`.

  Raises:
  ValueError: If `use_inputs_embeds` is False.
@@ -59,7 +59,7 @@ class RBLNQwen2VisionTransformerPretrainedModelConfig(RBLNModelConfig):
  so `max_seq_lens` must be at least 256. RBLN optimization runs inference per image
  or video frame, so set `max_seq_lens` to match the maximum expected resolution to
  optimize computation. If not provided, a `ValueError` is raised.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
  ValueError: If batch_size is not a positive integer.
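
Both the Qwen2-VL and Qwen2.5-VL vision configs above constrain `max_seq_lens`; for the Qwen2.5-VL variant the documented rule is a multiple of (window_size / patch_size)^2, since its attention is window-based. A worked check, assuming the default Qwen2.5-VL vision settings (patch_size 14, window_size 112):

    # One attention window covers (112 // 14) ** 2 = 64 visual tokens, so every
    # max_seq_lens value must be a positive multiple of 64.
    window_tokens = (112 // 14) ** 2  # 64
    for candidate in (256, 1024, 6400):
        assert candidate % window_tokens == 0
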
@@ -46,7 +46,7 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
  PagedAttention KV cache for the SelfAttention. Defaults to batch_size.
  kvcache_block_size (Optional[int]): Sets the size (in number of tokens) of each block
  in the PagedAttention KV cache for the SelfAttention. Defaults to dec_max_seq_len.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
  ValueError: If batch_size is not a positive integer.
@@ -31,7 +31,7 @@ class Seq2SeqWrapper:
  Args:
  model (nn.Module): The Seq2Seq model to wrap.
  enc_max_seq_len (int): Maximum sequence length for the encoder's position embeddings and cache sizes.
- **kwargs: Additional arguments to pass to the decoder wrapper.
+ kwargs: Additional arguments to pass to the decoder wrapper.
  """

  def __init__(self, model: nn.Module, enc_max_seq_len: int, **kwargs):
@@ -125,7 +125,7 @@ class Seq2SeqDecoderWrapper(nn.Module):

  Args:
  model (nn.Module): The Seq2Seq model containing the decoder.
- **kwargs: Additional arguments for decoder configuration.
+ kwargs: Additional arguments for decoder configuration.
  """

  def __init__(self, model: nn.Module, use_attention_mask: bool = True, **kwargs):
@@ -42,7 +42,7 @@ class RBLNSiglipVisionModelConfig(RBLNModelConfig):
  interpolate_pos_encoding (Optional[bool]): Whether to interpolate the position encoding.
  output_hidden_states: (Optional[bool]): Whether to return hidden states.
  output_attentions: (Optional[bool]): Whether to return attentions.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
  ValueError: If batch_size is not a positive integer.
@@ -27,7 +27,7 @@ class RBLNSwinBackboneConfig(RBLNModelForImageClassificationConfig):
  """
  Args:
  batch_size (Optional[int]): The batch size for text processing. Defaults to 1.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
  ValueError: If batch_size is not a positive integer.
@@ -25,7 +25,7 @@ class RBLNTimeSeriesTransformerForPredictionConfig(RBLNModelConfig):
  enc_max_seq_len (Optional[int]): Maximum sequence length for the encoder.
  dec_max_seq_len (Optional[int]): Maximum sequence length for the decoder.
  num_parallel_samples (Optional[int]): Number of samples to generate in parallel during prediction.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
  ValueError: If batch_size is not a positive integer.
@@ -38,6 +38,7 @@ class RBLNWav2Vec2ForCTC(RBLNModelForMaskedLM):
  library implements for all its model.

  It implements the methods to convert a pre-trained Wav2Vec2 model into a RBLN Wav2Vec2 model by:
+
  - transferring the checkpoint weights of the original into an optimized RBLN graph,
  - compiling the resulting graph using the RBLN compiler.
  """
@@ -51,7 +51,7 @@ class RBLNWhisperForConditionalGenerationConfig(RBLNModelConfig):
  PagedAttention KV cache for the SelfAttention. Defaults to batch_size.
  kvcache_block_size (int, optional): Sets the size (in number of tokens) of each block
  in the PagedAttention KV cache for the SelfAttention. Defaults to dec_max_seq_len.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
  ValueError: If batch_size is not a positive integer.
@@ -112,6 +112,7 @@ class RBLNWhisperForConditionalGeneration(RBLNModel, RBLNWhisperGenerationMixin):

  This model inherits from [`RBLNModel`]. It implements the methods to convert and run
  pre-trained transformers based Whisper model on RBLN devices by:
+
  - transferring the checkpoint weights of the original into an optimized RBLN graph,
  - compiling the resulting graph using the RBLN compiler.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: optimum-rbln
- Version: 0.8.4a6
+ Version: 0.8.4a7
  Summary: Optimum RBLN is the interface between the HuggingFace Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
  Project-URL: Homepage, https://rebellions.ai
  Project-URL: Documentation, https://docs.rbln.ai