optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.4a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. optimum/rbln/__init__.py +12 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +16 -6
  4. optimum/rbln/diffusers/__init__.py +12 -0
  5. optimum/rbln/diffusers/configurations/__init__.py +3 -0
  6. optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
  7. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
  8. optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
  9. optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
  10. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
  11. optimum/rbln/diffusers/modeling_diffusers.py +1 -1
  12. optimum/rbln/diffusers/models/__init__.py +17 -3
  13. optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
  14. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
  15. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
  16. optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
  17. optimum/rbln/diffusers/models/controlnet.py +17 -2
  18. optimum/rbln/diffusers/models/transformers/prior_transformer.py +16 -2
  19. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +16 -1
  20. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +14 -1
  21. optimum/rbln/diffusers/models/unets/__init__.py +1 -0
  22. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +18 -2
  23. optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
  24. optimum/rbln/diffusers/pipelines/__init__.py +4 -0
  25. optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -2
  26. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
  27. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +13 -4
  28. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +13 -4
  29. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +13 -4
  30. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -4
  31. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +1 -1
  32. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
  33. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -2
  34. optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
  35. optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
  36. optimum/rbln/modeling.py +20 -45
  37. optimum/rbln/modeling_base.py +12 -8
  38. optimum/rbln/transformers/configuration_generic.py +0 -27
  39. optimum/rbln/transformers/modeling_attention_utils.py +242 -109
  40. optimum/rbln/transformers/modeling_generic.py +2 -61
  41. optimum/rbln/transformers/modeling_outputs.py +1 -0
  42. optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
  43. optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
  44. optimum/rbln/transformers/models/auto/auto_factory.py +1 -0
  45. optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
  46. optimum/rbln/transformers/models/bert/modeling_bert.py +86 -1
  47. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +42 -15
  48. optimum/rbln/transformers/models/clip/modeling_clip.py +40 -2
  49. optimum/rbln/transformers/models/colpali/colpali_architecture.py +2 -2
  50. optimum/rbln/transformers/models/colpali/modeling_colpali.py +6 -45
  51. optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +0 -2
  52. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +10 -1
  53. optimum/rbln/transformers/models/decoderonly/configuration_lora.py +1 -1
  54. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +92 -43
  55. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +207 -64
  56. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +17 -9
  57. optimum/rbln/transformers/models/decoderonly/lora_architecture.py +1 -1
  58. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +140 -46
  59. optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +17 -0
  60. optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
  61. optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
  62. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +7 -1
  63. optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +42 -70
  64. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +46 -31
  65. optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +1 -1
  66. optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +24 -9
  67. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -5
  68. optimum/rbln/transformers/models/llava/modeling_llava.py +37 -25
  69. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +3 -5
  70. optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -22
  71. optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
  72. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
  73. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +13 -1
  74. optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +2 -2
  75. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -28
  76. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +8 -9
  77. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -7
  78. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +1 -1
  79. optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +0 -20
  80. optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
  81. optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
  82. optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
  83. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -4
  84. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +36 -12
  85. optimum/rbln/transformers/models/siglip/modeling_siglip.py +17 -1
  86. optimum/rbln/transformers/models/swin/modeling_swin.py +17 -4
  87. optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
  88. optimum/rbln/transformers/models/t5/t5_architecture.py +1 -1
  89. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +25 -10
  90. optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
  91. optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
  92. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +60 -8
  93. optimum/rbln/transformers/models/whisper/generation_whisper.py +48 -14
  94. optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
  95. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +53 -0
  96. optimum/rbln/transformers/utils/rbln_quantization.py +9 -0
  97. optimum/rbln/utils/deprecation.py +213 -0
  98. optimum/rbln/utils/hub.py +14 -3
  99. optimum/rbln/utils/import_utils.py +7 -1
  100. optimum/rbln/utils/runtime_utils.py +32 -0
  101. optimum/rbln/utils/submodule.py +3 -1
  102. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/METADATA +2 -2
  103. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/RECORD +106 -99
  104. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/WHEEL +1 -1
  105. optimum/rbln/utils/depreacate_utils.py +0 -16
  106. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/entry_points.txt +0 -0
  107. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
@@ -12,17 +12,80 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from ...modeling_generic import RBLNModelForAudioClassification
+from typing import TYPE_CHECKING, Optional

+import torch
+from transformers import AutoModelForAudioClassification
+from transformers.modeling_outputs import SequenceClassifierOutput

-class RBLNASTForAudioClassification(RBLNModelForAudioClassification):
+from ....configuration_utils import RBLNCompileConfig
+from ....modeling import RBLNModel
+from .configuration_audio_spectrogram_transformer import RBLNASTForAudioClassificationConfig
+
+
+if TYPE_CHECKING:
+    from transformers import AutoFeatureExtractor, PretrainedConfig, PreTrainedModel
+
+
+class RBLNASTForAudioClassification(RBLNModel):
     """
     Audio Spectrogram Transformer model with an audio classification head on top (a linear layer on top of the pooled output) e.g. for datasets like AudioSet, Speech Commands v2.
-    This model inherits from [`RBLNModelForAudioClassification`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    This model inherits from [RBLNModelForAudioClassification]. Check the superclass documentation for the generic methods the library implements for all its models.

-    A class to convert and run pre-trained transformer-based `ASTForAudioClassification` models on RBLN devices.
-    It implements the methods to convert a pre-trained transformers `ASTForAudioClassification` model into a RBLN transformer model by:
+    A class to convert and run pre-trained transformer-based ASTForAudioClassification models on RBLN devices.
+    It implements the methods to convert a pre-trained transformers ASTForAudioClassification model into a RBLN transformer model by:

     - transferring the checkpoint weights of the original into an optimized RBLN graph,
     - compiling the resulting graph using the RBLN Compiler.
     """
+
+    auto_model_class = AutoModelForAudioClassification
+
+    @classmethod
+    def _update_rbln_config(
+        cls,
+        preprocessors: "AutoFeatureExtractor" = None,
+        model: Optional["PreTrainedModel"] = None,
+        model_config: "PretrainedConfig" = None,
+        rbln_config: Optional[RBLNASTForAudioClassificationConfig] = None,
+    ) -> RBLNASTForAudioClassificationConfig:
+        num_mel_bins = getattr(model_config, "num_mel_bins", None)
+
+        if rbln_config.max_length is None:
+            rbln_config.max_length = getattr(model_config, "max_length", None)
+            for feature_extractor in preprocessors:
+                if hasattr(feature_extractor, "max_length"):
+                    rbln_config.max_length = feature_extractor.max_length
+                    break
+
+        if rbln_config.max_length is None:
+            raise ValueError("max_length should be specified!")
+
+        input_info = [
+            (
+                "input_values",
+                [rbln_config.batch_size, rbln_config.max_length, num_mel_bins],
+                "float32",
+            ),
+        ]
+
+        rbln_config.set_compile_cfgs([RBLNCompileConfig(input_info=input_info)])
+        return rbln_config
+
+    def forward(self, input_values: torch.Tensor, **kwargs) -> SequenceClassifierOutput:
+        """
+        Forward pass for the RBLN-optimized Audio Spectrogram Transformer model for audio classification.
+
+        Args:
+            input_values (torch.FloatTensor of shape (batch_size, max_length, num_mel_bins)):
+                Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by
+                loading a .flac or .wav audio file into an array of type list[float], a numpy.ndarray or a torch.Tensor, *e.g.* via
+                the torchcodec library (pip install torchcodec) or the soundfile library (pip install soundfile).
+                To prepare the array into input_features, the [AutoFeatureExtractor] should be used for extracting the
+                mel features, padding and conversion into a tensor of type torch.FloatTensor.
+
+        Returns:
+            Returns a SequenceClassifierOutput object.
+        """
+
+        return super().forward(input_values, **kwargs)
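For orientation, a minimal usage sketch for the reworked class above. It assumes the optimum-style from_pretrained(..., export=True) entry point and the rbln_-prefixed compile kwargs referenced elsewhere in this diff; the checkpoint name and rbln_batch_size kwarg are illustrative assumptions, not taken from this release.

# Hedged sketch: compile and run RBLNASTForAudioClassification on an RBLN device.
import torch
from transformers import AutoFeatureExtractor
from optimum.rbln import RBLNASTForAudioClassification

model_id = "MIT/ast-finetuned-audioset-10-10-0.4593"  # example checkpoint
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

# max_length falls back to the feature extractor's max_length when not set,
# per _update_rbln_config above.
model = RBLNASTForAudioClassification.from_pretrained(model_id, export=True, rbln_batch_size=1)

waveform = torch.randn(16000).numpy()  # one second of dummy 16 kHz audio
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
logits = model(inputs.input_values).logits
print(logits.argmax(-1))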
optimum/rbln/transformers/models/auto/auto_factory.py
@@ -150,6 +150,7 @@ class _BaseAutoModelClass:
                 f"from the checkpoint, leading to potential unintended behavior. If this is not intentional, consider calling the "
                 f"`from_pretrained()` method directly from the `RBLN{config.architectures[0]}` class instead.",
                 UserWarning,
+                stacklevel=2,
             )

         return model_class
optimum/rbln/transformers/models/bart/modeling_bart.py
@@ -13,9 +13,11 @@
 # limitations under the License.

 import inspect
-from typing import Any, Callable
+from typing import Any, Callable, Optional, Tuple, Union

+import torch
 from transformers import BartForConditionalGeneration, PreTrainedModel
+from transformers.modeling_outputs import Seq2SeqModelOutput

 from ....utils.logging import get_logger
 from ...modeling_generic import RBLNTransformerEncoderForFeatureExtraction
@@ -35,6 +37,25 @@ class RBLNBartModel(RBLNTransformerEncoderForFeatureExtraction):
     on RBLN devices, optimized for feature extraction use cases.
     """

+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[Tuple, Seq2SeqModelOutput]:
+        """
+        Forward pass for the RBLN-optimized BART model for feature extraction tasks.
+
+        Args:
+            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
+            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
+
+        Returns:
+            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a Seq2SeqModelOutput object.
+        """
+
+        return super().forward(input_ids, attention_mask, **kwargs)
+

 class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
     """
@@ -48,7 +69,7 @@ class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
     support_causal_attn = True

     @classmethod
-    def wrap_model_if_needed(self, model: PreTrainedModel, rbln_config: RBLNBartForConditionalGenerationConfig):
+    def _wrap_model_if_needed(self, model: PreTrainedModel, rbln_config: RBLNBartForConditionalGenerationConfig):
         return BartWrapper(
             model, enc_max_seq_len=rbln_config.enc_max_seq_len, use_attention_mask=rbln_config.use_attention_mask
         )
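The wrap_model_if_needed to _wrap_model_if_needed rename above recurs throughout this release (see the BERT, BLIP-2, CLIP, and ColPali hunks below). A hedged sketch of what that implies for downstream code that overrides the hook; MyBartWrapper and MyRBLNBart are hypothetical names used only for illustration, and only the hook name itself comes from this diff.

# Hedged sketch: subclasses that customized the old public hook must track the rename.
import torch
from optimum.rbln import RBLNBartForConditionalGeneration


class MyBartWrapper(torch.nn.Module):
    # Hypothetical wrapper around the module that gets traced and compiled.
    def __init__(self, wrapped: torch.nn.Module):
        super().__init__()
        self.wrapped = wrapped

    def forward(self, *args, **kwargs):
        return self.wrapped(*args, **kwargs)


class MyRBLNBart(RBLNBartForConditionalGeneration):
    @classmethod
    def _wrap_model_if_needed(cls, model, rbln_config):  # was wrap_model_if_needed in 0.9.3rc0
        return MyBartWrapper(super()._wrap_model_if_needed(model, rbln_config))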
optimum/rbln/transformers/models/bert/modeling_bert.py
@@ -12,7 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from typing import Optional, Tuple, Union
+
 import torch
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    MaskedLMOutput,
+    QuestionAnsweringModelOutput,
+)

 from ...modeling_generic import (
     RBLNModelForMaskedLM,
@@ -35,9 +42,45 @@ class RBLNBertModel(RBLNTransformerEncoderForFeatureExtraction):
     rbln_model_input_names = ["input_ids", "attention_mask"]

     @classmethod
-    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNBertModelConfig) -> torch.nn.Module:
+    def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNBertModelConfig) -> torch.nn.Module:
         return BertModelWrapper(model, rbln_config)

+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, Tuple]:
+        """
+        Forward pass for the RBLN-optimized BERT model for feature extraction tasks.
+
+        Args:
+            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
+            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
+            token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
+            position_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of positions of each input sequence tokens in the position embeddings.
+
+        Returns:
+            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPoolingAndCrossAttentions object.
+        """
+
+        input_map = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "token_type_ids": token_type_ids,
+            "position_ids": position_ids,
+        }
+
+        model_input_names = getattr(self.rbln_config, "model_input_names", None)
+        if model_input_names is None:
+            model_input_names = self.rbln_model_input_names
+
+        ordered_inputs = [input_map[name] for name in model_input_names if name in input_map]
+
+        return super().forward(*ordered_inputs, **kwargs)
+

 class RBLNBertForMaskedLM(RBLNModelForMaskedLM):
     """
@@ -50,6 +93,27 @@ class RBLNBertForMaskedLM(RBLNModelForMaskedLM):

     rbln_model_input_names = ["input_ids", "attention_mask", "token_type_ids"]

+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[MaskedLMOutput, Tuple]:
+        """
+        Forward pass for the RBLN-optimized BERT model for masked language modeling tasks.
+
+        Args:
+            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
+            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
+            token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
+
+        Returns:
+            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a MaskedLMOutput object.
+        """
+
+        return super().forward(input_ids, attention_mask, token_type_ids, **kwargs)
+

 class RBLNBertForQuestionAnswering(RBLNModelForQuestionAnswering):
     """
@@ -61,3 +125,24 @@ class RBLNBertForQuestionAnswering(RBLNModelForQuestionAnswering):
     """

     rbln_model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[QuestionAnsweringModelOutput, Tuple]:
+        """
+        Forward pass for the RBLN-optimized BERT model for question answering tasks.
+
+        Args:
+            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
+            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
+            token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
+
+        Returns:
+            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a QuestionAnsweringModelOutput object.
+        """
+
+        return super().forward(input_ids, attention_mask, token_type_ids, **kwargs)
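The new RBLNBertModel.forward above maps keyword inputs back to the positional order the compiled graph expects, using input_map and rbln_model_input_names. A self-contained sketch of that ordering logic, with dummy tensors whose shapes are illustrative only:

# Hedged sketch of the input-ordering logic added to RBLNBertModel.forward:
# named kwargs are reduced to the positional order used at compile time.
import torch

rbln_model_input_names = ["input_ids", "attention_mask"]  # class default shown in the diff

input_map = {
    "input_ids": torch.ones(1, 128, dtype=torch.long),
    "attention_mask": torch.ones(1, 128, dtype=torch.long),
    "token_type_ids": None,  # not in the compiled input list, so it is dropped
    "position_ids": None,
}

ordered_inputs = [input_map[name] for name in rbln_model_input_names if name in input_map]
print([t.shape for t in ordered_inputs])  # two (1, 128) tensors, in compile order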
optimum/rbln/transformers/models/blip_2/modeling_blip_2.py
@@ -14,7 +14,7 @@

 import inspect
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, Union

 import torch
 from transformers import (
@@ -71,7 +71,7 @@ class RBLNBlip2VisionModel(RBLNModel):
         return self.embeddings

     @classmethod
-    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
+    def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
         class Blip2VisionModelWrapper(torch.nn.Module):
             def __init__(self, model: "Blip2VisionModel") -> None:
                 super().__init__()
@@ -111,11 +111,20 @@ class RBLNBlip2VisionModel(RBLNModel):
     def forward(
         self,
         pixel_values: torch.FloatTensor,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        """
+        Forward pass for the RBLN-optimized Blip2VisionModel model.
+
+        Args:
+            pixel_values (torch.FloatTensor of shape (batch_size, num_channels, height, width)): The tensors corresponding to the input images.
+            interpolate_pos_encoding (bool, optional): Whether to interpolate the positional encoding of the image embeddings. Defaults to False.
+            return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
+
+        Returns:
+            BaseModelOutputWithPooling or tuple(torch.FloatTensor): The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPooling object.
+        """
         batch_size = pixel_values.shape[0]
         outputs = []
         for i in range(batch_size):
@@ -151,7 +160,7 @@ class RBLNBlip2QFormerModel(RBLNModel):
         return self.embeddings.word_embeddings

     @classmethod
-    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
+    def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
         class Blip2QFormerModelWrapper(torch.nn.Module):
             def __init__(self, model: "Blip2QFormerModel"):
                 super().__init__()
@@ -231,17 +240,22 @@ class RBLNBlip2QFormerModel(RBLNModel):
     def forward(
         self,
         query_embeds: torch.FloatTensor,
-        query_length: Optional[int] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
         encoder_hidden_states: Optional[torch.FloatTensor] = None,
         encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        """
+        The forward pass for the RBLN-optimized Blip2QFormerModel model.
+
+        Args:
+            query_embeds (torch.FloatTensor): Hidden states to be used in the attention computation.
+            encoder_hidden_states (torch.FloatTensor, optional): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder.
+            encoder_attention_mask (torch.FloatTensor, optional): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder.
+            return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
+
+        Returns:
+            BaseModelOutputWithPoolingAndCrossAttentions or tuple(torch.FloatTensor): The model outputs. If `return_dict=False` is passed, returns a tuple of tensors. Otherwise, returns a `BaseModelOutputWithPoolingAndCrossAttentions` object.
+        """
         batch_size = query_embeds.shape[0]
         outputs = []
         for i in range(batch_size):
@@ -349,7 +363,7 @@ class RBLNBlip2ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixi
         return self.language_model.get_input_embeddings()

     @classmethod
-    def wrap_model_if_needed(cls, model, rbln_config):
+    def _wrap_model_if_needed(cls, model, rbln_config):
         return model.language_projection

     @classmethod
@@ -444,7 +458,20 @@ class RBLNBlip2ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixi
         inputs_embeds: Optional[torch.FloatTensor] = None,
         interpolate_pos_encoding: bool = False,
         **generate_kwargs,
-    ) -> torch.LongTensor:
+    ) -> List[torch.LongTensor]:
+        """
+        The generate function is utilized in its standard form as in the HuggingFace transformers library. User can use this function to generate text from the model.
+        Check the [HuggingFace transformers documentation](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/blip-2#transformers.Blip2ForConditionalGeneration.generate) for more details.
+
+        Args:
+            pixel_values (torch.FloatTensor): Input images to be processed.
+            input_ids (torch.LongTensor, optional): The sequence used as a prompt for the generation.
+            attention_mask (torch.LongTensor, optional): Mask to avoid performing attention on padding token indices
+            inputs_embeds (torch.FloatTensor, optional): Embedded representation of the inputs. Should be float, not int tokens.
+            interpolate_pos_encoding (bool, optional, defaults to False) — Whether to interpolate the positional encoding of the image embeddings.
+        Returns:
+            A list of strings of length batch_size * num_captions.
+        """
         batch_size = pixel_values.shape[0]
         image_embeds = self.vision_model(
             pixel_values,
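A hedged sketch of the generate path documented above, which keeps the standard transformers signature. The checkpoint name, processor usage, and export=True kwarg are illustrative assumptions rather than part of this diff.

# Hedged sketch: image captioning with RBLNBlip2ForConditionalGeneration.
from PIL import Image
from transformers import Blip2Processor
from optimum.rbln import RBLNBlip2ForConditionalGeneration

model_id = "Salesforce/blip2-opt-2.7b"  # example checkpoint
processor = Blip2Processor.from_pretrained(model_id)
model = RBLNBlip2ForConditionalGeneration.from_pretrained(model_id, export=True)

image = Image.open("example.jpg")  # any local image
inputs = processor(images=image, text="Question: what is in the photo? Answer:", return_tensors="pt")

# generate() is used in its standard transformers form, per the docstring above.
generated_ids = model.generate(**inputs, max_new_tokens=20)
print(processor.batch_decode(generated_ids, skip_special_tokens=True))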
optimum/rbln/transformers/models/clip/modeling_clip.py
@@ -54,7 +54,7 @@ class RBLNCLIPTextModel(RBLNModel):
     _tp_support = False

     @classmethod
-    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPTextModelConfig) -> torch.nn.Module:
+    def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPTextModelConfig) -> torch.nn.Module:
         return _TextEncoder(model).eval()

     @classmethod
@@ -92,6 +92,9 @@ class RBLNCLIPTextModel(RBLNModel):
         Args:
             input_ids (torch.LongTensor): The input ids to the model.
             return_dict (Optional[bool]): Whether to return a dictionary of outputs.
+
+        Returns:
+            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a CLIPTextModelOutput object.
         """

         # To ignore using attention_mask, we override forward method.
@@ -157,7 +160,7 @@ class RBLNCLIPVisionModel(RBLNModel):
     _tp_support = False

     @classmethod
-    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPVisionModelConfig) -> torch.nn.Module:
+    def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPVisionModelConfig) -> torch.nn.Module:
         wrapper_cfg = {
             "interpolate_pos_encoding": rbln_config.interpolate_pos_encoding,
             "output_hidden_states": rbln_config.output_hidden_states,
@@ -230,6 +233,9 @@ class RBLNCLIPVisionModel(RBLNModel):
             output_attentions (Optional[bool]): Whether to return attentions.
             output_hidden_states (Optional[bool]): Whether to return hidden states.
             interpolate_pos_encoding (bool): Whether to interpolate position encoding.
+
+        Returns:
+            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPooling object.
         """

         if len(kwargs) > 0 and any(value is not None for value in kwargs.values()):
@@ -307,6 +313,38 @@ class RBLNCLIPVisionModelWithProjection(RBLNCLIPVisionModel):
     multimodal embedding alignment tasks.
     """

+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        return_dict: bool = True,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        **kwargs,
+    ) -> Union[Tuple, CLIPVisionModelOutput]:
+        """
+        Forward pass for the RBLN-optimized CLIP vision encoder model with projection.
+
+        Args:
+            pixel_values (torch.Tensor): The pixel values to the model.
+            return_dict (bool): Whether to return a dictionary of outputs.
+            output_attentions (Optional[bool]): Whether to return attentions.
+            output_hidden_states (Optional[bool]): Whether to return hidden states.
+            interpolate_pos_encoding (bool): Whether to interpolate position encoding.
+
+        Returns:
+            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a CLIPVisionModelOutput object.
+        """
+
+        return super().forward(
+            pixel_values=pixel_values,
+            return_dict=return_dict,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            **kwargs,
+        )
+
     def _prepare_output(self, output, return_dict):
         # Prepare model output based on return_dict flag.
         # This method can be overridden by subclasses to provide task-specific output handling.
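A hedged sketch of calling the explicit forward added to RBLNCLIPVisionModelWithProjection above. The checkpoint name, the export=True kwarg, and the input tensor shape are illustrative assumptions; the compiled image size depends on the model's configuration.

# Hedged sketch: projected image embeddings from the RBLN CLIP vision encoder.
import torch
from optimum.rbln import RBLNCLIPVisionModelWithProjection

model = RBLNCLIPVisionModelWithProjection.from_pretrained(
    "openai/clip-vit-base-patch32",  # example checkpoint
    export=True,
)

pixel_values = torch.randn(1, 3, 224, 224)  # shape must match the compiled image size
out = model(pixel_values, return_dict=True)
print(out.image_embeds.shape)  # CLIPVisionModelOutput carries the projected embeddings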
optimum/rbln/transformers/models/colpali/colpali_architecture.py
@@ -156,8 +156,8 @@ class ColPaliAttention(nn.Module):
     def __init__(self, self_attn):
         super().__init__()
         self._original_mod = self_attn
-        self.num_heads = getattr(self._original_mod, "num_heads", None) or getattr(
-            self._original_mod.config, "num_attention_heads"
+        self.num_heads = (
+            getattr(self._original_mod, "num_heads", None) or self._original_mod.config.num_attention_heads
         )
         self.head_dim = self._original_mod.head_dim
         self.scaling = self.head_dim**-0.5
optimum/rbln/transformers/models/colpali/modeling_colpali.py
@@ -14,8 +14,7 @@

 import bisect
 from pathlib import Path
-from tempfile import TemporaryDirectory
-from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Optional, Tuple, Union

 import torch
 from transformers import PretrainedConfig, PreTrainedModel
@@ -182,7 +181,7 @@ class RBLNColPaliForRetrieval(RBLNModel):
         return multi_modal_projector

     @classmethod
-    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
+    def _wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
         return RBLNColPaliForRetrievalWrapper(
             causal_lm=model.vlm,
             embedding_proj_layer=model.embedding_proj_layer,
@@ -236,49 +235,11 @@ class RBLNColPaliForRetrieval(RBLNModel):
         return rbln_config

     @classmethod
-    def from_model(
-        cls,
-        model: "PreTrainedModel",
-        config: Optional[PretrainedConfig] = None,
-        rbln_config: Optional[Union[RBLNModelConfig, Dict]] = None,
-        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
-        subfolder: str = "",
-        **kwargs: Any,
-    ) -> "RBLNModel":
-        """
-        Converts and compiles a pre-trained HuggingFace library model into a RBLN model.
-        This method performs the actual model conversion and compilation process.
-
-        Args:
-            model (PreTrainedModel): The PyTorch model to be compiled.
-                The object must be an instance of the HuggingFace transformers PreTrainedModel class.
-            config (Optional[PretrainedConfig]): The configuration object associated with the model.
-            rbln_config (Optional[Union[RBLNModelConfig, Dict]]): Configuration for RBLN model compilation and runtime.
-                This can be provided as a dictionary or an instance of the model's configuration class (e.g., `RBLNLlamaForCausalLMConfig` for Llama models).
-                For detailed configuration options, see the specific model's configuration class documentation.
-            kwargs: Additional keyword arguments. Arguments with the prefix `rbln_` are passed to rbln_config, while the remaining arguments are passed to the HuggingFace library.
-
-        The method performs the following steps:
-
-        1. Compiles the PyTorch model into an optimized RBLN graph
-        2. Configures the model for the specified NPU device
-        3. Creates the necessary runtime objects if requested
-        4. Saves the compiled model and configurations
-
-        Returns:
-            (RBLNModel): A RBLN model instance ready for inference on RBLN NPU devices.
-        """
-        if not hasattr(model, "vision_tower"):
+    def _reconstruct_model_if_needed(cls, model: "PreTrainedModel"):
+        if hasattr(model, "vlm"):
             model.vision_tower = model.vlm.vision_tower
             del model.vlm.model.vision_tower
-        model = super().from_model(model, config, rbln_config, model_save_dir, subfolder, **kwargs)
-        return model
-
-    @classmethod
-    def get_pytorch_model(cls, *args, **kwargs):
-        model = super().get_pytorch_model(*args, **kwargs)
-        model.vision_tower = model.vlm.vision_tower
-        del model.vlm.model.vision_tower
+            return model
         return model

     def get_image_features(self, pixel_values: torch.Tensor):
@@ -371,7 +332,7 @@ class RBLNColPaliForRetrieval(RBLNModel):
         ]
         outputs.append(torch.empty(size=language_model_out_size, dtype=torch.float32, device="cpu"))
         if self.rbln_config.output_hidden_states:
-            for i in range(self.config.vlm_config.text_config.num_hidden_layers + 1):
+            for _ in range(self.config.vlm_config.text_config.num_hidden_layers + 1):
                 outputs.append(torch.empty(size=language_model_hidden_states_size, dtype=torch.float32, device="cpu"))

         # Embedding_proj_layer is fused on the bottom of the language model.
optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py
@@ -58,7 +58,6 @@ class RBLNColQwen2ForRetrievalConfig(RBLNDecoderOnlyModelConfig):
         visual: Optional[RBLNModelConfig] = None,
         batch_size: Optional[int] = None,
         use_inputs_embeds: bool = True,
-        output_hidden_states: Optional[bool] = False,
         **kwargs,
     ):
         super().__init__(use_inputs_embeds=use_inputs_embeds, **kwargs)
@@ -71,4 +70,3 @@ class RBLNColQwen2ForRetrievalConfig(RBLNDecoderOnlyModelConfig):
             raise ValueError("batch_size is not supported for RBLNColQwen2ForRetrievalConfig")

         self.visual = visual
-        self.output_hidden_states = output_hidden_states
optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py
@@ -58,6 +58,7 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
         sliding_window_layers: Optional[List[int]] = None,
         phases: Optional[List[PhaseType]] = None,
         logits_to_keep: Optional[int] = None,
+        output_hidden_states: Optional[bool] = None,
         **kwargs,
     ):
         """
@@ -112,6 +113,7 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
                 ["prefill", "decode"] if DecoderOnlyModelForCausalLM is used.
             logits_to_keep (Optional[int]): The number of logits to keep for the decoder. If set to 0, the decoder will keep all logits.
                 Defaults to 0 if DecoderOnlyModel is used, 1 if DecoderOnlyModelForCausalLM is used.
+            output_hidden_states (Optional[bool]): Whether to output the hidden states of the decoder. Defaults to False.
             kwargs: Additional arguments passed to the parent RBLNModelConfig.

         Raises:
@@ -232,6 +234,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
         if self.logits_to_keep is not None and self.logits_to_keep > 1:
             raise NotImplementedError("`logits_to_keep` > 1 is currently not supported for RBLN models.")

+        self.output_hidden_states = output_hidden_states or False
+
         self.decoder_batch_sizes = None
         if "decode" in self.phases:
             self.decoder_batch_sizes = decoder_batch_sizes
@@ -274,13 +278,18 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):

     @property
     def use_lora(self):
-        """Check if LoRA is enabled for this configuration."""
         return self.lora_config is not None

     @property
     def can_generate(self) -> bool:
         return "decode" in self.phases

+    @property
+    def nbits_per_param(self) -> int:
+        if self.quantization:
+            return self.quantization.nbits_per_param
+        return 16
+

 class RBLNDecoderOnlyModelForCausalLMConfig(RBLNDecoderOnlyModelConfig):
     """
optimum/rbln/transformers/models/decoderonly/configuration_lora.py
@@ -183,7 +183,7 @@ class RBLNLoRAAdapterConfig(RBLNSerializableConfigProtocol):
                 f"Failed to download LoRA adapter '{path.as_posix()}' from HuggingFace Hub. "
                 f"Please check if the model ID is correct or provide a valid local path. "
                 f"Error: {e}"
-            )
+            ) from e

     def _load_adapter_config(self) -> Dict[str, Any]:
         """