optimum-rbln 0.9.3__py3-none-any.whl → 0.9.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. optimum/rbln/__init__.py +0 -12
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +2 -4
  4. optimum/rbln/diffusers/__init__.py +0 -12
  5. optimum/rbln/diffusers/configurations/__init__.py +0 -3
  6. optimum/rbln/diffusers/configurations/models/__init__.py +0 -2
  7. optimum/rbln/diffusers/configurations/pipelines/__init__.py +0 -3
  8. optimum/rbln/diffusers/models/__init__.py +3 -17
  9. optimum/rbln/diffusers/models/autoencoders/__init__.py +0 -1
  10. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
  11. optimum/rbln/diffusers/models/autoencoders/vae.py +8 -27
  12. optimum/rbln/diffusers/models/controlnet.py +1 -16
  13. optimum/rbln/diffusers/models/transformers/prior_transformer.py +2 -16
  14. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +1 -16
  15. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +1 -14
  16. optimum/rbln/diffusers/models/unets/__init__.py +0 -1
  17. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +1 -17
  18. optimum/rbln/diffusers/pipelines/__init__.py +0 -4
  19. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +0 -20
  20. optimum/rbln/modeling.py +45 -20
  21. optimum/rbln/modeling_base.py +1 -0
  22. optimum/rbln/transformers/configuration_generic.py +27 -0
  23. optimum/rbln/transformers/modeling_attention_utils.py +109 -242
  24. optimum/rbln/transformers/modeling_generic.py +61 -2
  25. optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +2 -28
  26. optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +5 -68
  27. optimum/rbln/transformers/models/bart/modeling_bart.py +2 -23
  28. optimum/rbln/transformers/models/bert/modeling_bert.py +1 -86
  29. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +15 -42
  30. optimum/rbln/transformers/models/clip/modeling_clip.py +2 -40
  31. optimum/rbln/transformers/models/colpali/modeling_colpali.py +44 -5
  32. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +1 -6
  33. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +2 -6
  34. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +9 -17
  35. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +12 -36
  36. optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +0 -17
  37. optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +0 -24
  38. optimum/rbln/transformers/models/dpt/modeling_dpt.py +0 -17
  39. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +5 -3
  40. optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +8 -24
  41. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +5 -3
  42. optimum/rbln/transformers/models/llava/modeling_llava.py +24 -36
  43. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +4 -2
  44. optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
  45. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
  46. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +1 -13
  47. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +3 -2
  48. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +3 -2
  49. optimum/rbln/transformers/models/resnet/configuration_resnet.py +0 -17
  50. optimum/rbln/transformers/models/resnet/modeling_resnet.py +0 -73
  51. optimum/rbln/transformers/models/roberta/modeling_roberta.py +0 -33
  52. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +4 -2
  53. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +10 -34
  54. optimum/rbln/transformers/models/siglip/modeling_siglip.py +1 -17
  55. optimum/rbln/transformers/models/swin/modeling_swin.py +1 -14
  56. optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
  57. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +2 -16
  58. optimum/rbln/transformers/models/vit/modeling_vit.py +0 -19
  59. optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +3 -15
  60. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +8 -60
  61. optimum/rbln/transformers/models/whisper/generation_whisper.py +14 -48
  62. optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
  63. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +0 -43
  64. optimum/rbln/transformers/utils/rbln_quantization.py +0 -9
  65. optimum/rbln/utils/depreacate_utils.py +16 -0
  66. optimum/rbln/utils/hub.py +3 -14
  67. optimum/rbln/utils/runtime_utils.py +0 -32
  68. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/METADATA +2 -2
  69. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/RECORD +72 -79
  70. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/WHEEL +1 -1
  71. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +0 -67
  72. optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +0 -59
  73. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +0 -114
  74. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +0 -275
  75. optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +0 -201
  76. optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +0 -15
  77. optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +0 -46
  78. optimum/rbln/utils/deprecation.py +0 -213
  79. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/entry_points.txt +0 -0
  80. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/distilbert/modeling_distilbert.py
@@ -12,11 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, Tuple, Union
-
-import torch
-from transformers.modeling_outputs import QuestionAnsweringModelOutput
-
 from ...modeling_generic import RBLNModelForQuestionAnswering
 
 
@@ -30,22 +25,3 @@ class RBLNDistilBertForQuestionAnswering(RBLNModelForQuestionAnswering):
     """
 
     rbln_model_input_names = ["input_ids", "attention_mask"]
-
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
-        """
-        Forward pass for the RBLN-optimized DistilBERT model for question answering tasks.
-
-        Args:
-            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
-            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a QuestionAnsweringModelOutput object.
-        """
-
-        return super().forward(input_ids, attention_mask, **kwargs)
optimum/rbln/transformers/models/dpt/modeling_dpt.py
@@ -13,11 +13,6 @@
 # limitations under the License.
 
 
-from typing import Tuple, Union
-
-import torch
-from transformers.modeling_outputs import DepthEstimatorOutput
-
 from ...modeling_generic import RBLNModelForDepthEstimation
 
 
@@ -28,15 +23,3 @@ class RBLNDPTForDepthEstimation(RBLNModelForDepthEstimation):
     This class provides hardware-accelerated inference for DPT (Dense Prediction Transformer)
     models on RBLN devices, supporting monocular depth estimation from single images.
     """
-
-    def forward(self, pixel_values: torch.Tensor, **kwargs) -> Union[Tuple, DepthEstimatorOutput]:
-        """
-        Forward pass for the RBLN-optimized DPT model.
-
-        Args:
-            pixel_values (torch.FloatTensor of shape (batch_size, num_channels, image_size, image_size)): The tensors corresponding to the input images.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a DepthEstimatorOutput object.
-        """
-        return super().forward(pixel_values, **kwargs)
optimum/rbln/transformers/models/gemma3/modeling_gemma3.py
@@ -99,7 +99,9 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMix
         return True
 
     @classmethod
-    def _reconstruct_model_if_needed(cls, model: "PreTrainedModel"):
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+
         with no_init_weights():
             model_cls_name = model.model.language_model.__class__.__name__
             causal_model_cls_name = model_cls_name.replace("TextModel", "ForCausalLM")
@@ -133,7 +135,7 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMix
         return self.language_model.get_input_embeddings()
 
     @classmethod
-    def _wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
+    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
         return model.multi_modal_projector
 
     @classmethod
@@ -480,7 +482,7 @@ class RBLNGemma3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     @classmethod
     @torch.inference_mode()
     def get_compiled_model(cls, model: "PreTrainedModel", rbln_config: RBLNGemma3ForCausalLMConfig):
-        wrapped_model = cls._wrap_model_if_needed(model, rbln_config)
+        wrapped_model = cls.wrap_model_if_needed(model, rbln_config)
 
         rbln_compile_configs = rbln_config.compile_cfgs
         prefill_compile_config = rbln_compile_configs[0]
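Note: on the 0.9.3rc0 side of these hunks (and the matching ones for grounding_dino, idefics3, llava, llava_next, qwen2_vl and qwen2_5_vl below), the _reconstruct_model_if_needed hook is replaced by an override of get_pytorch_model that first defers to the parent loader and then restructures the returned model. A minimal, self-contained sketch of that override pattern follows; BaseLoader, VLMLoader and the string placeholder are illustrative stand-ins, not optimum-rbln classes.

class BaseLoader:
    @classmethod
    def get_pytorch_model(cls, *args, **kwargs):
        # stand-in for the parent class that actually loads the PyTorch model
        return {"language_model": "Gemma3TextModel"}


class VLMLoader(BaseLoader):
    @classmethod
    def get_pytorch_model(cls, *args, **kwargs):
        # defer to the parent loader first, as the added lines do ...
        model = super().get_pytorch_model(*args, **kwargs)
        # ... then restructure the result (the real hunks re-wrap the language
        # model under no_init_weights(), e.g. "TextModel" -> "ForCausalLM")
        model["language_model"] = model["language_model"].replace("TextModel", "ForCausalLM")
        return model


print(VLMLoader.get_pytorch_model())  # {'language_model': 'Gemma3ForCausalLM'}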
optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from pathlib import Path
-from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import torch
 from torch import Tensor, nn
@@ -206,7 +206,8 @@ class RBLNGroundingDinoForObjectDetection(RBLNModel):
         torch.save(save_dict, save_dir_path / subfolder / "torch_artifacts.pth")
 
     @classmethod
-    def _reconstruct_model_if_needed(cls, model: "PreTrainedModel"):
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
         model.encoder = model.model.encoder
         model.decoder = model.model.decoder
         model.text_backbone = model.model.text_backbone
@@ -216,7 +217,7 @@ class RBLNGroundingDinoForObjectDetection(RBLNModel):
         return model
 
     @classmethod
-    def _wrap_model_if_needed(
+    def wrap_model_if_needed(
         cls, model: torch.nn.Module, rbln_config: RBLNGroundingDinoForObjectDetectionConfig
     ) -> torch.nn.Module:
         return model.model.text_projection
@@ -529,26 +530,9 @@ class RBLNGroundingDinoForObjectDetection(RBLNModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None,
         **kwargs,
-    ) -> Union[GroundingDinoObjectDetectionOutput, Tuple]:
-        """
-        Forward pass for the RBLN-optimized GroundingDinoForObjectDetection model.
-
-        Args:
-            pixel_values (torch.Tensor of shape (batch_size, num_channels, image_size, image_size)): The tensors corresponding to the input images.
-            input_ids (torch.LongTensor of shape (batch_size, text_sequence_length)): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it.
-            token_type_ids (torch.LongTensor of shape (batch_size, text_sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
-            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
-            pixel_mask (torch.Tensor of shape (batch_size, height, width), optional): Mask to avoid performing attention on padding pixel values.
-            encoder_outputs (Tuple consists of last_hidden_state of shape(batch_size, sequence_length, hidden_size), optional): A sequence of hidden-states at the output of the last layer of the encoder.
-            output_attentions (bool, optional): Whether or not to return the attentions tensors of all attention layers.
-            output_hidden_states (bool, optional): Whether or not to return the hidden states of all layers.
-            return_dict (bool, optional): Whether or not to return a ModelOutput instead of a plain tuple.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a GroundingDinoObjectDetectionOutput object.
-        """
-
+    ):
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         # Pad image to rbln_config.image_height and rbln_config.image_width
@@ -679,7 +663,7 @@ class RBLNGroundingDinoEncoder(RBLNModel):
         self.encoder_runtime = RBLNPytorchRuntime(self.model[0])
 
     @classmethod
-    def _wrap_model_if_needed(
+    def wrap_model_if_needed(
         cls, model: torch.nn.Module, rbln_config: RBLNGroundingDinoForObjectDetectionConfig
     ) -> torch.nn.Module:
         model = _GroundingDinoEncoder(model, rbln_config).eval()
@@ -877,7 +861,7 @@ class RBLNGroundingDinoDecoder(RBLNModel):
         self.decoder_runtime = RBLNPytorchRuntime(self.model[0])
 
     @classmethod
-    def _wrap_model_if_needed(
+    def wrap_model_if_needed(
         cls, model: torch.nn.Module, rbln_config: RBLNGroundingDinoForObjectDetectionConfig
     ) -> torch.nn.Module:
         return _GroundingDinoDecoder(model, rbln_config).eval()
optimum/rbln/transformers/models/idefics3/modeling_idefics3.py
@@ -110,7 +110,7 @@ class RBLNIdefics3VisionTransformer(RBLNModel):
         return self.embeddings
 
     @classmethod
-    def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
+    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
         class Idefics3VisionTransformerWrapper(torch.nn.Module):
             def __init__(self, model: "Idefics3VisionTransformer"):
                 super().__init__()
@@ -240,7 +240,9 @@ class RBLNIdefics3ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationM
         return True
 
     @classmethod
-    def _reconstruct_model_if_needed(cls, model: "PreTrainedModel"):
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+
         with no_init_weights():
             model_cls_name = model.model.text_model.__class__.__name__
             causal_model_cls_name = model_cls_name.replace("Model", "ForCausalLM")
@@ -269,7 +271,7 @@ class RBLNIdefics3ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationM
         return self.text_model.get_input_embeddings()
 
     @classmethod
-    def _wrap_model_if_needed(cls, model, rbln_config):
+    def wrap_model_if_needed(cls, model, rbln_config):
         return model.model.connector
 
     @classmethod
optimum/rbln/transformers/models/llava/modeling_llava.py
@@ -88,22 +88,15 @@ class LoopVisionTower(LoopProcessor):
 
 
 class LoopProjector(LoopProcessor):
-    def __init__(self, multi_modal_projector: "RBLNModel", rbln_config=None):
+    def __init__(self, multi_modal_projector: "RBLNModel"):
         super().__init__(model=multi_modal_projector)
-        self.rbln_config = rbln_config
 
     def _get_batch_size(self, image_feature, **kwargs):
         return image_feature.shape[0]
 
     def _prepare_inputs_for_iteration(self, index, common_inputs, image_feature, **kwargs):
         image_feature_item = image_feature[index : index + 1]
-        if hasattr(self.rbln_config.vision_tower, "max_image_size"):
-            out_buffer = [
-                tensor[:, index * image_feature.shape[1] : (index + 1) * image_feature.shape[1], :]
-                for tensor in kwargs["out"]
-            ]
-        else:
-            out_buffer = [tensor[index : index + 1] for tensor in kwargs["out"]]
+        out_buffer = [tensor[index : index + 1] for tensor in kwargs["out"]]
         return ([image_feature_item], {"out": out_buffer})
 
     def _process_outputs(self, outputs: list, **kwargs):
@@ -182,7 +175,9 @@ class RBLNLlavaForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixi
         return True
 
     @classmethod
-    def _reconstruct_model_if_needed(cls, model: "PreTrainedModel"):
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+
         with no_init_weights():
             model_cls_name = model.model.language_model.__class__.__name__
             causal_model_cls_name = model_cls_name.replace("Model", "ForCausalLM")
@@ -199,7 +194,7 @@ class RBLNLlavaForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixi
     def __post_init__(self, **kwargs):
         self.vision_tower = LoopVisionTower(self.rbln_submodules[0])
         self.language_model = self.rbln_submodules[1]
-        self.multi_modal_projector = LoopProjector(self.model[0], rbln_config=self.rbln_config)
+        self.multi_modal_projector = LoopProjector(self.model[0])
         self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
         return super().__post_init__(**kwargs)
 
@@ -213,7 +208,7 @@ class RBLNLlavaForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixi
         return self.language_model.get_input_embeddings()
 
     @classmethod
-    def _wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
+    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
         return model.multi_modal_projector
 
     @classmethod
@@ -226,8 +221,10 @@ class RBLNLlavaForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixi
     ) -> RBLNModelConfig:
         # support for pixtral that needs padding
         if hasattr(rbln_config.vision_tower, "max_image_size"):
-            num_positions = (rbln_config.vision_tower.max_image_size[0] // model_config.vision_config.patch_size) * (
-                rbln_config.vision_tower.max_image_size[1] // model_config.vision_config.patch_size
+            num_positions = (
+                rbln_config.batch_size
+                * (rbln_config.vision_tower.max_image_size[0] // model_config.vision_config.patch_size)
+                * (rbln_config.vision_tower.max_image_size[1] // model_config.vision_config.patch_size)
             )
             selected_image_feature_dim = num_positions
 
@@ -356,32 +353,23 @@ class RBLNLlavaForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixi
 
         if hasattr(self.rbln_config.vision_tower, "max_image_size"):
             num_real_patches = selected_image_feature.shape[1]
-            max_patches = (self.rbln_config.vision_tower.max_image_size[0] // self.config.vision_config.patch_size) * (
-                self.rbln_config.vision_tower.max_image_size[1] // self.config.vision_config.patch_size
+            max_patches = (
+                (self.rbln_config.vision_tower.max_image_size[0] // self.config.vision_config.patch_size)
+                * (self.rbln_config.vision_tower.max_image_size[1] // self.config.vision_config.patch_size)
+                * pixel_values.shape[0]
             )
+            num_padding_patches = max_patches - num_real_patches
 
-            chunks = []
-            for i in range(0, num_real_patches, max_patches):
-                chunk = selected_image_feature[:, i : i + max_patches, :]
-                chunk_size = chunk.shape[1]
-
-                if chunk_size < max_patches:
-                    padding_tensor = torch.zeros(
-                        (selected_image_feature.shape[0], max_patches - chunk_size, selected_image_feature.shape[2]),
-                        dtype=selected_image_feature.dtype,
-                    )
-                    chunk = torch.cat([chunk, padding_tensor], dim=1)
-                chunks.append(chunk)
-
-            split_features = torch.cat(chunks, dim=0)
-            num_chunks = len(chunks)
-            projector_out_size = [1, max_patches * num_chunks, self.config.text_config.hidden_size]
+            projector_out_size = [1, max_patches, self.config.text_config.hidden_size]
             projector_out_buffer = [torch.empty(size=projector_out_size, dtype=torch.float32, device="cpu")]
-            projected_features = self.multi_modal_projector(split_features, out=projector_out_buffer)
-            projected_features = projected_features.view(
-                selected_image_feature.shape[0], num_chunks * max_patches, self.config.text_config.hidden_size
+
+            padding_tensor = torch.zeros(
+                (selected_image_feature.shape[0], num_padding_patches, selected_image_feature.shape[2]),
+                dtype=selected_image_feature.dtype,
            )
-            image_features = projected_features[:, :num_real_patches, :]
+            padded_feature = torch.cat([selected_image_feature, padding_tensor], dim=1)
+            padded_projected_feature = self.multi_modal_projector(padded_feature, out=projector_out_buffer)
+            image_features = padded_projected_feature[:, :num_real_patches, :]
         else:
             projector_out_size = [
                 pixel_values.shape[0] * pixel_values.shape[1],
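Note: the num_positions and max_patches formulas above now fold in the batch dimension. A small numeric illustration with assumed values only (max_image_size=(1024, 1024), patch_size=16, batch size 2 are not taken from the package):

height, width, patch_size, batch_size = 1024, 1024, 16, 2  # assumed example values

# 0.9.3 formula (removed lines): patch positions for a single image
old_num_positions = (height // patch_size) * (width // patch_size)               # 4096

# 0.9.3rc0 formula (added lines): batch dimension folded in
new_num_positions = batch_size * (height // patch_size) * (width // patch_size)  # 8192

# matches the runtime padding path, where max_patches also scales with
# pixel_values.shape[0]
max_patches = (height // patch_size) * (width // patch_size) * batch_size        # 8192

print(old_num_positions, new_num_positions, max_patches)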
optimum/rbln/transformers/models/llava_next/modeling_llava_next.py
@@ -139,7 +139,9 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGeneration
         return True
 
     @classmethod
-    def _reconstruct_model_if_needed(cls, model: "PreTrainedModel"):
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+
         with no_init_weights():
             model_cls_name = model.model.language_model.__class__.__name__
             causal_model_cls_name = model_cls_name.replace("Model", "ForCausalLM")
@@ -190,7 +192,7 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGeneration
         return self.language_model.get_input_embeddings()
 
     @classmethod
-    def _wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
+    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
         return model.multi_modal_projector
 
     @classmethod
optimum/rbln/transformers/models/opt/modeling_opt.py
@@ -69,7 +69,7 @@ class RBLNOPTForCausalLM(RBLNDecoderOnlyModelForCausalLM):
         return layer
 
     @classmethod
-    def _wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
+    def wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
         for i in range(len(model.model.decoder.layers)):
             model.model.decoder.layers[i] = cls.modify_opt_decoder_layer(model.model.decoder.layers[i])
 
@@ -95,7 +95,7 @@ class RBLNOPTModel(RBLNDecoderOnlyModel):
         return layer
 
     @classmethod
-    def _wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
+    def wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
         for i in range(len(model.decoder.layers)):
             model.decoder.layers[i] = cls.modify_opt_decoder_layer(model.decoder.layers[i])
 
optimum/rbln/transformers/models/pegasus/modeling_pegasus.py
@@ -54,7 +54,7 @@ class RBLNPegasusForConditionalGeneration(RBLNModelForSeq2SeqLM):
     support_causal_attn = True
 
     @classmethod
-    def _wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: RBLNPegasusForConditionalGenerationConfig):
+    def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: RBLNPegasusForConditionalGenerationConfig):
         return PegasusWrapper(
             model, enc_max_seq_len=rbln_config.enc_max_seq_len, use_attention_mask=rbln_config.use_attention_mask
         )
optimum/rbln/transformers/models/pixtral/modeling_pixtral.py
@@ -229,7 +229,7 @@ class RBLNPixtralVisionModel(RBLNModel):
         torch.save(save_dict, save_dir_path / subfolder / "torch_artifacts.pth")
 
     @classmethod
-    def _wrap_model_if_needed(
+    def wrap_model_if_needed(
         cls, model: torch.nn.Module, rbln_config: RBLNPixtralVisionModelConfig
     ) -> torch.nn.Module:
         wrapper_cfg = {
@@ -293,18 +293,6 @@ class RBLNPixtralVisionModel(RBLNModel):
         return_dict: bool = True,
         **kwargs,
     ) -> Union[Tuple, BaseModelOutput]:
-        """
-        Forward pass for the RBLN-optimized Pixtral vision model.
-
-        Args:
-            pixel_values (torch.Tensor of shape (batch_size, num_channels, image_size, image_size)) — The tensors corresponding to the input images. Pixel values can be obtained using PixtralImageProcessor. See PixtralImageProcessor.call() for details (PixtralProcessor uses PixtralImageProcessor for processing images).
-            image_sizes (torch.Tensor of shape (batch_size, 2), optional) — The sizes of the images in the batch, being (height, width) for each image.
-            output_hidden_states (bool, optional) — Whether or not to return the hidden states of all layers. See hidden_states under returned tensors for more detail.
-            return_dict (bool, optional) — Whether or not to return a ModelOutput instead of a plain tuple.
-
-        Returns:
-            BaseModelOutput or tuple(torch.FloatTensor)
-        """
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.rbln_config.output_hidden_states
         )
optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -88,7 +88,7 @@ class RBLNQwen2_5_VisionTransformerPretrainedModel(RBLNModel):
         torch.save(save_dict, save_dir_path / subfolder / "torch_artifacts.pth")
 
     @classmethod
-    def _wrap_model_if_needed(
+    def wrap_model_if_needed(
         cls, model: "PreTrainedModel", rbln_config: RBLNQwen2_5_VisionTransformerPretrainedModelConfig
     ):
         return Qwen2_5_VisionTransformerWrapper(model).eval()
@@ -393,7 +393,8 @@ class RBLNQwen2_5_VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
         return True
 
     @classmethod
-    def _reconstruct_model_if_needed(cls, model: "PreTrainedModel"):
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
         model.model.lm_head = model.lm_head
         model.lm_head = None
         del model.lm_head
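Note: the new get_pytorch_model body re-parents lm_head under model.model and removes the stale top-level reference. A self-contained toy reproduction of that attribute shuffle on plain nn.Module objects (the module names here are placeholders, not the Qwen2-VL classes):

import torch.nn as nn

outer = nn.Module()          # stands in for the loaded model
outer.model = nn.Module()    # inner backbone
outer.lm_head = nn.Linear(8, 16)

outer.model.lm_head = outer.lm_head  # re-parent the head under the backbone
outer.lm_head = None                 # clear the top-level submodule slot ...
del outer.lm_head                    # ... and remove it entirely, as in the added lines

print(hasattr(outer, "lm_head"), isinstance(outer.model.lm_head, nn.Linear))  # False True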
optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py
@@ -89,7 +89,7 @@ class RBLNQwen2VisionTransformerPretrainedModel(RBLNModel):
         torch.save(save_dict, save_dir_path / subfolder / "torch_artifacts.pth")
 
     @classmethod
-    def _wrap_model_if_needed(
+    def wrap_model_if_needed(
         cls, model: "PreTrainedModel", rbln_config: RBLNQwen2VisionTransformerPretrainedModelConfig
     ):
         return Qwen2VisionTransformerWrapper(model).eval()
@@ -282,7 +282,8 @@ class RBLNQwen2VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
         return True
 
     @classmethod
-    def _reconstruct_model_if_needed(cls, model: "PreTrainedModel"):
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
         model.model.lm_head = model.lm_head
         model.lm_head = None
         del model.lm_head
optimum/rbln/transformers/models/resnet/configuration_resnet.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 
 
-from typing import Optional
-
 from ...configuration_generic import RBLNModelForImageClassificationConfig
 
 
@@ -25,18 +23,3 @@ class RBLNResNetForImageClassificationConfig(RBLNModelForImageClassificationConf
     This configuration class stores the configuration parameters specific to
     RBLN-optimized ResNet models for image classification tasks.
     """
-
-    def __init__(self, output_hidden_states: Optional[bool] = None, **kwargs):
-        """
-        Args:
-            image_size (Optional[Union[int, Tuple[int, int]]]): The size of input images.
-                Can be an integer for square images or a tuple (height, width).
-            batch_size (Optional[int]): The batch size for inference. Defaults to 1.
-            output_hidden_states (bool, optional) — Whether or not to return the hidden states of all layers.
-            kwargs: Additional arguments passed to the parent RBLNModelConfig.
-
-        Raises:
-            ValueError: If batch_size is not a positive integer.
-        """
-        super().__init__(**kwargs)
-        self.output_hidden_states = output_hidden_states
optimum/rbln/transformers/models/resnet/modeling_resnet.py
@@ -13,17 +13,7 @@
 # limitations under the License.
 
 
-from typing import TYPE_CHECKING, Optional, Tuple, Union
-
-import torch
-from transformers.modeling_outputs import ImageClassifierOutputWithNoAttention
-
 from ...modeling_generic import RBLNModelForImageClassification
-from .configuration_resnet import RBLNResNetForImageClassificationConfig
-
-
-if TYPE_CHECKING:
-    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PretrainedConfig, PreTrainedModel
 
 
 class RBLNResNetForImageClassification(RBLNModelForImageClassification):
@@ -34,66 +24,3 @@ class RBLNResNetForImageClassification(RBLNModelForImageClassification):
     on RBLN devices, supporting image classification with convolutional neural networks
     designed for computer vision tasks.
     """
-
-    @classmethod
-    def _update_rbln_config(
-        cls,
-        preprocessors: Optional[Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"]] = None,
-        model: Optional["PreTrainedModel"] = None,
-        model_config: Optional["PretrainedConfig"] = None,
-        rbln_config: Optional["RBLNResNetForImageClassificationConfig"] = None,
-    ) -> "RBLNResNetForImageClassificationConfig":
-        if rbln_config.output_hidden_states is None:
-            rbln_config.output_hidden_states = getattr(model_config, "output_hidden_states", False)
-
-        rbln_config = super()._update_rbln_config(
-            preprocessors=preprocessors,
-            model=model,
-            model_config=model_config,
-            rbln_config=rbln_config,
-        )
-
-        return rbln_config
-
-    @classmethod
-    def _wrap_model_if_needed(
-        cls, model: torch.nn.Module, rbln_config: "RBLNResNetForImageClassificationConfig"
-    ) -> torch.nn.Module:
-        class _ResNetForImageClassification(torch.nn.Module):
-            def __init__(self, model: torch.nn.Module, output_hidden_states: bool):
-                super().__init__()
-                self.model = model
-                self.output_hidden_states = output_hidden_states
-
-            def forward(self, *args, **kwargs):
-                output = self.model(*args, output_hidden_states=self.output_hidden_states, **kwargs)
-                return output
-
-        return _ResNetForImageClassification(model, rbln_config.output_hidden_states)
-
-    def forward(
-        self, pixel_values: torch.Tensor, output_hidden_states: bool = None, return_dict: bool = None, **kwargs
-    ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
-        """
-        Foward pass for the RBLN-optimized ResNet model for image classification.
-
-        Args:
-            pixel_values (torch.FloatTensor of shape (batch_size, channels, height, width)): The tensors corresponding to the input images.
-            output_hidden_states (bool, *optional*, defaults to False): Whether or not to return the hidden states of all layers.
-                See hidden_states under returned tensors for more details.
-            return_dict (bool, *optional*, defaults to True): Whether to return a dictionary of outputs.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a ImageClassifierOutputWithNoAttention object.
-        """
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.rbln_config.output_hidden_states
-        )
-
-        if output_hidden_states != self.rbln_config.output_hidden_states:
-            raise ValueError(
-                f"Variable output_hidden_states {output_hidden_states} is not equal to rbln_config.output_hidden_states {self.rbln_config.output_hidden_states} "
-                f"Please compile again with the correct argument."
-            )
-
-        return super().forward(pixel_values=pixel_values, return_dict=return_dict, **kwargs)
optimum/rbln/transformers/models/roberta/modeling_roberta.py
@@ -12,11 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Tuple, Union
-
-import torch
-from transformers.modeling_outputs import MaskedLMOutput, SequenceClassifierOutput
-
 from ...modeling_generic import RBLNModelForMaskedLM, RBLNModelForSequenceClassification
 
 
@@ -31,19 +26,6 @@ class RBLNRobertaForMaskedLM(RBLNModelForMaskedLM):
 
     rbln_model_input_names = ["input_ids", "attention_mask"]
 
-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Union[Tuple, MaskedLMOutput]:
-        """
-        Forward pass for the RBLN-optimized RoBERTa model for masked language modeling tasks.
-
-        Args:
-            input_ids (torch.LongTensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
-            attention_mask (torch.FloatTensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a MaskedLMOutput object.
-        """
-        return super().forward(input_ids, attention_mask, **kwargs)
-
 
 class RBLNRobertaForSequenceClassification(RBLNModelForSequenceClassification):
     """
@@ -55,18 +37,3 @@ class RBLNRobertaForSequenceClassification(RBLNModelForSequenceClassification):
     """
 
     rbln_model_input_names = ["input_ids", "attention_mask"]
-
-    def forward(
-        self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs
-    ) -> Union[Tuple, SequenceClassifierOutput]:
-        """
-        Forward pass for the RBLN-optimized RoBERTa model for sequence classification tasks.
-
-        Args:
-            input_ids (torch.LongTensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
-            attention_mask (torch.FloatTensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a SequenceClassifierOutput object.
-        """
-        return super().forward(input_ids, attention_mask, **kwargs)
optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py
@@ -15,7 +15,6 @@
 from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
-from ....utils.deprecation import deprecate_kwarg
 from ....utils.logging import get_logger
 
 
@@ -25,13 +24,13 @@ logger = get_logger()
 class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
     support_paged_attention = None
 
-    @deprecate_kwarg(old_name="pad_token_id", version="0.10.0")
     def __init__(
         self,
         batch_size: Optional[int] = None,
         enc_max_seq_len: Optional[int] = None,
         dec_max_seq_len: Optional[int] = None,
         use_attention_mask: Optional[bool] = None,
+        pad_token_id: Optional[int] = None,
         kvcache_num_blocks: Optional[int] = None,
         kvcache_block_size: Optional[int] = None,
         **kwargs: Any,
@@ -42,6 +41,7 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
             enc_max_seq_len (Optional[int]): Maximum sequence length for the encoder.
             dec_max_seq_len (Optional[int]): Maximum sequence length for the decoder.
             use_attention_mask (Optional[bool]): Whether to use attention masks during inference.
+            pad_token_id (Optional[int]): The ID of the padding token in the vocabulary.
             kvcache_num_blocks (Optional[int]): The total number of blocks to allocate for the
                 PagedAttention KV cache for the SelfAttention. Defaults to batch_size.
             kvcache_block_size (Optional[int]): Sets the size (in number of tokens) of each block
@@ -61,6 +61,8 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
 
         self.use_attention_mask = use_attention_mask
 
+        self.pad_token_id = pad_token_id
+
         if self.support_paged_attention:
             self.kvcache_num_blocks = kvcache_num_blocks
             self.kvcache_block_size = kvcache_block_size
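Note: with the deprecate_kwarg shim removed on the 0.9.3rc0 side, pad_token_id becomes an ordinary constructor argument that is stored on the config. A hedged usage sketch based only on the signature shown above; the import path is inferred from the file listing, and the parent RBLNModelConfig may apply additional validation.

from optimum.rbln.transformers.models.seq2seq.configuration_seq2seq import (
    RBLNModelForSeq2SeqLMConfig,
)

cfg = RBLNModelForSeq2SeqLMConfig(
    batch_size=1,
    enc_max_seq_len=512,
    dec_max_seq_len=256,
    use_attention_mask=True,
    pad_token_id=0,
)
assert cfg.pad_token_id == 0  # stored directly, no deprecation shim involved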