optimum-rbln 0.9.3__py3-none-any.whl → 0.9.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. optimum/rbln/__init__.py +0 -12
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +2 -4
  4. optimum/rbln/diffusers/__init__.py +0 -12
  5. optimum/rbln/diffusers/configurations/__init__.py +0 -3
  6. optimum/rbln/diffusers/configurations/models/__init__.py +0 -2
  7. optimum/rbln/diffusers/configurations/pipelines/__init__.py +0 -3
  8. optimum/rbln/diffusers/models/__init__.py +3 -17
  9. optimum/rbln/diffusers/models/autoencoders/__init__.py +0 -1
  10. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
  11. optimum/rbln/diffusers/models/autoencoders/vae.py +8 -27
  12. optimum/rbln/diffusers/models/controlnet.py +1 -16
  13. optimum/rbln/diffusers/models/transformers/prior_transformer.py +2 -16
  14. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +1 -16
  15. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +1 -14
  16. optimum/rbln/diffusers/models/unets/__init__.py +0 -1
  17. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +1 -17
  18. optimum/rbln/diffusers/pipelines/__init__.py +0 -4
  19. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +0 -20
  20. optimum/rbln/modeling.py +45 -20
  21. optimum/rbln/modeling_base.py +1 -0
  22. optimum/rbln/transformers/configuration_generic.py +27 -0
  23. optimum/rbln/transformers/modeling_attention_utils.py +109 -242
  24. optimum/rbln/transformers/modeling_generic.py +61 -2
  25. optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +2 -28
  26. optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +5 -68
  27. optimum/rbln/transformers/models/bart/modeling_bart.py +2 -23
  28. optimum/rbln/transformers/models/bert/modeling_bert.py +1 -86
  29. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +15 -42
  30. optimum/rbln/transformers/models/clip/modeling_clip.py +2 -40
  31. optimum/rbln/transformers/models/colpali/modeling_colpali.py +44 -5
  32. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +1 -6
  33. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +2 -6
  34. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +9 -17
  35. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +12 -36
  36. optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +0 -17
  37. optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +0 -24
  38. optimum/rbln/transformers/models/dpt/modeling_dpt.py +0 -17
  39. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +5 -3
  40. optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +8 -24
  41. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +5 -3
  42. optimum/rbln/transformers/models/llava/modeling_llava.py +24 -36
  43. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +4 -2
  44. optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
  45. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
  46. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +1 -13
  47. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +3 -2
  48. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +3 -2
  49. optimum/rbln/transformers/models/resnet/configuration_resnet.py +0 -17
  50. optimum/rbln/transformers/models/resnet/modeling_resnet.py +0 -73
  51. optimum/rbln/transformers/models/roberta/modeling_roberta.py +0 -33
  52. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +4 -2
  53. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +10 -34
  54. optimum/rbln/transformers/models/siglip/modeling_siglip.py +1 -17
  55. optimum/rbln/transformers/models/swin/modeling_swin.py +1 -14
  56. optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
  57. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +2 -16
  58. optimum/rbln/transformers/models/vit/modeling_vit.py +0 -19
  59. optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +3 -15
  60. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +8 -60
  61. optimum/rbln/transformers/models/whisper/generation_whisper.py +14 -48
  62. optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
  63. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +0 -43
  64. optimum/rbln/transformers/utils/rbln_quantization.py +0 -9
  65. optimum/rbln/utils/depreacate_utils.py +16 -0
  66. optimum/rbln/utils/hub.py +3 -14
  67. optimum/rbln/utils/runtime_utils.py +0 -32
  68. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/METADATA +2 -2
  69. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/RECORD +72 -79
  70. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/WHEEL +1 -1
  71. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +0 -67
  72. optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +0 -59
  73. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +0 -114
  74. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +0 -275
  75. optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +0 -201
  76. optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +0 -15
  77. optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +0 -46
  78. optimum/rbln/utils/deprecation.py +0 -213
  79. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/entry_points.txt +0 -0
  80. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/bart/modeling_bart.py
@@ -13,11 +13,9 @@
  # limitations under the License.

  import inspect
- from typing import Any, Callable, Optional, Tuple, Union
+ from typing import Any, Callable

- import torch
  from transformers import BartForConditionalGeneration, PreTrainedModel
- from transformers.modeling_outputs import Seq2SeqModelOutput

  from ....utils.logging import get_logger
  from ...modeling_generic import RBLNTransformerEncoderForFeatureExtraction
@@ -37,25 +35,6 @@ class RBLNBartModel(RBLNTransformerEncoderForFeatureExtraction):
  on RBLN devices, optimized for feature extraction use cases.
  """

- def forward(
- self,
- input_ids: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- **kwargs,
- ) -> Union[Tuple, Seq2SeqModelOutput]:
- """
- Forward pass for the RBLN-optimized BART model for feature extraction tasks.
-
- Args:
- input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
- attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a Seq2SeqModelOutput object.
- """
-
- return super().forward(input_ids, attention_mask, **kwargs)
-

  class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
  """
@@ -69,7 +48,7 @@ class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
  support_causal_attn = True

  @classmethod
- def _wrap_model_if_needed(self, model: PreTrainedModel, rbln_config: RBLNBartForConditionalGenerationConfig):
+ def wrap_model_if_needed(self, model: PreTrainedModel, rbln_config: RBLNBartForConditionalGenerationConfig):
  return BartWrapper(
  model, enc_max_seq_len=rbln_config.enc_max_seq_len, use_attention_mask=rbln_config.use_attention_mask
  )
optimum/rbln/transformers/models/bert/modeling_bert.py
@@ -12,14 +12,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- from typing import Optional, Tuple, Union
-
  import torch
- from transformers.modeling_outputs import (
- BaseModelOutputWithPoolingAndCrossAttentions,
- MaskedLMOutput,
- QuestionAnsweringModelOutput,
- )

  from ...modeling_generic import (
  RBLNModelForMaskedLM,
@@ -42,45 +35,9 @@ class RBLNBertModel(RBLNTransformerEncoderForFeatureExtraction):
  rbln_model_input_names = ["input_ids", "attention_mask"]

  @classmethod
- def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNBertModelConfig) -> torch.nn.Module:
+ def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNBertModelConfig) -> torch.nn.Module:
  return BertModelWrapper(model, rbln_config)

- def forward(
- self,
- input_ids: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- token_type_ids: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.Tensor] = None,
- **kwargs,
- ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, Tuple]:
- """
- Forward pass for the RBLN-optimized BERT model for feature extraction tasks.
-
- Args:
- input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
- attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
- token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
- position_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of positions of each input sequence tokens in the position embeddings.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPoolingAndCrossAttentions object.
- """
-
- input_map = {
- "input_ids": input_ids,
- "attention_mask": attention_mask,
- "token_type_ids": token_type_ids,
- "position_ids": position_ids,
- }
-
- model_input_names = getattr(self.rbln_config, "model_input_names", None)
- if model_input_names is None:
- model_input_names = self.rbln_model_input_names
-
- ordered_inputs = [input_map[name] for name in model_input_names if name in input_map]
-
- return super().forward(*ordered_inputs, **kwargs)
-

  class RBLNBertForMaskedLM(RBLNModelForMaskedLM):
  """
@@ -93,27 +50,6 @@ class RBLNBertForMaskedLM(RBLNModelForMaskedLM):

  rbln_model_input_names = ["input_ids", "attention_mask", "token_type_ids"]

- def forward(
- self,
- input_ids: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- token_type_ids: Optional[torch.Tensor] = None,
- **kwargs,
- ) -> Union[MaskedLMOutput, Tuple]:
- """
- Forward pass for the RBLN-optimized BERT model for masked language modeling tasks.
-
- Args:
- input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
- attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
- token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a MaskedLMOutput object.
- """
-
- return super().forward(input_ids, attention_mask, token_type_ids, **kwargs)
-

  class RBLNBertForQuestionAnswering(RBLNModelForQuestionAnswering):
  """
@@ -125,24 +61,3 @@ class RBLNBertForQuestionAnswering(RBLNModelForQuestionAnswering):
  """

  rbln_model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
-
- def forward(
- self,
- input_ids: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- token_type_ids: Optional[torch.Tensor] = None,
- **kwargs,
- ) -> Union[QuestionAnsweringModelOutput, Tuple]:
- """
- Forward pass for the RBLN-optimized BERT model for question answering tasks.
-
- Args:
- input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
- attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
- token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a QuestionAnsweringModelOutput object.
- """
-
- return super().forward(input_ids, attention_mask, token_type_ids, **kwargs)
optimum/rbln/transformers/models/blip_2/modeling_blip_2.py
@@ -14,7 +14,7 @@

  import inspect
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, Union
+ from typing import TYPE_CHECKING, Any, Callable, Optional, Tuple, Union

  import torch
  from transformers import (
@@ -71,7 +71,7 @@ class RBLNBlip2VisionModel(RBLNModel):
  return self.embeddings

  @classmethod
- def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
+ def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
  class Blip2VisionModelWrapper(torch.nn.Module):
  def __init__(self, model: "Blip2VisionModel") -> None:
  super().__init__()
@@ -111,20 +111,11 @@ class RBLNBlip2VisionModel(RBLNModel):
  def forward(
  self,
  pixel_values: torch.FloatTensor,
- interpolate_pos_encoding: bool = False,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
  ) -> Union[Tuple, BaseModelOutputWithPooling]:
- """
- Forward pass for the RBLN-optimized Blip2VisionModel model.
-
- Args:
- pixel_values (torch.FloatTensor of shape (batch_size, num_channels, height, width)): The tensors corresponding to the input images.
- interpolate_pos_encoding (bool, optional): Whether to interpolate the positional encoding of the image embeddings. Defaults to False.
- return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
-
- Returns:
- BaseModelOutputWithPooling or tuple(torch.FloatTensor): The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPooling object.
- """
  batch_size = pixel_values.shape[0]
  outputs = []
  for i in range(batch_size):
@@ -160,7 +151,7 @@ class RBLNBlip2QFormerModel(RBLNModel):
  return self.embeddings.word_embeddings

  @classmethod
- def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
+ def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
  class Blip2QFormerModelWrapper(torch.nn.Module):
  def __init__(self, model: "Blip2QFormerModel"):
  super().__init__()
@@ -240,22 +231,17 @@ class RBLNBlip2QFormerModel(RBLNModel):
  def forward(
  self,
  query_embeds: torch.FloatTensor,
+ query_length: Optional[int] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
  encoder_hidden_states: Optional[torch.FloatTensor] = None,
  encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
- """
- The forward pass for the RBLN-optimized Blip2QFormerModel model.
-
- Args:
- query_embeds (torch.FloatTensor): Hidden states to be used in the attention computation.
- encoder_hidden_states (torch.FloatTensor, optional): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder.
- encoder_attention_mask (torch.FloatTensor, optional): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder.
- return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
-
- Returns:
- BaseModelOutputWithPoolingAndCrossAttentions or tuple(torch.FloatTensor): The model outputs. If `return_dict=False` is passed, returns a tuple of tensors. Otherwise, returns a `BaseModelOutputWithPoolingAndCrossAttentions` object.
- """
  batch_size = query_embeds.shape[0]
  outputs = []
  for i in range(batch_size):
@@ -363,7 +349,7 @@ class RBLNBlip2ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixi
  return self.language_model.get_input_embeddings()

  @classmethod
- def _wrap_model_if_needed(cls, model, rbln_config):
+ def wrap_model_if_needed(cls, model, rbln_config):
  return model.language_projection

  @classmethod
@@ -458,20 +444,7 @@
  inputs_embeds: Optional[torch.FloatTensor] = None,
  interpolate_pos_encoding: bool = False,
  **generate_kwargs,
- ) -> List[torch.LongTensor]:
- """
- The generate function is utilized in its standard form as in the HuggingFace transformers library. User can use this function to generate text from the model.
- Check the [HuggingFace transformers documentation](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/blip-2#transformers.Blip2ForConditionalGeneration.generate) for more details.
-
- Args:
- pixel_values (torch.FloatTensor): Input images to be processed.
- input_ids (torch.LongTensor, optional): The sequence used as a prompt for the generation.
- attention_mask (torch.LongTensor, optional): Mask to avoid performing attention on padding token indices
- inputs_embeds (torch.FloatTensor, optional): Embedded representation of the inputs. Should be float, not int tokens.
- interpolate_pos_encoding (bool, optional, defaults to False) — Whether to interpolate the positional encoding of the image embeddings.
- Returns:
- A list of strings of length batch_size * num_captions.
- """
+ ) -> torch.LongTensor:
  batch_size = pixel_values.shape[0]
  image_embeds = self.vision_model(
  pixel_values,
optimum/rbln/transformers/models/clip/modeling_clip.py
@@ -54,7 +54,7 @@ class RBLNCLIPTextModel(RBLNModel):
  _tp_support = False

  @classmethod
- def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPTextModelConfig) -> torch.nn.Module:
+ def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPTextModelConfig) -> torch.nn.Module:
  return _TextEncoder(model).eval()

  @classmethod
@@ -92,9 +92,6 @@ class RBLNCLIPTextModel(RBLNModel):
  Args:
  input_ids (torch.LongTensor): The input ids to the model.
  return_dict (Optional[bool]): Whether to return a dictionary of outputs.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a CLIPTextModelOutput object.
  """

  # To ignore using attention_mask, we override forward method.
@@ -160,7 +157,7 @@ class RBLNCLIPVisionModel(RBLNModel):
  _tp_support = False

  @classmethod
- def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPVisionModelConfig) -> torch.nn.Module:
+ def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPVisionModelConfig) -> torch.nn.Module:
  wrapper_cfg = {
  "interpolate_pos_encoding": rbln_config.interpolate_pos_encoding,
  "output_hidden_states": rbln_config.output_hidden_states,
@@ -233,9 +230,6 @@ class RBLNCLIPVisionModel(RBLNModel):
  output_attentions (Optional[bool]): Whether to return attentions.
  output_hidden_states (Optional[bool]): Whether to return hidden states.
  interpolate_pos_encoding (bool): Whether to interpolate position encoding.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPooling object.
  """

  if len(kwargs) > 0 and any(value is not None for value in kwargs.values()):
@@ -313,38 +307,6 @@ class RBLNCLIPVisionModelWithProjection(RBLNCLIPVisionModel):
  multimodal embedding alignment tasks.
  """

- def forward(
- self,
- pixel_values: torch.FloatTensor,
- return_dict: bool = True,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- interpolate_pos_encoding: bool = False,
- **kwargs,
- ) -> Union[Tuple, CLIPVisionModelOutput]:
- """
- Forward pass for the RBLN-optimized CLIP vision encoder model with projection.
-
- Args:
- pixel_values (torch.Tensor): The pixel values to the model.
- return_dict (bool): Whether to return a dictionary of outputs.
- output_attentions (Optional[bool]): Whether to return attentions.
- output_hidden_states (Optional[bool]): Whether to return hidden states.
- interpolate_pos_encoding (bool): Whether to interpolate position encoding.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a CLIPVisionModelOutput object.
- """
-
- return super().forward(
- pixel_values=pixel_values,
- return_dict=return_dict,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- interpolate_pos_encoding=interpolate_pos_encoding,
- **kwargs,
- )
-
  def _prepare_output(self, output, return_dict):
  # Prepare model output based on return_dict flag.
  # This method can be overridden by subclasses to provide task-specific output handling.
optimum/rbln/transformers/models/colpali/modeling_colpali.py
@@ -14,7 +14,8 @@

  import bisect
  from pathlib import Path
- from typing import TYPE_CHECKING, Optional, Tuple, Union
+ from tempfile import TemporaryDirectory
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union

  import torch
  from transformers import PretrainedConfig, PreTrainedModel
@@ -181,7 +182,7 @@ class RBLNColPaliForRetrieval(RBLNModel):
  return multi_modal_projector

  @classmethod
- def _wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
+ def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
  return RBLNColPaliForRetrievalWrapper(
  causal_lm=model.vlm,
  embedding_proj_layer=model.embedding_proj_layer,
@@ -235,11 +236,49 @@
  return rbln_config

  @classmethod
- def _reconstruct_model_if_needed(cls, model: "PreTrainedModel"):
- if hasattr(model, "vlm"):
+ def from_model(
+ cls,
+ model: "PreTrainedModel",
+ config: Optional[PretrainedConfig] = None,
+ rbln_config: Optional[Union[RBLNModelConfig, Dict]] = None,
+ model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+ subfolder: str = "",
+ **kwargs: Any,
+ ) -> "RBLNModel":
+ """
+ Converts and compiles a pre-trained HuggingFace library model into a RBLN model.
+ This method performs the actual model conversion and compilation process.
+
+ Args:
+ model (PreTrainedModel): The PyTorch model to be compiled.
+ The object must be an instance of the HuggingFace transformers PreTrainedModel class.
+ config (Optional[PretrainedConfig]): The configuration object associated with the model.
+ rbln_config (Optional[Union[RBLNModelConfig, Dict]]): Configuration for RBLN model compilation and runtime.
+ This can be provided as a dictionary or an instance of the model's configuration class (e.g., `RBLNLlamaForCausalLMConfig` for Llama models).
+ For detailed configuration options, see the specific model's configuration class documentation.
+ kwargs: Additional keyword arguments. Arguments with the prefix `rbln_` are passed to rbln_config, while the remaining arguments are passed to the HuggingFace library.
+
+ The method performs the following steps:
+
+ 1. Compiles the PyTorch model into an optimized RBLN graph
+ 2. Configures the model for the specified NPU device
+ 3. Creates the necessary runtime objects if requested
+ 4. Saves the compiled model and configurations
+
+ Returns:
+ (RBLNModel): A RBLN model instance ready for inference on RBLN NPU devices.
+ """
+ if not hasattr(model, "vision_tower"):
  model.vision_tower = model.vlm.vision_tower
  del model.vlm.model.vision_tower
- return model
+ model = super().from_model(model, config, rbln_config, model_save_dir, subfolder, **kwargs)
+ return model
+
+ @classmethod
+ def get_pytorch_model(cls, *args, **kwargs):
+ model = super().get_pytorch_model(*args, **kwargs)
+ model.vision_tower = model.vlm.vision_tower
+ del model.vlm.model.vision_tower
  return model

  def get_image_features(self, pixel_values: torch.Tensor):
optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py
@@ -274,18 +274,13 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):

  @property
  def use_lora(self):
+ """Check if LoRA is enabled for this configuration."""
  return self.lora_config is not None

  @property
  def can_generate(self) -> bool:
  return "decode" in self.phases

- @property
- def nbits_per_param(self) -> int:
- if self.quantization:
- return self.quantization.nbits_per_param
- return 16
-

  class RBLNDecoderOnlyModelForCausalLMConfig(RBLNDecoderOnlyModelConfig):
  """
optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py
@@ -46,12 +46,6 @@ class RBLNPageTableManager:
  """
  If the block is empty (empty_block), allocates a block from the free_block_pool.
  """
- if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
- raise IndexError(
- f"Invalid index(batch_idx={batch_idx}, block_idx={block_idx}): \n \
- BlockTable Shape(batch_axis, block_axis): {self.block_tables.shape}, BlockSize: {self.rbln_config.kvcache_block_size}"
- )
-
  if self.block_tables[batch_idx][block_idx] == self.EMPTY_BLOCK:
  if self.free_block_pool:
  block = self.free_block_pool.popleft()
@@ -102,6 +96,8 @@ class RBLNPageTableManager:
  s, e = cache_position[0][0].item(), cache_position[0][-1].item()
  for position in range(s, e + 1, self.rbln_config.kvcache_block_size):
  block_idx = position // self.rbln_config.kvcache_block_size
+ if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
+ raise IndexError(f"Invalid index: batch_idx={batch_idx}, block_idx={block_idx}")
  self.update_block(batch_idx, block_idx)

  return self.replace_empty_block(self.block_tables[batch_idx])
optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py
@@ -12,12 +12,10 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- from typing import TYPE_CHECKING, Any, Dict, Optional, Union
+ from typing import TYPE_CHECKING, Any, Dict, Optional

  import torch
- from transformers import GenerationConfig
  from transformers.generation.utils import GenerationMixin
- from transformers.modeling_outputs import ModelOutput


  if TYPE_CHECKING:
@@ -93,26 +91,20 @@ class RBLNDecoderOnlyGenerationMixin(GenerationMixin):
  self,
  input_ids: torch.LongTensor,
  attention_mask: Optional[torch.LongTensor] = None,
- generation_config: Optional[GenerationConfig] = None,
+ max_length: Optional[int] = None,
  **kwargs,
- ) -> Union[ModelOutput, torch.LongTensor]:
+ ):
  """
  The generate function is utilized in its standard form as in the HuggingFace transformers library. User can use this function to generate text from the model.
- Check the [HuggingFace transformers documentation](https://huggingface.co/docs/transformers/v4.57.1/en/main_classes/text_generation#transformers.GenerationMixin.generate) for more details.

  Args:
- input_ids (torch.LongTensor): The input ids to the model.
- attention_mask (torch.LongTensor, optional): The attention mask to the model.
- generation_config (GenerationConfig, optional): The generation configuration to be used as base parametrization for the generation call. **kwargs passed to generate matching the attributes of generation_config will override them.
- If generation_config is not provided, the default will be used, which had the following loading priority: 1) from the generation_config.json model file, if it exists; 2) from the model configuration.
- Please note that unspecified parameters will inherit [GenerationConfig](https://huggingface.co/docs/transformers/v4.57.1/en/main_classes/text_generation#transformers.GenerationConfig)’s default values.
- kwargs (dict[str, Any], optional): Additional arguments passed to the generate function. See the HuggingFace transformers documentation for more details.
-
- Returns:
- A ModelOutput (if return_dict_in_generate=True or when config.return_dict_in_generate=True) or a torch.LongTensor.
+ input_ids: The input ids to the model.
+ attention_mask: The attention mask to the model.
+ max_length: The maximum length of the generated text.
+ kwargs: Additional arguments passed to the generate function. See the HuggingFace transformers documentation for more details.
  """
- if generation_config is not None:
- kwargs["generation_config"] = generation_config
+ if max_length is not None:
+ kwargs["max_length"] = max_length
  if attention_mask is not None:
  kwargs["attention_mask"] = attention_mask

optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py
@@ -216,7 +216,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
  return self.rbln_config.kvcache_num_blocks

  @classmethod
- def _wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: "RBLNDecoderOnlyModelConfig"):
+ def wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: "RBLNDecoderOnlyModelConfig"):
  return cls._decoder_wrapper_cls(model, rbln_config, cls._use_rotary_emb).eval()

  @classmethod
@@ -272,7 +272,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
  @classmethod
  @torch.inference_mode()
  def get_compiled_model(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
- wrapped_model = cls._wrap_model_if_needed(model, rbln_config)
+ wrapped_model = cls.wrap_model_if_needed(model, rbln_config)
  prefill_compile_config = rbln_config.compile_cfgs[0]

  # Here we use meta tensor, for the memory efficiency.
@@ -466,8 +466,13 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):

  # Update kvcache_num_blocks based on the attention implementation.
  if rbln_config.attn_impl == "flash_attn":
- estimated_max_num_blocks = cls.get_maximum_num_blocks_by_model(
- model=model, model_config=model_config, rbln_config=rbln_config
+ estimated_max_num_blocks = cls.get_maximum_num_blocks(
+ config=model_config,
+ tensor_parallel_size=rbln_config.tensor_parallel_size or 1,
+ kvcache_block_size=rbln_config.kvcache_block_size,
+ nbits_per_param=16 if not rbln_config.quantization else 4, # TODO(jongho): FIX Ad-hoc
+ n_model_params=sum(p.numel() for p in model.parameters()),
+ num_runtimes=1 if not rbln_config.can_generate else 1 + len(rbln_config.decoder_batch_sizes),
  )

  if rbln_config.kvcache_num_blocks is None:
@@ -506,6 +511,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
  f" than the required number of blocks ({num_full_blocks})."
  "This can cause a failure during model compilation."
  )
+
  logger.info(f"[KVCache] Compiling with num_blocks: {rbln_config.kvcache_num_blocks}")

  return rbln_config
@@ -602,21 +608,11 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
  input_ids: Optional[torch.LongTensor] = None,
  inputs_embeds: Optional[torch.Tensor] = None,
  attention_mask: Optional[torch.LongTensor] = None,
+ position_embed: Optional[torch.Tensor] = None,
  **kwargs,
- ) -> BaseModelOutputWithPast:
- """
- Args:
- input_ids (torch.LongTensor, optional): The input IDs to the model.
- inputs_embeds (torch.Tensor, optional): The input embeddings to the model.
- attention_mask (torch.LongTensor, optional): The attention mask to the model.
- kwargs (dict[str, Any], optional): Additional keyword arguments.
-
- Returns:
- Dataclass containing the last hidden states of the model.
- """
+ ) -> Tuple[torch.FloatTensor]:
  inputs = inputs_embeds if inputs_embeds is not None else input_ids
  batch_size = inputs.shape[0]
- position_embed = kwargs.get("position_embed", None)

  if batch_size != self.rbln_config.batch_size:
  raise ValueError(
@@ -639,7 +635,6 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
  all_last_hidden_states.append(last_hidden_states)

  last_hidden_states = torch.concat(all_last_hidden_states, dim=0)
-
  return BaseModelOutputWithPast(last_hidden_state=last_hidden_states)


@@ -764,16 +759,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGener
  logits = []
  inputs = inputs_embeds if inputs_embeds is not None else input_ids
  batch_size = inputs.shape[0]
- input_len = inputs.shape[1]
- if batch_size > self.rbln_config.batch_size:
- raise ValueError(
- f"Input's batch({batch_size}) exceeds compiled batch_size({self.rbln_config.batch_size})"
- )
- if input_len > self.rbln_config.max_seq_len:
- raise ValueError(
- f"Input's length({input_len}) exceeds compiled max_seq_len({self.rbln_config.max_seq_len})."
- )
-
  for b_idx in range(batch_size):
  cache_position = torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0)
  output = self.prefill_decoder(
@@ -798,15 +783,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGener
  f"Available batch sizes are: {list(self.decoders.keys())}. "
  f"Please run your model with one of these batch sizes or add support for batch size {batch_size}."
  )
- if max(cache_position.reshape(-1)) >= self.rbln_config.max_seq_len:
- raise ValueError(
- f"Cache position exceeds the maximum sequence length.\n"
- f" - Current max cache position: {int(torch.max(cache_position).item())}\n"
- f" - Allowed max_seq_len: {self.rbln_config.max_seq_len}\n"
- f"Solution: Reduce the generation length by adjusting `max_new_tokens` "
- f"or `max_length` in the generation config."
- )
-
  logits = self.decoders[batch_size](
  input_ids=input_ids,
  inputs_embeds=inputs_embeds,
optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py
@@ -13,11 +13,6 @@
  # limitations under the License.


- from typing import Tuple, Union
-
- import torch
- from transformers.modeling_outputs import DepthEstimatorOutput
-
  from ...modeling_generic import RBLNModelForDepthEstimation


@@ -28,15 +23,3 @@ class RBLNDepthAnythingForDepthEstimation(RBLNModelForDepthEstimation):
  This class provides hardware-accelerated inference for Depth Anything V2
  models on RBLN devices, providing the most capable monocular depth estimation (MDE) model.
  """
-
- def forward(self, pixel_values: torch.Tensor, **kwargs) -> Union[Tuple, DepthEstimatorOutput]:
- """
- Forward pass for the RBLN-optimized DepthAnythingForDepthEstimation model.
-
- Args:
- pixel_values (torch.FloatTensor of shape (batch_size, num_channels, height, width)): The tensors corresponding to the input images.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a DepthEstimatorOutput object.
- """
- return super().forward(pixel_values, **kwargs)