optimum-rbln 0.9.3__py3-none-any.whl → 0.9.3rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +0 -12
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +2 -4
- optimum/rbln/diffusers/__init__.py +0 -12
- optimum/rbln/diffusers/configurations/__init__.py +0 -3
- optimum/rbln/diffusers/configurations/models/__init__.py +0 -2
- optimum/rbln/diffusers/configurations/pipelines/__init__.py +0 -3
- optimum/rbln/diffusers/models/__init__.py +3 -17
- optimum/rbln/diffusers/models/autoencoders/__init__.py +0 -1
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
- optimum/rbln/diffusers/models/autoencoders/vae.py +8 -27
- optimum/rbln/diffusers/models/controlnet.py +1 -16
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +2 -16
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +1 -16
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +1 -14
- optimum/rbln/diffusers/models/unets/__init__.py +0 -1
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +1 -17
- optimum/rbln/diffusers/pipelines/__init__.py +0 -4
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +0 -20
- optimum/rbln/modeling.py +45 -20
- optimum/rbln/modeling_base.py +1 -0
- optimum/rbln/transformers/configuration_generic.py +27 -0
- optimum/rbln/transformers/modeling_attention_utils.py +109 -242
- optimum/rbln/transformers/modeling_generic.py +61 -2
- optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +2 -28
- optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +5 -68
- optimum/rbln/transformers/models/bart/modeling_bart.py +2 -23
- optimum/rbln/transformers/models/bert/modeling_bert.py +1 -86
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +15 -42
- optimum/rbln/transformers/models/clip/modeling_clip.py +2 -40
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +44 -5
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +1 -6
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +2 -6
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +9 -17
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +12 -36
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +0 -17
- optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +0 -24
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +0 -17
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +5 -3
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +8 -24
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +5 -3
- optimum/rbln/transformers/models/llava/modeling_llava.py +24 -36
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +4 -2
- optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +1 -13
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +3 -2
- optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +3 -2
- optimum/rbln/transformers/models/resnet/configuration_resnet.py +0 -17
- optimum/rbln/transformers/models/resnet/modeling_resnet.py +0 -73
- optimum/rbln/transformers/models/roberta/modeling_roberta.py +0 -33
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +4 -2
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +10 -34
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +1 -17
- optimum/rbln/transformers/models/swin/modeling_swin.py +1 -14
- optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +2 -16
- optimum/rbln/transformers/models/vit/modeling_vit.py +0 -19
- optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +3 -15
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +8 -60
- optimum/rbln/transformers/models/whisper/generation_whisper.py +14 -48
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +0 -43
- optimum/rbln/transformers/utils/rbln_quantization.py +0 -9
- optimum/rbln/utils/depreacate_utils.py +16 -0
- optimum/rbln/utils/hub.py +3 -14
- optimum/rbln/utils/runtime_utils.py +0 -32
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/METADATA +2 -2
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/RECORD +72 -79
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/WHEEL +1 -1
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +0 -67
- optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +0 -59
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +0 -114
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +0 -275
- optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +0 -201
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +0 -15
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +0 -46
- optimum/rbln/utils/deprecation.py +0 -213
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/entry_points.txt +0 -0
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/distilbert/modeling_distilbert.py

@@ -12,11 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional, Tuple, Union
-
-import torch
-from transformers.modeling_outputs import QuestionAnsweringModelOutput
-
 from ...modeling_generic import RBLNModelForQuestionAnswering


@@ -30,22 +25,3 @@ class RBLNDistilBertForQuestionAnswering(RBLNModelForQuestionAnswering):
     """

     rbln_model_input_names = ["input_ids", "attention_mask"]
-
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
-        """
-        Forward pass for the RBLN-optimized DistilBERT model for question answering tasks.
-
-        Args:
-            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
-            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a QuestionAnsweringModelOutput object.
-        """
-
-        return super().forward(input_ids, attention_mask, **kwargs)
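The `forward` removed here only added documentation and delegated to `super().forward`, so on the 0.9.3rc0 side calls fall through to the shared `RBLNModelForQuestionAnswering.forward` with an unchanged calling convention. A minimal usage sketch, assuming a placeholder checkpoint and the usual optimum-style `export=True` compile-on-load flow:

```python
# Minimal usage sketch (not part of the diff). The checkpoint id is a
# placeholder, and the 1x384 input shape is only illustrative: inputs must
# match the sequence length the model was compiled with.
import torch

from optimum.rbln import RBLNDistilBertForQuestionAnswering

model = RBLNDistilBertForQuestionAnswering.from_pretrained(
    "distilbert-base-uncased-distilled-squad",  # placeholder checkpoint
    export=True,  # compile for the RBLN NPU on first load
)
inputs = {
    "input_ids": torch.ones(1, 384, dtype=torch.int64),
    "attention_mask": torch.ones(1, 384, dtype=torch.int64),
}
outputs = model(**inputs)  # start/end logits; same call on both sides of the diff
```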
optimum/rbln/transformers/models/dpt/modeling_dpt.py

@@ -13,11 +13,6 @@
 # limitations under the License.


-from typing import Tuple, Union
-
-import torch
-from transformers.modeling_outputs import DepthEstimatorOutput
-
 from ...modeling_generic import RBLNModelForDepthEstimation


@@ -28,15 +23,3 @@ class RBLNDPTForDepthEstimation(RBLNModelForDepthEstimation):
     This class provides hardware-accelerated inference for DPT (Dense Prediction Transformer)
     models on RBLN devices, supporting monocular depth estimation from single images.
     """
-
-    def forward(self, pixel_values: torch.Tensor, **kwargs) -> Union[Tuple, DepthEstimatorOutput]:
-        """
-        Forward pass for the RBLN-optimized DPT model.
-
-        Args:
-            pixel_values (torch.FloatTensor of shape (batch_size, num_channels, image_size, image_size)): The tensors corresponding to the input images.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a DepthEstimatorOutput object.
-        """
-        return super().forward(pixel_values, **kwargs)
optimum/rbln/transformers/models/gemma3/modeling_gemma3.py

@@ -99,7 +99,9 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMix
         return True

     @classmethod
-    def
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+
         with no_init_weights():
             model_cls_name = model.model.language_model.__class__.__name__
             causal_model_cls_name = model_cls_name.replace("TextModel", "ForCausalLM")
@@ -133,7 +135,7 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMix
         return self.language_model.get_input_embeddings()

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
         return model.multi_modal_projector

     @classmethod
@@ -480,7 +482,7 @@ class RBLNGemma3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     @classmethod
     @torch.inference_mode()
     def get_compiled_model(cls, model: "PreTrainedModel", rbln_config: RBLNGemma3ForCausalLMConfig):
-        wrapped_model = cls.
+        wrapped_model = cls.wrap_model_if_needed(model, rbln_config)

         rbln_compile_configs = rbln_config.compile_cfgs
         prefill_compile_config = rbln_compile_configs[0]
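Several multimodal classes in this diff gain the same rc0-side hook: `get_pytorch_model` first loads the Hugging Face model via `super()`, then rewires submodules before compilation (the matching 0.9.3-side lines are truncated to `def` in this rendering, so the old signatures are not recoverable). A sketch of the pattern using the added lines above; the subclass name is hypothetical and the body is abbreviated:

```python
# Sketch of the rc0-side hook pattern; the subclass name is hypothetical and
# the body is abbreviated to the lines visible in the hunk above.
from transformers.modeling_utils import no_init_weights

from optimum.rbln import RBLNModel


class RBLNMyVLMForConditionalGeneration(RBLNModel):  # hypothetical subclass
    @classmethod
    def get_pytorch_model(cls, *args, **kwargs):
        # Load the plain PyTorch checkpoint first ...
        model = super().get_pytorch_model(*args, **kwargs)

        # ... then rewire it before compilation. Gemma3 re-tags the inner
        # text model with the matching *ForCausalLM class name; weights are
        # already loaded, so (re)initialization is skipped.
        with no_init_weights():
            model_cls_name = model.model.language_model.__class__.__name__
            causal_model_cls_name = model_cls_name.replace("TextModel", "ForCausalLM")
            # ... (continues as in the hunk above)

        return model
```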
optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py

@@ -13,7 +13,7 @@
 # limitations under the License.

 from pathlib import Path
-from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

 import torch
 from torch import Tensor, nn
@@ -206,7 +206,8 @@ class RBLNGroundingDinoForObjectDetection(RBLNModel):
         torch.save(save_dict, save_dir_path / subfolder / "torch_artifacts.pth")

     @classmethod
-    def
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
         model.encoder = model.model.encoder
         model.decoder = model.model.decoder
         model.text_backbone = model.model.text_backbone
@@ -216,7 +217,7 @@ class RBLNGroundingDinoForObjectDetection(RBLNModel):
         return model

     @classmethod
-    def
+    def wrap_model_if_needed(
         cls, model: torch.nn.Module, rbln_config: RBLNGroundingDinoForObjectDetectionConfig
     ) -> torch.nn.Module:
         return model.model.text_projection
@@ -529,26 +530,9 @@
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None,
         **kwargs,
-    )
-        """
-        Forward pass for the RBLN-optimized GroundingDinoForObjectDetection model.
-
-        Args:
-            pixel_values (torch.Tensor of shape (batch_size, num_channels, image_size, image_size)): The tensors corresponding to the input images.
-            input_ids (torch.LongTensor of shape (batch_size, text_sequence_length)): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it.
-            token_type_ids (torch.LongTensor of shape (batch_size, text_sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
-            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
-            pixel_mask (torch.Tensor of shape (batch_size, height, width), optional): Mask to avoid performing attention on padding pixel values.
-            encoder_outputs (Tuple consists of last_hidden_state of shape(batch_size, sequence_length, hidden_size), optional): A sequence of hidden-states at the output of the last layer of the encoder.
-            output_attentions (bool, optional): Whether or not to return the attentions tensors of all attention layers.
-            output_hidden_states (bool, optional): Whether or not to return the hidden states of all layers.
-            return_dict (bool, optional): Whether or not to return a ModelOutput instead of a plain tuple.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a GroundingDinoObjectDetectionOutput object.
-        """
-
+    ):
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

         # Pad image to rbln_config.image_height and rbln_config.image_width
@@ -679,7 +663,7 @@ class RBLNGroundingDinoEncoder(RBLNModel):
         self.encoder_runtime = RBLNPytorchRuntime(self.model[0])

     @classmethod
-    def
+    def wrap_model_if_needed(
         cls, model: torch.nn.Module, rbln_config: RBLNGroundingDinoForObjectDetectionConfig
     ) -> torch.nn.Module:
         model = _GroundingDinoEncoder(model, rbln_config).eval()
@@ -877,7 +861,7 @@ class RBLNGroundingDinoDecoder(RBLNModel):
         self.decoder_runtime = RBLNPytorchRuntime(self.model[0])

     @classmethod
-    def
+    def wrap_model_if_needed(
         cls, model: torch.nn.Module, rbln_config: RBLNGroundingDinoForObjectDetectionConfig
     ) -> torch.nn.Module:
         return _GroundingDinoDecoder(model, rbln_config).eval()
optimum/rbln/transformers/models/idefics3/modeling_idefics3.py

@@ -110,7 +110,7 @@ class RBLNIdefics3VisionTransformer(RBLNModel):
         return self.embeddings

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
         class Idefics3VisionTransformerWrapper(torch.nn.Module):
             def __init__(self, model: "Idefics3VisionTransformer"):
                 super().__init__()
@@ -240,7 +240,9 @@ class RBLNIdefics3ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationM
         return True

     @classmethod
-    def
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+
         with no_init_weights():
             model_cls_name = model.model.text_model.__class__.__name__
             causal_model_cls_name = model_cls_name.replace("Model", "ForCausalLM")
@@ -269,7 +271,7 @@ class RBLNIdefics3ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationM
         return self.text_model.get_input_embeddings()

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model, rbln_config):
         return model.model.connector

     @classmethod
optimum/rbln/transformers/models/llava/modeling_llava.py

@@ -88,22 +88,15 @@ class LoopVisionTower(LoopProcessor):


 class LoopProjector(LoopProcessor):
-    def __init__(self, multi_modal_projector: "RBLNModel"
+    def __init__(self, multi_modal_projector: "RBLNModel"):
         super().__init__(model=multi_modal_projector)
-        self.rbln_config = rbln_config

     def _get_batch_size(self, image_feature, **kwargs):
         return image_feature.shape[0]

     def _prepare_inputs_for_iteration(self, index, common_inputs, image_feature, **kwargs):
         image_feature_item = image_feature[index : index + 1]
-
-            out_buffer = [
-                tensor[:, index * image_feature.shape[1] : (index + 1) * image_feature.shape[1], :]
-                for tensor in kwargs["out"]
-            ]
-        else:
-            out_buffer = [tensor[index : index + 1] for tensor in kwargs["out"]]
+        out_buffer = [tensor[index : index + 1] for tensor in kwargs["out"]]
         return ([image_feature_item], {"out": out_buffer})

     def _process_outputs(self, outputs: list, **kwargs):
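The `LoopProcessor` subclasses shown here drive a fixed-batch compiled graph over a larger batch one item at a time, writing each result into a caller-provided `out` buffer; the rc0 side always slices that buffer per batch index. A self-contained sketch of that iteration contract, with illustrative names rather than the library's actual API:

```python
# Sketch of the loop-processor contract suggested by the hunk above: a
# batch-1 callable is applied per item, and results land in a shared
# preallocated buffer. Names are illustrative, not the library's API.
import torch

def run_in_loop(model_fn, image_feature: torch.Tensor, out: torch.Tensor) -> torch.Tensor:
    batch_size = image_feature.shape[0]          # what _get_batch_size returns
    for index in range(batch_size):
        item = image_feature[index : index + 1]   # one-sample input slice
        out_slice = out[index : index + 1]        # matching output slice
        out_slice.copy_(model_fn(item))           # write the result in place
    return out

# Usage: project a batch of 4 feature maps with a batch-1 projector.
projector = torch.nn.Linear(64, 32)
feats = torch.randn(4, 10, 64)
out = run_in_loop(projector, feats, torch.empty(4, 10, 32))
```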
@@ -182,7 +175,9 @@ class RBLNLlavaForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixi
         return True

     @classmethod
-    def
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+
         with no_init_weights():
             model_cls_name = model.model.language_model.__class__.__name__
             causal_model_cls_name = model_cls_name.replace("Model", "ForCausalLM")
@@ -199,7 +194,7 @@ class RBLNLlavaForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixi
     def __post_init__(self, **kwargs):
         self.vision_tower = LoopVisionTower(self.rbln_submodules[0])
         self.language_model = self.rbln_submodules[1]
-        self.multi_modal_projector = LoopProjector(self.model[0]
+        self.multi_modal_projector = LoopProjector(self.model[0])
         self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
         return super().__post_init__(**kwargs)

@@ -213,7 +208,7 @@ class RBLNLlavaForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixi
         return self.language_model.get_input_embeddings()

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
         return model.multi_modal_projector

     @classmethod
@@ -226,8 +221,10 @@ class RBLNLlavaForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixi
     ) -> RBLNModelConfig:
         # support for pixtral that needs padding
         if hasattr(rbln_config.vision_tower, "max_image_size"):
-            num_positions = (
-                rbln_config.
+            num_positions = (
+                rbln_config.batch_size
+                * (rbln_config.vision_tower.max_image_size[0] // model_config.vision_config.patch_size)
+                * (rbln_config.vision_tower.max_image_size[1] // model_config.vision_config.patch_size)
             )
             selected_image_feature_dim = num_positions

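For concreteness, a worked instance of the rc0-side `num_positions` formula with hypothetical numbers (none of these values come from the diff):

```python
# Hypothetical values: a 1024x1024 max image with 16-pixel patches.
batch_size = 1
max_image_size = (1024, 1024)
patch_size = 16

num_positions = (
    batch_size
    * (max_image_size[0] // patch_size)  # 64 patch rows
    * (max_image_size[1] // patch_size)  # 64 patch columns
)
print(num_positions)  # 1 * 64 * 64 = 4096
```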
@@ -356,32 +353,23 @@ class RBLNLlavaForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixi

         if hasattr(self.rbln_config.vision_tower, "max_image_size"):
             num_real_patches = selected_image_feature.shape[1]
-            max_patches = (
-                self.rbln_config.vision_tower.max_image_size[
+            max_patches = (
+                (self.rbln_config.vision_tower.max_image_size[0] // self.config.vision_config.patch_size)
+                * (self.rbln_config.vision_tower.max_image_size[1] // self.config.vision_config.patch_size)
+                * pixel_values.shape[0]
             )
+            num_padding_patches = max_patches - num_real_patches

-
-            for i in range(0, num_real_patches, max_patches):
-                chunk = selected_image_feature[:, i : i + max_patches, :]
-                chunk_size = chunk.shape[1]
-
-                if chunk_size < max_patches:
-                    padding_tensor = torch.zeros(
-                        (selected_image_feature.shape[0], max_patches - chunk_size, selected_image_feature.shape[2]),
-                        dtype=selected_image_feature.dtype,
-                    )
-                    chunk = torch.cat([chunk, padding_tensor], dim=1)
-                chunks.append(chunk)
-
-            split_features = torch.cat(chunks, dim=0)
-            num_chunks = len(chunks)
-            projector_out_size = [1, max_patches * num_chunks, self.config.text_config.hidden_size]
+            projector_out_size = [1, max_patches, self.config.text_config.hidden_size]
             projector_out_buffer = [torch.empty(size=projector_out_size, dtype=torch.float32, device="cpu")]
-
-
-                selected_image_feature.shape[0],
+
+            padding_tensor = torch.zeros(
+                (selected_image_feature.shape[0], num_padding_patches, selected_image_feature.shape[2]),
+                dtype=selected_image_feature.dtype,
+            )
-
+            padded_feature = torch.cat([selected_image_feature, padding_tensor], dim=1)
+            padded_projected_feature = self.multi_modal_projector(padded_feature, out=projector_out_buffer)
+            image_features = padded_projected_feature[:, :num_real_patches, :]
         else:
             projector_out_size = [
                 pixel_values.shape[0] * pixel_values.shape[1],
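The rewritten rc0-side branch replaces the 0.9.3 chunk loop with a single pad, project, and slice pass. A self-contained sketch of that strategy, with a plain `torch.nn.Linear` standing in for the compiled projector and the buffer-based `out=` convention simplified away:

```python
# Sketch of the pad -> project -> slice strategy above; `projector` stands in
# for the compiled multi-modal projector, which expects a fixed patch count.
import torch

def project_padded(projector, feats: torch.Tensor, max_patches: int) -> torch.Tensor:
    num_real_patches = feats.shape[1]
    padding = torch.zeros(
        (feats.shape[0], max_patches - num_real_patches, feats.shape[2]),
        dtype=feats.dtype,
    )
    padded = torch.cat([feats, padding], dim=1)   # fixed shape for the compiled graph
    projected = projector(padded)
    return projected[:, :num_real_patches, :]     # drop the padded tail

projector = torch.nn.Linear(64, 128)
feats = torch.randn(1, 1000, 64)                  # 1000 real patches
out = project_padded(projector, feats, max_patches=4096)
print(out.shape)                                  # torch.Size([1, 1000, 128])
```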
optimum/rbln/transformers/models/llava_next/modeling_llava_next.py

@@ -139,7 +139,9 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGeneration
         return True

     @classmethod
-    def
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+
         with no_init_weights():
             model_cls_name = model.model.language_model.__class__.__name__
             causal_model_cls_name = model_cls_name.replace("Model", "ForCausalLM")
@@ -190,7 +192,7 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGeneration
         return self.language_model.get_input_embeddings()

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
         return model.multi_modal_projector

     @classmethod
optimum/rbln/transformers/models/opt/modeling_opt.py

@@ -69,7 +69,7 @@ class RBLNOPTForCausalLM(RBLNDecoderOnlyModelForCausalLM):
         return layer

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
         for i in range(len(model.model.decoder.layers)):
             model.model.decoder.layers[i] = cls.modify_opt_decoder_layer(model.model.decoder.layers[i])

@@ -95,7 +95,7 @@ class RBLNOPTModel(RBLNDecoderOnlyModel):
         return layer

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
         for i in range(len(model.decoder.layers)):
             model.decoder.layers[i] = cls.modify_opt_decoder_layer(model.decoder.layers[i])

optimum/rbln/transformers/models/pegasus/modeling_pegasus.py

@@ -54,7 +54,7 @@ class RBLNPegasusForConditionalGeneration(RBLNModelForSeq2SeqLM):
     support_causal_attn = True

     @classmethod
-    def
+    def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: RBLNPegasusForConditionalGenerationConfig):
         return PegasusWrapper(
             model, enc_max_seq_len=rbln_config.enc_max_seq_len, use_attention_mask=rbln_config.use_attention_mask
         )
optimum/rbln/transformers/models/pixtral/modeling_pixtral.py

@@ -229,7 +229,7 @@ class RBLNPixtralVisionModel(RBLNModel):
         torch.save(save_dict, save_dir_path / subfolder / "torch_artifacts.pth")

     @classmethod
-    def
+    def wrap_model_if_needed(
         cls, model: torch.nn.Module, rbln_config: RBLNPixtralVisionModelConfig
     ) -> torch.nn.Module:
         wrapper_cfg = {
@@ -293,18 +293,6 @@ class RBLNPixtralVisionModel(RBLNModel):
         return_dict: bool = True,
         **kwargs,
     ) -> Union[Tuple, BaseModelOutput]:
-        """
-        Forward pass for the RBLN-optimized Pixtral vision model.
-
-        Args:
-            pixel_values (torch.Tensor of shape (batch_size, num_channels, image_size, image_size)) — The tensors corresponding to the input images. Pixel values can be obtained using PixtralImageProcessor. See PixtralImageProcessor.call() for details (PixtralProcessor uses PixtralImageProcessor for processing images).
-            image_sizes (torch.Tensor of shape (batch_size, 2), optional) — The sizes of the images in the batch, being (height, width) for each image.
-            output_hidden_states (bool, optional) — Whether or not to return the hidden states of all layers. See hidden_states under returned tensors for more detail.
-            return_dict (bool, optional) — Whether or not to return a ModelOutput instead of a plain tuple.
-
-        Returns:
-            BaseModelOutput or tuple(torch.FloatTensor)
-        """
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.rbln_config.output_hidden_states
         )
optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py

@@ -88,7 +88,7 @@ class RBLNQwen2_5_VisionTransformerPretrainedModel(RBLNModel):
         torch.save(save_dict, save_dir_path / subfolder / "torch_artifacts.pth")

     @classmethod
-    def
+    def wrap_model_if_needed(
         cls, model: "PreTrainedModel", rbln_config: RBLNQwen2_5_VisionTransformerPretrainedModelConfig
     ):
         return Qwen2_5_VisionTransformerWrapper(model).eval()
@@ -393,7 +393,8 @@ class RBLNQwen2_5_VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
         return True

     @classmethod
-    def
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
         model.model.lm_head = model.lm_head
         model.lm_head = None
         del model.lm_head
optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py

@@ -89,7 +89,7 @@ class RBLNQwen2VisionTransformerPretrainedModel(RBLNModel):
         torch.save(save_dict, save_dir_path / subfolder / "torch_artifacts.pth")

     @classmethod
-    def
+    def wrap_model_if_needed(
         cls, model: "PreTrainedModel", rbln_config: RBLNQwen2VisionTransformerPretrainedModelConfig
     ):
         return Qwen2VisionTransformerWrapper(model).eval()
@@ -282,7 +282,8 @@ class RBLNQwen2VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
         return True

     @classmethod
-    def
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
         model.model.lm_head = model.lm_head
         model.lm_head = None
         del model.lm_head
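Both Qwen2-VL variants use the rc0-side hook to fold the top-level `lm_head` into the inner `model` module, presumably so the decoder stack and output head compile as one graph. A toy illustration of the same relocation (the hunks end before any `return`, so everything beyond the three shown statements is assumption):

```python
# Toy illustration of the lm_head relocation in the hunks above.
import torch

class Toy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = torch.nn.Linear(8, 8)    # stand-in for the decoder stack
        self.lm_head = torch.nn.Linear(8, 16)

toy = Toy()
toy.model.lm_head = toy.lm_head  # relocate the head under the inner module
toy.lm_head = None               # detach the top-level reference
del toy.lm_head                  # remove the dangling attribute entirely

print(hasattr(toy, "lm_head"), hasattr(toy.model, "lm_head"))  # False True
```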
optimum/rbln/transformers/models/resnet/configuration_resnet.py

@@ -13,8 +13,6 @@
 # limitations under the License.


-from typing import Optional
-
 from ...configuration_generic import RBLNModelForImageClassificationConfig


@@ -25,18 +23,3 @@ class RBLNResNetForImageClassificationConfig(RBLNModelForImageClassificationConf
     This configuration class stores the configuration parameters specific to
     RBLN-optimized ResNet models for image classification tasks.
     """
-
-    def __init__(self, output_hidden_states: Optional[bool] = None, **kwargs):
-        """
-        Args:
-            image_size (Optional[Union[int, Tuple[int, int]]]): The size of input images.
-                Can be an integer for square images or a tuple (height, width).
-            batch_size (Optional[int]): The batch size for inference. Defaults to 1.
-            output_hidden_states (bool, optional) — Whether or not to return the hidden states of all layers.
-            kwargs: Additional arguments passed to the parent RBLNModelConfig.
-
-        Raises:
-            ValueError: If batch_size is not a positive integer.
-        """
-        super().__init__(**kwargs)
-        self.output_hidden_states = output_hidden_states
optimum/rbln/transformers/models/resnet/modeling_resnet.py

@@ -13,17 +13,7 @@
 # limitations under the License.


-from typing import TYPE_CHECKING, Optional, Tuple, Union
-
-import torch
-from transformers.modeling_outputs import ImageClassifierOutputWithNoAttention
-
 from ...modeling_generic import RBLNModelForImageClassification
-from .configuration_resnet import RBLNResNetForImageClassificationConfig
-
-
-if TYPE_CHECKING:
-    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PretrainedConfig, PreTrainedModel


 class RBLNResNetForImageClassification(RBLNModelForImageClassification):
@@ -34,66 +24,3 @@ class RBLNResNetForImageClassification(RBLNModelForImageClassification):
     on RBLN devices, supporting image classification with convolutional neural networks
     designed for computer vision tasks.
     """
-
-    @classmethod
-    def _update_rbln_config(
-        cls,
-        preprocessors: Optional[Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"]] = None,
-        model: Optional["PreTrainedModel"] = None,
-        model_config: Optional["PretrainedConfig"] = None,
-        rbln_config: Optional["RBLNResNetForImageClassificationConfig"] = None,
-    ) -> "RBLNResNetForImageClassificationConfig":
-        if rbln_config.output_hidden_states is None:
-            rbln_config.output_hidden_states = getattr(model_config, "output_hidden_states", False)
-
-        rbln_config = super()._update_rbln_config(
-            preprocessors=preprocessors,
-            model=model,
-            model_config=model_config,
-            rbln_config=rbln_config,
-        )
-
-        return rbln_config
-
-    @classmethod
-    def _wrap_model_if_needed(
-        cls, model: torch.nn.Module, rbln_config: "RBLNResNetForImageClassificationConfig"
-    ) -> torch.nn.Module:
-        class _ResNetForImageClassification(torch.nn.Module):
-            def __init__(self, model: torch.nn.Module, output_hidden_states: bool):
-                super().__init__()
-                self.model = model
-                self.output_hidden_states = output_hidden_states
-
-            def forward(self, *args, **kwargs):
-                output = self.model(*args, output_hidden_states=self.output_hidden_states, **kwargs)
-                return output
-
-        return _ResNetForImageClassification(model, rbln_config.output_hidden_states)
-
-    def forward(
-        self, pixel_values: torch.Tensor, output_hidden_states: bool = None, return_dict: bool = None, **kwargs
-    ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
-        """
-        Foward pass for the RBLN-optimized ResNet model for image classification.
-
-        Args:
-            pixel_values (torch.FloatTensor of shape (batch_size, channels, height, width)): The tensors corresponding to the input images.
-            output_hidden_states (bool, *optional*, defaults to False): Whether or not to return the hidden states of all layers.
-                See hidden_states under returned tensors for more details.
-            return_dict (bool, *optional*, defaults to True): Whether to return a dictionary of outputs.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a ImageClassifierOutputWithNoAttention object.
-        """
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.rbln_config.output_hidden_states
-        )
-
-        if output_hidden_states != self.rbln_config.output_hidden_states:
-            raise ValueError(
-                f"Variable output_hidden_states {output_hidden_states} is not equal to rbln_config.output_hidden_states {self.rbln_config.output_hidden_states} "
-                f"Please compile again with the correct argument."
-            )
-
-        return super().forward(pixel_values=pixel_values, return_dict=return_dict, **kwargs)
optimum/rbln/transformers/models/roberta/modeling_roberta.py

@@ -12,11 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Tuple, Union
-
-import torch
-from transformers.modeling_outputs import MaskedLMOutput, SequenceClassifierOutput
-
 from ...modeling_generic import RBLNModelForMaskedLM, RBLNModelForSequenceClassification


@@ -31,19 +26,6 @@ class RBLNRobertaForMaskedLM(RBLNModelForMaskedLM):

     rbln_model_input_names = ["input_ids", "attention_mask"]

-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Union[Tuple, MaskedLMOutput]:
-        """
-        Forward pass for the RBLN-optimized RoBERTa model for masked language modeling tasks.
-
-        Args:
-            input_ids (torch.LongTensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
-            attention_mask (torch.FloatTensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a MaskedLMOutput object.
-        """
-        return super().forward(input_ids, attention_mask, **kwargs)
-

 class RBLNRobertaForSequenceClassification(RBLNModelForSequenceClassification):
     """
@@ -55,18 +37,3 @@ class RBLNRobertaForSequenceClassification(RBLNModelForSequenceClassification):
     """

     rbln_model_input_names = ["input_ids", "attention_mask"]
-
-    def forward(
-        self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs
-    ) -> Union[Tuple, SequenceClassifierOutput]:
-        """
-        Forward pass for the RBLN-optimized RoBERTa model for sequence classification tasks.
-
-        Args:
-            input_ids (torch.LongTensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
-            attention_mask (torch.FloatTensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a SequenceClassifierOutput object.
-        """
-        return super().forward(input_ids, attention_mask, **kwargs)
optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py

@@ -15,7 +15,6 @@
 from typing import Any, Optional

 from ....configuration_utils import RBLNModelConfig
-from ....utils.deprecation import deprecate_kwarg
 from ....utils.logging import get_logger


@@ -25,13 +24,13 @@ logger = get_logger()
 class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
     support_paged_attention = None

-    @deprecate_kwarg(old_name="pad_token_id", version="0.10.0")
     def __init__(
         self,
         batch_size: Optional[int] = None,
         enc_max_seq_len: Optional[int] = None,
         dec_max_seq_len: Optional[int] = None,
         use_attention_mask: Optional[bool] = None,
+        pad_token_id: Optional[int] = None,
         kvcache_num_blocks: Optional[int] = None,
         kvcache_block_size: Optional[int] = None,
         **kwargs: Any,
@@ -42,6 +41,7 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
             enc_max_seq_len (Optional[int]): Maximum sequence length for the encoder.
             dec_max_seq_len (Optional[int]): Maximum sequence length for the decoder.
             use_attention_mask (Optional[bool]): Whether to use attention masks during inference.
+            pad_token_id (Optional[int]): The ID of the padding token in the vocabulary.
             kvcache_num_blocks (Optional[int]): The total number of blocks to allocate for the
                 PagedAttention KV cache for the SelfAttention. Defaults to batch_size.
             kvcache_block_size (Optional[int]): Sets the size (in number of tokens) of each block
@@ -61,6 +61,8 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):

         self.use_attention_mask = use_attention_mask

+        self.pad_token_id = pad_token_id
+
         if self.support_paged_attention:
             self.kvcache_num_blocks = kvcache_num_blocks
             self.kvcache_block_size = kvcache_block_size
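Net effect of the seq2seq-config hunks: 0.9.3 funnels `pad_token_id` through `@deprecate_kwarg` (removal slated for 0.10.0), while 0.9.3rc0 keeps it as a first-class `__init__` parameter stored on the config. A minimal sketch against the rc0-side signature; in practice a concrete subclass would be used, and the direct module path is taken from the file list above:

```python
# Minimal sketch against the rc0-side __init__ shown above. Concrete
# subclasses (e.g. the T5 or BART configs) would normally be used instead
# of the base class; all argument values here are illustrative.
from optimum.rbln.transformers.models.seq2seq.configuration_seq2seq import (
    RBLNModelForSeq2SeqLMConfig,
)

config = RBLNModelForSeq2SeqLMConfig(
    batch_size=1,
    enc_max_seq_len=512,
    dec_max_seq_len=256,
    use_attention_mask=True,
    pad_token_id=0,  # explicit parameter in 0.9.3rc0; deprecated kwarg in 0.9.3
)
print(config.pad_token_id)  # 0
```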