optimum-rbln 0.9.3__py3-none-any.whl → 0.9.3rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +0 -12
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +2 -4
- optimum/rbln/diffusers/__init__.py +0 -12
- optimum/rbln/diffusers/configurations/__init__.py +0 -3
- optimum/rbln/diffusers/configurations/models/__init__.py +0 -2
- optimum/rbln/diffusers/configurations/pipelines/__init__.py +0 -3
- optimum/rbln/diffusers/models/__init__.py +3 -17
- optimum/rbln/diffusers/models/autoencoders/__init__.py +0 -1
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
- optimum/rbln/diffusers/models/autoencoders/vae.py +8 -27
- optimum/rbln/diffusers/models/controlnet.py +1 -16
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +2 -16
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +1 -16
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +1 -14
- optimum/rbln/diffusers/models/unets/__init__.py +0 -1
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +1 -17
- optimum/rbln/diffusers/pipelines/__init__.py +0 -4
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +0 -20
- optimum/rbln/modeling.py +45 -20
- optimum/rbln/modeling_base.py +1 -0
- optimum/rbln/transformers/configuration_generic.py +27 -0
- optimum/rbln/transformers/modeling_attention_utils.py +109 -242
- optimum/rbln/transformers/modeling_generic.py +61 -2
- optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +2 -28
- optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +5 -68
- optimum/rbln/transformers/models/bart/modeling_bart.py +2 -23
- optimum/rbln/transformers/models/bert/modeling_bert.py +1 -86
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +15 -42
- optimum/rbln/transformers/models/clip/modeling_clip.py +2 -40
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +44 -5
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +1 -6
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +2 -6
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +9 -17
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +12 -36
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +0 -17
- optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +0 -24
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +0 -17
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +5 -3
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +8 -24
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +5 -3
- optimum/rbln/transformers/models/llava/modeling_llava.py +24 -36
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +4 -2
- optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +1 -13
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +3 -2
- optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +3 -2
- optimum/rbln/transformers/models/resnet/configuration_resnet.py +0 -17
- optimum/rbln/transformers/models/resnet/modeling_resnet.py +0 -73
- optimum/rbln/transformers/models/roberta/modeling_roberta.py +0 -33
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +4 -2
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +10 -34
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +1 -17
- optimum/rbln/transformers/models/swin/modeling_swin.py +1 -14
- optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +2 -16
- optimum/rbln/transformers/models/vit/modeling_vit.py +0 -19
- optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +3 -15
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +8 -60
- optimum/rbln/transformers/models/whisper/generation_whisper.py +14 -48
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +0 -43
- optimum/rbln/transformers/utils/rbln_quantization.py +0 -9
- optimum/rbln/utils/depreacate_utils.py +16 -0
- optimum/rbln/utils/hub.py +3 -14
- optimum/rbln/utils/runtime_utils.py +0 -32
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/METADATA +2 -2
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/RECORD +72 -79
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/WHEEL +1 -1
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +0 -67
- optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +0 -59
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +0 -114
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +0 -275
- optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +0 -201
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +0 -15
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +0 -46
- optimum/rbln/utils/deprecation.py +0 -213
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/entry_points.txt +0 -0
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/licenses/LICENSE +0 -0
```diff
--- optimum/rbln/transformers/models/bart/modeling_bart.py (0.9.3)
+++ optimum/rbln/transformers/models/bart/modeling_bart.py (0.9.3rc0)
@@ -13,11 +13,9 @@
 # limitations under the License.

 import inspect
-from typing import Any, Callable, Optional, Tuple, Union
+from typing import Any, Callable

-import torch
 from transformers import BartForConditionalGeneration, PreTrainedModel
-from transformers.modeling_outputs import Seq2SeqModelOutput

 from ....utils.logging import get_logger
 from ...modeling_generic import RBLNTransformerEncoderForFeatureExtraction
@@ -37,25 +35,6 @@ class RBLNBartModel(RBLNTransformerEncoderForFeatureExtraction):
     on RBLN devices, optimized for feature extraction use cases.
     """

-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Union[Tuple, Seq2SeqModelOutput]:
-        """
-        Forward pass for the RBLN-optimized BART model for feature extraction tasks.
-
-        Args:
-            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
-            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a Seq2SeqModelOutput object.
-        """
-
-        return super().forward(input_ids, attention_mask, **kwargs)
-

 class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
     """
@@ -69,7 +48,7 @@ class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
     support_causal_attn = True

     @classmethod
-    def
+    def wrap_model_if_needed(self, model: PreTrainedModel, rbln_config: RBLNBartForConditionalGenerationConfig):
         return BartWrapper(
             model, enc_max_seq_len=rbln_config.enc_max_seq_len, use_attention_mask=rbln_config.use_attention_mask
         )
```
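For context, a minimal export sketch for this class, assuming the usual optimum-rbln `from_pretrained(..., export=True)` compile flow, a top-level re-export of the class, and available RBLN hardware; the checkpoint id is illustrative, not taken from the diff:

```python
from optimum.rbln import RBLNBartForConditionalGeneration

# Compile a HuggingFace BART checkpoint for RBLN NPUs (illustrative checkpoint id).
model = RBLNBartForConditionalGeneration.from_pretrained(
    "facebook/bart-base",
    export=True,  # convert and compile the PyTorch model instead of loading a pre-compiled one
)
model.save_pretrained("bart-base-rbln")  # persist the compiled artifacts
```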
```diff
--- optimum/rbln/transformers/models/bert/modeling_bert.py (0.9.3)
+++ optimum/rbln/transformers/models/bert/modeling_bert.py (0.9.3rc0)
@@ -12,14 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional, Tuple, Union
-
 import torch
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPoolingAndCrossAttentions,
-    MaskedLMOutput,
-    QuestionAnsweringModelOutput,
-)

 from ...modeling_generic import (
     RBLNModelForMaskedLM,
@@ -42,45 +35,9 @@ class RBLNBertModel(RBLNTransformerEncoderForFeatureExtraction):
     rbln_model_input_names = ["input_ids", "attention_mask"]

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNBertModelConfig) -> torch.nn.Module:
         return BertModelWrapper(model, rbln_config)

-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, Tuple]:
-        """
-        Forward pass for the RBLN-optimized BERT model for feature extraction tasks.
-
-        Args:
-            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
-            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
-            token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
-            position_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of positions of each input sequence tokens in the position embeddings.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPoolingAndCrossAttentions object.
-        """
-
-        input_map = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "token_type_ids": token_type_ids,
-            "position_ids": position_ids,
-        }
-
-        model_input_names = getattr(self.rbln_config, "model_input_names", None)
-        if model_input_names is None:
-            model_input_names = self.rbln_model_input_names
-
-        ordered_inputs = [input_map[name] for name in model_input_names if name in input_map]
-
-        return super().forward(*ordered_inputs, **kwargs)
-

 class RBLNBertForMaskedLM(RBLNModelForMaskedLM):
     """
@@ -93,27 +50,6 @@ class RBLNBertForMaskedLM(RBLNModelForMaskedLM):

     rbln_model_input_names = ["input_ids", "attention_mask", "token_type_ids"]

-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Union[MaskedLMOutput, Tuple]:
-        """
-        Forward pass for the RBLN-optimized BERT model for masked language modeling tasks.
-
-        Args:
-            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
-            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
-            token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a MaskedLMOutput object.
-        """
-
-        return super().forward(input_ids, attention_mask, token_type_ids, **kwargs)
-

 class RBLNBertForQuestionAnswering(RBLNModelForQuestionAnswering):
     """
@@ -125,24 +61,3 @@ class RBLNBertForQuestionAnswering(RBLNModelForQuestionAnswering):
     """

     rbln_model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
-
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Union[QuestionAnsweringModelOutput, Tuple]:
-        """
-        Forward pass for the RBLN-optimized BERT model for question answering tasks.
-
-        Args:
-            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
-            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
-            token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a QuestionAnsweringModelOutput object.
-        """
-
-        return super().forward(input_ids, attention_mask, token_type_ids, **kwargs)
```
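The forward override removed from RBLNBertModel (the 0.9.3 side of the hunk above) does one non-obvious thing before delegating to the base class: it reorders keyword inputs into the positional order declared by `rbln_model_input_names`. A self-contained restatement of that ordering step, with illustrative values:

```python
def order_inputs(input_map: dict, model_input_names: list) -> list:
    # Same list comprehension as the removed body: keep only declared names, in declared order.
    return [input_map[name] for name in model_input_names if name in input_map]

inputs = {"input_ids": "ids", "attention_mask": "mask", "token_type_ids": "tt", "position_ids": "pos"}
assert order_inputs(inputs, ["input_ids", "attention_mask"]) == ["ids", "mask"]
```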
```diff
--- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py (0.9.3)
+++ optimum/rbln/transformers/models/blip_2/modeling_blip_2.py (0.9.3rc0)
@@ -14,7 +14,7 @@

 import inspect
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable,
+from typing import TYPE_CHECKING, Any, Callable, Optional, Tuple, Union

 import torch
 from transformers import (
@@ -71,7 +71,7 @@ class RBLNBlip2VisionModel(RBLNModel):
         return self.embeddings

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
         class Blip2VisionModelWrapper(torch.nn.Module):
             def __init__(self, model: "Blip2VisionModel") -> None:
                 super().__init__()
@@ -111,20 +111,11 @@ class RBLNBlip2VisionModel(RBLNModel):
     def forward(
         self,
         pixel_values: torch.FloatTensor,
-        interpolate_pos_encoding: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
     ) -> Union[Tuple, BaseModelOutputWithPooling]:
-        """
-        Forward pass for the RBLN-optimized Blip2VisionModel model.
-
-        Args:
-            pixel_values (torch.FloatTensor of shape (batch_size, num_channels, height, width)): The tensors corresponding to the input images.
-            interpolate_pos_encoding (bool, optional): Whether to interpolate the positional encoding of the image embeddings. Defaults to False.
-            return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
-
-        Returns:
-            BaseModelOutputWithPooling or tuple(torch.FloatTensor): The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPooling object.
-        """
         batch_size = pixel_values.shape[0]
         outputs = []
         for i in range(batch_size):
@@ -160,7 +151,7 @@ class RBLNBlip2QFormerModel(RBLNModel):
         return self.embeddings.word_embeddings

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
         class Blip2QFormerModelWrapper(torch.nn.Module):
             def __init__(self, model: "Blip2QFormerModel"):
                 super().__init__()
@@ -240,22 +231,17 @@ class RBLNBlip2QFormerModel(RBLNModel):
     def forward(
         self,
         query_embeds: torch.FloatTensor,
+        query_length: Optional[int] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
         encoder_hidden_states: Optional[torch.FloatTensor] = None,
         encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
-        """
-        The forward pass for the RBLN-optimized Blip2QFormerModel model.
-
-        Args:
-            query_embeds (torch.FloatTensor): Hidden states to be used in the attention computation.
-            encoder_hidden_states (torch.FloatTensor, optional): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder.
-            encoder_attention_mask (torch.FloatTensor, optional): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder.
-            return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
-
-        Returns:
-            BaseModelOutputWithPoolingAndCrossAttentions or tuple(torch.FloatTensor): The model outputs. If `return_dict=False` is passed, returns a tuple of tensors. Otherwise, returns a `BaseModelOutputWithPoolingAndCrossAttentions` object.
-        """
         batch_size = query_embeds.shape[0]
         outputs = []
         for i in range(batch_size):
@@ -363,7 +349,7 @@ class RBLNBlip2ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixin):
         return self.language_model.get_input_embeddings()

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model, rbln_config):
         return model.language_projection

     @classmethod
@@ -458,20 +444,7 @@
         inputs_embeds: Optional[torch.FloatTensor] = None,
         interpolate_pos_encoding: bool = False,
         **generate_kwargs,
-    ) ->
-        """
-        The generate function is utilized in its standard form as in the HuggingFace transformers library. User can use this function to generate text from the model.
-        Check the [HuggingFace transformers documentation](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/blip-2#transformers.Blip2ForConditionalGeneration.generate) for more details.
-
-        Args:
-            pixel_values (torch.FloatTensor): Input images to be processed.
-            input_ids (torch.LongTensor, optional): The sequence used as a prompt for the generation.
-            attention_mask (torch.LongTensor, optional): Mask to avoid performing attention on padding token indices
-            inputs_embeds (torch.FloatTensor, optional): Embedded representation of the inputs. Should be float, not int tokens.
-            interpolate_pos_encoding (bool, optional, defaults to False) — Whether to interpolate the positional encoding of the image embeddings.
-        Returns:
-            A list of strings of length batch_size * num_captions.
-        """
+    ) -> torch.LongTensor:
         batch_size = pixel_values.shape[0]
         image_embeds = self.vision_model(
             pixel_values,
```
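Both forward hunks above keep the same dispatch pattern: the compiled runtime is invoked once per batch element and the per-sample outputs are concatenated. A standalone sketch of that pattern, with a stand-in callable (illustrative, not the real compiled RBLN runtime):

```python
import torch

def run_per_sample(runtime, batched_input: torch.Tensor) -> torch.Tensor:
    # Mirror of the `for i in range(batch_size)` loops above: one runtime call per sample.
    outputs = [runtime(batched_input[i : i + 1]) for i in range(batched_input.shape[0])]
    return torch.cat(outputs, dim=0)

# Toy check with an identity "runtime":
assert run_per_sample(lambda x: x, torch.randn(4, 3, 224, 224)).shape == (4, 3, 224, 224)
```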
```diff
--- optimum/rbln/transformers/models/clip/modeling_clip.py (0.9.3)
+++ optimum/rbln/transformers/models/clip/modeling_clip.py (0.9.3rc0)
@@ -54,7 +54,7 @@ class RBLNCLIPTextModel(RBLNModel):
     _tp_support = False

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPTextModelConfig) -> torch.nn.Module:
         return _TextEncoder(model).eval()

     @classmethod
@@ -92,9 +92,6 @@ class RBLNCLIPTextModel(RBLNModel):
         Args:
             input_ids (torch.LongTensor): The input ids to the model.
             return_dict (Optional[bool]): Whether to return a dictionary of outputs.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a CLIPTextModelOutput object.
         """

         # To ignore using attention_mask, we override forward method.
@@ -160,7 +157,7 @@ class RBLNCLIPVisionModel(RBLNModel):
     _tp_support = False

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPVisionModelConfig) -> torch.nn.Module:
         wrapper_cfg = {
             "interpolate_pos_encoding": rbln_config.interpolate_pos_encoding,
             "output_hidden_states": rbln_config.output_hidden_states,
@@ -233,9 +230,6 @@ class RBLNCLIPVisionModel(RBLNModel):
             output_attentions (Optional[bool]): Whether to return attentions.
             output_hidden_states (Optional[bool]): Whether to return hidden states.
             interpolate_pos_encoding (bool): Whether to interpolate position encoding.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPooling object.
         """

         if len(kwargs) > 0 and any(value is not None for value in kwargs.values()):
@@ -313,38 +307,6 @@ class RBLNCLIPVisionModelWithProjection(RBLNCLIPVisionModel):
     multimodal embedding alignment tasks.
     """

-    def forward(
-        self,
-        pixel_values: torch.FloatTensor,
-        return_dict: bool = True,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        interpolate_pos_encoding: bool = False,
-        **kwargs,
-    ) -> Union[Tuple, CLIPVisionModelOutput]:
-        """
-        Forward pass for the RBLN-optimized CLIP vision encoder model with projection.
-
-        Args:
-            pixel_values (torch.Tensor): The pixel values to the model.
-            return_dict (bool): Whether to return a dictionary of outputs.
-            output_attentions (Optional[bool]): Whether to return attentions.
-            output_hidden_states (Optional[bool]): Whether to return hidden states.
-            interpolate_pos_encoding (bool): Whether to interpolate position encoding.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a CLIPVisionModelOutput object.
-        """
-
-        return super().forward(
-            pixel_values=pixel_values,
-            return_dict=return_dict,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            interpolate_pos_encoding=interpolate_pos_encoding,
-            **kwargs,
-        )
-
     def _prepare_output(self, output, return_dict):
         # Prepare model output based on return_dict flag.
         # This method can be overridden by subclasses to provide task-specific output handling.
```
```diff
--- optimum/rbln/transformers/models/colpali/modeling_colpali.py (0.9.3)
+++ optimum/rbln/transformers/models/colpali/modeling_colpali.py (0.9.3rc0)
@@ -14,7 +14,8 @@

 import bisect
 from pathlib import Path
-from
+from tempfile import TemporaryDirectory
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union

 import torch
 from transformers import PretrainedConfig, PreTrainedModel
@@ -181,7 +182,7 @@ class RBLNColPaliForRetrieval(RBLNModel):
         return multi_modal_projector

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
         return RBLNColPaliForRetrievalWrapper(
             causal_lm=model.vlm,
             embedding_proj_layer=model.embedding_proj_layer,
@@ -235,11 +236,49 @@ class RBLNColPaliForRetrieval(RBLNModel):
         return rbln_config

     @classmethod
-    def
-
+    def from_model(
+        cls,
+        model: "PreTrainedModel",
+        config: Optional[PretrainedConfig] = None,
+        rbln_config: Optional[Union[RBLNModelConfig, Dict]] = None,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        subfolder: str = "",
+        **kwargs: Any,
+    ) -> "RBLNModel":
+        """
+        Converts and compiles a pre-trained HuggingFace library model into a RBLN model.
+        This method performs the actual model conversion and compilation process.
+
+        Args:
+            model (PreTrainedModel): The PyTorch model to be compiled.
+                The object must be an instance of the HuggingFace transformers PreTrainedModel class.
+            config (Optional[PretrainedConfig]): The configuration object associated with the model.
+            rbln_config (Optional[Union[RBLNModelConfig, Dict]]): Configuration for RBLN model compilation and runtime.
+                This can be provided as a dictionary or an instance of the model's configuration class (e.g., `RBLNLlamaForCausalLMConfig` for Llama models).
+                For detailed configuration options, see the specific model's configuration class documentation.
+            kwargs: Additional keyword arguments. Arguments with the prefix `rbln_` are passed to rbln_config, while the remaining arguments are passed to the HuggingFace library.
+
+        The method performs the following steps:
+
+        1. Compiles the PyTorch model into an optimized RBLN graph
+        2. Configures the model for the specified NPU device
+        3. Creates the necessary runtime objects if requested
+        4. Saves the compiled model and configurations
+
+        Returns:
+            (RBLNModel): A RBLN model instance ready for inference on RBLN NPU devices.
+        """
+        if not hasattr(model, "vision_tower"):
             model.vision_tower = model.vlm.vision_tower
             del model.vlm.model.vision_tower
-
+        model = super().from_model(model, config, rbln_config, model_save_dir, subfolder, **kwargs)
+        return model
+
+    @classmethod
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+        model.vision_tower = model.vlm.vision_tower
+        del model.vlm.model.vision_tower
         return model

     def get_image_features(self, pixel_values: torch.Tensor):
```
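A hedged usage sketch of the `from_model` entry point documented above; the checkpoint id, the `batch_size` key, and the top-level `optimum.rbln` re-export are assumptions for illustration, not taken from the diff:

```python
from transformers import ColPaliForRetrieval
from optimum.rbln import RBLNColPaliForRetrieval

# Load the PyTorch model, then hand it to from_model for conversion and compilation.
torch_model = ColPaliForRetrieval.from_pretrained("vidore/colpali-v1.3-hf")  # illustrative checkpoint
rbln_model = RBLNColPaliForRetrieval.from_model(
    torch_model,
    rbln_config={"batch_size": 1},  # assumed key; rbln_config also accepts an RBLNModelConfig instance
)
rbln_model.save_pretrained("colpali-rbln")
```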
```diff
--- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py (0.9.3)
+++ optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py (0.9.3rc0)
@@ -274,18 +274,13 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):

     @property
     def use_lora(self):
+        """Check if LoRA is enabled for this configuration."""
         return self.lora_config is not None

     @property
     def can_generate(self) -> bool:
         return "decode" in self.phases

-    @property
-    def nbits_per_param(self) -> int:
-        if self.quantization:
-            return self.quantization.nbits_per_param
-        return 16
-

 class RBLNDecoderOnlyModelForCausalLMConfig(RBLNDecoderOnlyModelConfig):
     """
```
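The property deleted above encodes a simple weight-width rule. A plain-function restatement, where the `quantization` argument stands in for `rbln_config.quantization`:

```python
def nbits_per_param(quantization) -> int:
    # Quantized configs report their own bit width; otherwise 16-bit weights are assumed.
    if quantization:
        return quantization.nbits_per_param
    return 16

assert nbits_per_param(None) == 16
```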
```diff
--- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py (0.9.3)
+++ optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py (0.9.3rc0)
@@ -46,12 +46,6 @@ class RBLNPageTableManager:
         """
         If the block is empty (empty_block), allocates a block from the free_block_pool.
         """
-        if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
-            raise IndexError(
-                f"Invalid index(batch_idx={batch_idx}, block_idx={block_idx}): \n \
-                BlockTable Shape(batch_axis, block_axis): {self.block_tables.shape}, BlockSize: {self.rbln_config.kvcache_block_size}"
-            )
-
         if self.block_tables[batch_idx][block_idx] == self.EMPTY_BLOCK:
             if self.free_block_pool:
                 block = self.free_block_pool.popleft()
@@ -102,6 +96,8 @@ class RBLNPageTableManager:
         s, e = cache_position[0][0].item(), cache_position[0][-1].item()
         for position in range(s, e + 1, self.rbln_config.kvcache_block_size):
             block_idx = position // self.rbln_config.kvcache_block_size
+            if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
+                raise IndexError(f"Invalid index: batch_idx={batch_idx}, block_idx={block_idx}")
             self.update_block(batch_idx, block_idx)

         return self.replace_empty_block(self.block_tables[batch_idx])
```
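The second hunk maps each cache position to a KV-cache block index by integer division with `kvcache_block_size` before bounds-checking and updating the table. The arithmetic in isolation, with illustrative sizes:

```python
kvcache_block_size = 128           # illustrative block size
s, e = 0, 300                      # start/end cache positions for one request
block_indices = [pos // kvcache_block_size for pos in range(s, e + 1, kvcache_block_size)]
assert block_indices == [0, 1, 2]  # positions 0, 128, 256 land in blocks 0, 1, 2
```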
```diff
--- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py (0.9.3)
+++ optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py (0.9.3rc0)
@@ -12,12 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import TYPE_CHECKING, Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional

 import torch
-from transformers import GenerationConfig
 from transformers.generation.utils import GenerationMixin
-from transformers.modeling_outputs import ModelOutput


 if TYPE_CHECKING:
@@ -93,26 +91,20 @@ class RBLNDecoderOnlyGenerationMixin(GenerationMixin):
         self,
         input_ids: torch.LongTensor,
         attention_mask: Optional[torch.LongTensor] = None,
-
+        max_length: Optional[int] = None,
         **kwargs,
-    )
+    ):
         """
         The generate function is utilized in its standard form as in the HuggingFace transformers library. User can use this function to generate text from the model.
-        Check the [HuggingFace transformers documentation](https://huggingface.co/docs/transformers/v4.57.1/en/main_classes/text_generation#transformers.GenerationMixin.generate) for more details.

         Args:
-            input_ids
-            attention_mask
-
-
-            Please note that unspecified parameters will inherit [GenerationConfig](https://huggingface.co/docs/transformers/v4.57.1/en/main_classes/text_generation#transformers.GenerationConfig)’s default values.
-            kwargs (dict[str, Any], optional): Additional arguments passed to the generate function. See the HuggingFace transformers documentation for more details.
-
-        Returns:
-            A ModelOutput (if return_dict_in_generate=True or when config.return_dict_in_generate=True) or a torch.LongTensor.
+            input_ids: The input ids to the model.
+            attention_mask: The attention mask to the model.
+            max_length: The maximum length of the generated text.
+            kwargs: Additional arguments passed to the generate function. See the HuggingFace transformers documentation for more details.
         """
-        if
-        kwargs["
+        if max_length is not None:
+            kwargs["max_length"] = max_length
         if attention_mask is not None:
             kwargs["attention_mask"] = attention_mask

```
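The rc0 body above only folds the explicit `max_length` and `attention_mask` arguments back into `kwargs` before delegating to `GenerationMixin.generate`. That forwarding step, restated as a standalone helper:

```python
def fold_generate_kwargs(attention_mask=None, max_length=None, **kwargs):
    # Same guards as the hunk: only forward arguments that were actually provided.
    if max_length is not None:
        kwargs["max_length"] = max_length
    if attention_mask is not None:
        kwargs["attention_mask"] = attention_mask
    return kwargs

assert fold_generate_kwargs(max_length=64) == {"max_length": 64}
```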
```diff
--- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py (0.9.3)
+++ optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py (0.9.3rc0)
@@ -216,7 +216,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
         return self.rbln_config.kvcache_num_blocks

     @classmethod
-    def
+    def wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: "RBLNDecoderOnlyModelConfig"):
         return cls._decoder_wrapper_cls(model, rbln_config, cls._use_rotary_emb).eval()

     @classmethod
@@ -272,7 +272,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
     @classmethod
     @torch.inference_mode()
     def get_compiled_model(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
-        wrapped_model = cls.
+        wrapped_model = cls.wrap_model_if_needed(model, rbln_config)
         prefill_compile_config = rbln_config.compile_cfgs[0]

         # Here we use meta tensor, for the memory efficiency.
@@ -466,8 +466,13 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):

         # Update kvcache_num_blocks based on the attention implementation.
         if rbln_config.attn_impl == "flash_attn":
-            estimated_max_num_blocks = cls.
-
+            estimated_max_num_blocks = cls.get_maximum_num_blocks(
+                config=model_config,
+                tensor_parallel_size=rbln_config.tensor_parallel_size or 1,
+                kvcache_block_size=rbln_config.kvcache_block_size,
+                nbits_per_param=16 if not rbln_config.quantization else 4,  # TODO(jongho): FIX Ad-hoc
+                n_model_params=sum(p.numel() for p in model.parameters()),
+                num_runtimes=1 if not rbln_config.can_generate else 1 + len(rbln_config.decoder_batch_sizes),
             )

             if rbln_config.kvcache_num_blocks is None:
@@ -506,6 +511,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
                 f" than the required number of blocks ({num_full_blocks})."
                 "This can cause a failure during model compilation."
             )
+
         logger.info(f"[KVCache] Compiling with num_blocks: {rbln_config.kvcache_num_blocks}")

         return rbln_config
@@ -602,21 +608,11 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
         input_ids: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.LongTensor] = None,
+        position_embed: Optional[torch.Tensor] = None,
         **kwargs,
-    ) ->
-        """
-        Args:
-            input_ids (torch.LongTensor, optional): The input IDs to the model.
-            inputs_embeds (torch.Tensor, optional): The input embeddings to the model.
-            attention_mask (torch.LongTensor, optional): The attention mask to the model.
-            kwargs (dict[str, Any], optional): Additional keyword arguments.
-
-        Returns:
-            Dataclass containing the last hidden states of the model.
-        """
+    ) -> Tuple[torch.FloatTensor]:
         inputs = inputs_embeds if inputs_embeds is not None else input_ids
         batch_size = inputs.shape[0]
-        position_embed = kwargs.get("position_embed", None)

         if batch_size != self.rbln_config.batch_size:
             raise ValueError(
@@ -639,7 +635,6 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
             all_last_hidden_states.append(last_hidden_states)

         last_hidden_states = torch.concat(all_last_hidden_states, dim=0)
-
         return BaseModelOutputWithPast(last_hidden_state=last_hidden_states)


@@ -764,16 +759,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGenerationMixin):
         logits = []
         inputs = inputs_embeds if inputs_embeds is not None else input_ids
         batch_size = inputs.shape[0]
-        input_len = inputs.shape[1]
-        if batch_size > self.rbln_config.batch_size:
-            raise ValueError(
-                f"Input's batch({batch_size}) exceeds compiled batch_size({self.rbln_config.batch_size})"
-            )
-        if input_len > self.rbln_config.max_seq_len:
-            raise ValueError(
-                f"Input's length({input_len}) exceeds compiled max_seq_len({self.rbln_config.max_seq_len})."
-            )
-
         for b_idx in range(batch_size):
             cache_position = torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0)
             output = self.prefill_decoder(
@@ -798,15 +783,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGenerationMixin):
                 f"Available batch sizes are: {list(self.decoders.keys())}. "
                 f"Please run your model with one of these batch sizes or add support for batch size {batch_size}."
             )
-        if max(cache_position.reshape(-1)) >= self.rbln_config.max_seq_len:
-            raise ValueError(
-                f"Cache position exceeds the maximum sequence length.\n"
-                f"  - Current max cache position: {int(torch.max(cache_position).item())}\n"
-                f"  - Allowed max_seq_len: {self.rbln_config.max_seq_len}\n"
-                f"Solution: Reduce the generation length by adjusting `max_new_tokens` "
-                f"or `max_length` in the generation config."
-            )
-
         logits = self.decoders[batch_size](
             input_ids=input_ids,
             inputs_embeds=inputs_embeds,
```
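In the `get_maximum_num_blocks` call added above, two of the arguments are plain scalars derived from the model and the quantization setting. A small sketch of how they are obtained, using a toy `torch.nn.Linear` in place of the real language model:

```python
import torch

toy = torch.nn.Linear(4, 4)
n_model_params = sum(p.numel() for p in toy.parameters())  # 4*4 weights + 4 biases = 20
nbits_per_param = 16  # per the hunk: 16 when no quantization config is set, 4 when quantized
print(n_model_params, nbits_per_param)
```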
```diff
--- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py (0.9.3)
+++ optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py (0.9.3rc0)
@@ -13,11 +13,6 @@
 # limitations under the License.


-from typing import Tuple, Union
-
-import torch
-from transformers.modeling_outputs import DepthEstimatorOutput
-
 from ...modeling_generic import RBLNModelForDepthEstimation


@@ -28,15 +23,3 @@ class RBLNDepthAnythingForDepthEstimation(RBLNModelForDepthEstimation):
     This class provides hardware-accelerated inference for Depth Anything V2
     models on RBLN devices, providing the most capable monocular depth estimation (MDE) model.
     """
-
-    def forward(self, pixel_values: torch.Tensor, **kwargs) -> Union[Tuple, DepthEstimatorOutput]:
-        """
-        Forward pass for the RBLN-optimized DepthAnythingForDepthEstimation model.
-
-        Args:
-            pixel_values (torch.FloatTensor of shape (batch_size, num_channels, height, width)): The tensors corresponding to the input images.
-
-        Returns:
-            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a DepthEstimatorOutput object.
-        """
-        return super().forward(pixel_values, **kwargs)
```