optimum-rbln 0.8.2a1__py3-none-any.whl → 0.8.2a3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry and is provided for informational purposes only.
- optimum/rbln/__init__.py +8 -0
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +16 -1
- optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +3 -0
- optimum/rbln/diffusers/modeling_diffusers.py +3 -4
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +1 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1 -0
- optimum/rbln/diffusers/models/autoencoders/vq_model.py +1 -0
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +1 -1
- optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +10 -2
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +4 -30
- optimum/rbln/modeling.py +2 -3
- optimum/rbln/modeling_base.py +17 -13
- optimum/rbln/transformers/__init__.py +8 -0
- optimum/rbln/transformers/models/__init__.py +2 -0
- optimum/rbln/transformers/models/clip/configuration_clip.py +12 -1
- optimum/rbln/transformers/models/clip/modeling_clip.py +123 -28
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +13 -1
- optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +2 -3
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +107 -249
- optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +18 -1
- optimum/rbln/transformers/models/qwen3/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +71 -0
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +377 -0
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +275 -0
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +2 -0
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +2 -0
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -0
- optimum/rbln/utils/hub.py +8 -47
- optimum/rbln/utils/runtime_utils.py +28 -2
- {optimum_rbln-0.8.2a1.dist-info → optimum_rbln-0.8.2a3.dist-info}/METADATA +1 -1
- {optimum_rbln-0.8.2a1.dist-info → optimum_rbln-0.8.2a3.dist-info}/RECORD +34 -30
- {optimum_rbln-0.8.2a1.dist-info → optimum_rbln-0.8.2a3.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.8.2a1.dist-info → optimum_rbln-0.8.2a3.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/clip/modeling_clip.py

@@ -16,6 +16,7 @@ from typing import TYPE_CHECKING, Optional, Tuple, Union
 
 import torch
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPVisionConfig, CLIPVisionModel
+from transformers.modeling_outputs import BaseModelOutputWithPooling
 from transformers.models.clip.modeling_clip import CLIPTextModelOutput, CLIPVisionModelOutput
 
 from ....configuration_utils import RBLNCompileConfig
@@ -111,12 +112,27 @@ class RBLNCLIPTextModelWithProjection(RBLNCLIPTextModel):
 
 
 class _VisionEncoder(torch.nn.Module):
-    def __init__(…
+    def __init__(
+        self,
+        enc: CLIPVisionModel,
+        interpolate_pos_encoding: bool,
+        output_hidden_states: bool,
+        output_attentions: bool,
+    ):
         super().__init__()
         self.enc = enc
+        self.interpolate_pos_encoding = interpolate_pos_encoding
+        self.output_hidden_states = output_hidden_states
+        self.output_attentions = output_attentions
 
     def forward(self, inp):
-        enc_out = self.enc(…
+        enc_out = self.enc(
+            inp,
+            output_hidden_states=self.output_hidden_states,
+            interpolate_pos_encoding=self.interpolate_pos_encoding,
+            output_attentions=self.output_attentions,
+            return_dict=False,
+        )
         return enc_out
 
 
@@ -130,7 +146,12 @@ class RBLNCLIPVisionModel(RBLNModel):
 
     @classmethod
     def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPVisionModelConfig) -> torch.nn.Module:
-        …
+        wrapper_cfg = {
+            "interpolate_pos_encoding": rbln_config.interpolate_pos_encoding,
+            "output_hidden_states": rbln_config.output_hidden_states,
+            "output_attentions": rbln_config.output_attentions,
+        }
+        return _VisionEncoder(model, **wrapper_cfg).eval()
 
     @classmethod
     def update_rbln_config_using_pipe(
@@ -155,6 +176,12 @@ class RBLNCLIPVisionModel(RBLNModel):
         if rbln_config.image_size is None:
             raise ValueError("`rbln_image_size` should be specified!")
 
+        if rbln_config.output_attentions is None:
+            rbln_config.output_attentions = getattr(model_config, "output_attentions", False)
+
+        if rbln_config.output_hidden_states is None:
+            rbln_config.output_hidden_states = getattr(model_config, "output_hidden_states", False)
+
         rbln_compile_config = RBLNCompileConfig(
             input_info=[
                 (
@@ -176,27 +203,76 @@ class RBLNCLIPVisionModel(RBLNModel):
     def forward(
         self,
         pixel_values: Optional[torch.FloatTensor] = None,
-        return_dict: bool = …
+        return_dict: bool = True,
+        output_attentions: bool = None,
+        output_hidden_states: bool = None,
+        interpolate_pos_encoding: bool = False,
         **kwargs,
-    ) -> Union[Tuple, …
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
         if len(kwargs) > 0 and any(value is not None for value in kwargs.values()):
             logger.warning(
                 f"Currently, optimum-rbln does not support kwargs {kwargs.keys()} for {self.__class__.__name__}."
             )
+
+        output_attentions = output_attentions if output_attentions is not None else self.rbln_config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.rbln_config.output_hidden_states
+        )
+
+        if output_attentions != self.rbln_config.output_attentions:
+            raise ValueError(
+                f"Variable output_attentions {output_attentions} is not equal to rbln_config.output_attentions {self.rbln_config.output_attentions} "
+                f"Please compile again with the correct argument."
+            )
+
+        if output_hidden_states != self.rbln_config.output_hidden_states:
+            raise ValueError(
+                f"Variable output_hidden_states {output_hidden_states} is not equal to rbln_config.output_hidden_states {self.rbln_config.output_hidden_states} "
+                f"Please compile again with the correct argument."
+            )
+
+        if interpolate_pos_encoding != self.rbln_config.interpolate_pos_encoding:
+            raise ValueError(
+                f"Variable interpolate_pos_encoding {interpolate_pos_encoding} is not equal to rbln_config.interpolate_pos_encoding {self.rbln_config.interpolate_pos_encoding} "
+                f"Please compile again with the correct argument."
+            )
+
         output = super().forward(pixel_values, return_dict=return_dict)
         return output
 
     def _prepare_output(self, output, return_dict):
         # Prepare model output based on return_dict flag.
         # This method can be overridden by subclasses to provide task-specific output handling.
+        last_hidden_state = output.pop(0)
+        pooler_output = output.pop(0)
+        vision_config = self.config.vision_config if hasattr(self.config, "vision_config") else self.config
+
+        if self.rbln_config.output_hidden_states:
+            hidden_states = ()
+            num_hidden_layers = vision_config.num_hidden_layers
+            for _ in range(num_hidden_layers + 1):
+                hidden_states += (output.pop(0),)
+        else:
+            hidden_states = None
+
+        if self.rbln_config.output_attentions:
+            attentions = ()
+            num_hidden_layers = vision_config.num_hidden_layers
+            for _ in range(num_hidden_layers):
+                attentions += (output.pop(0),)
+        else:
+            attentions = None
 
         if not return_dict:
-            return (…
+            return tuple(
+                item for item in (last_hidden_state, pooler_output, hidden_states, attentions) if item is not None
+            )
         else:
-            return …
-            …
-            …
-                hidden_states=…
+            return BaseModelOutputWithPooling(
+                last_hidden_state=last_hidden_state,
+                pooler_output=pooler_output,
+                hidden_states=hidden_states,
+                attentions=attentions,
             )
 
 
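The new `_prepare_output` assumes the compiled runtime returns a flat sequence of tensors in a fixed order: `last_hidden_state`, `pooler_output`, then `num_hidden_layers + 1` hidden states (embeddings plus one per layer) and `num_hidden_layers` attention maps, with each group present only if the matching flag was compiled in. The toy sketch below is plain PyTorch with made-up shapes, not part of the package; it just replays that unflattening order:

```python
import torch

# Toy stand-in for the flat runtime output of a 2-layer vision tower compiled with
# output_hidden_states=True and output_attentions=True (shapes are illustrative).
num_hidden_layers = 2
flat = [torch.zeros(1, 4, 8), torch.zeros(1, 8)]          # last_hidden_state, pooler_output
flat += [torch.zeros(1, 4, 8)] * (num_hidden_layers + 1)  # hidden states incl. embeddings
flat += [torch.zeros(1, 1, 4, 4)] * num_hidden_layers     # attention maps

last_hidden_state = flat.pop(0)
pooler_output = flat.pop(0)
hidden_states = tuple(flat.pop(0) for _ in range(num_hidden_layers + 1))
attentions = tuple(flat.pop(0) for _ in range(num_hidden_layers))
assert not flat  # every runtime output is consumed exactly once
```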
@@ -208,21 +284,40 @@ class RBLNCLIPVisionModelWithProjection(RBLNCLIPVisionModel):
     multimodal embedding alignment tasks.
     """
 
-    def …
-    …
+    def _prepare_output(self, output, return_dict):
+        # Prepare model output based on return_dict flag.
+        # This method can be overridden by subclasses to provide task-specific output handling.
+
+        image_embeds = output.pop(0) if isinstance(output, (tuple, list)) else output
+        last_hidden_state = output.pop(0)
+
+        vision_config = self.config.vision_config if hasattr(self.config, "vision_config") else self.config
+
+        if self.rbln_config.output_hidden_states:
+            hidden_states = ()
+            num_hidden_layers = vision_config.num_hidden_layers
+            for _ in range(num_hidden_layers + 1):
+                hidden_states += (output.pop(0),)
+        else:
+            hidden_states = None
+
+        if self.rbln_config.output_attentions:
+            attentions = ()
+            num_hidden_layers = vision_config.num_hidden_layers
+            for _ in range(num_hidden_layers):
+                attentions += (output.pop(0),)
+        else:
+            attentions = None
+
+        if not return_dict:
+            return tuple(
+                item for item in (image_embeds, last_hidden_state, hidden_states, attentions) if item is not None
+            )
+
+        else:
+            return CLIPVisionModelOutput(
+                image_embeds=image_embeds,
+                last_hidden_state=last_hidden_state,
+                hidden_states=hidden_states,
+                attentions=attentions,
+            )
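Taken together, these changes make `output_hidden_states`, `output_attentions`, and `interpolate_pos_encoding` compile-time properties of the RBLN CLIP vision models: they are resolved into `RBLNCLIPVisionModelConfig` at export time, and `forward()` now rejects mismatching runtime arguments. A hedged usage sketch follows; the checkpoint name and exact `rbln_*` kwargs are illustrative, assuming the usual optimum-rbln `from_pretrained(..., export=True, rbln_*)` convention rather than anything shown in this diff:

```python
import torch
from optimum.rbln import RBLNCLIPVisionModel

# Compile once with the desired output flags baked in (illustrative kwargs).
model = RBLNCLIPVisionModel.from_pretrained(
    "openai/clip-vit-base-patch32",   # hypothetical checkpoint
    export=True,
    rbln_output_hidden_states=True,
    rbln_output_attentions=False,
)

pixel_values = torch.randn(1, 3, 224, 224)

# Runtime flags must match the compiled config, otherwise the new ValueError fires.
out = model(pixel_values, output_hidden_states=True, return_dict=True)
print(out.last_hidden_state.shape, len(out.hidden_states))
```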
optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py

@@ -352,8 +352,11 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         if position_embed is not None:
             position_embed = torch.nn.functional.pad(position_embed, (0, 0, 0, padding_size))
 
+        if token_type_ids is not None:
+            token_type_ids = torch.nn.functional.pad(token_type_ids, (0, padding_size), value=-1)
+
         # Overwrite position_ids and padded_cache_lengths
-        position_ids = …
+        position_ids = cache_position.clone()
         padded_cache_lengths = 0
 
         return (
@@ -365,6 +368,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
             position_embed,
             padded_cache_lengths,
             query_length,
+            token_type_ids,
         )
 
     def prefill_forward(
@@ -393,6 +397,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
             position_embed,
             padded_cache_lengths,
             query_length,
+            token_type_ids,
         ) = self._prepare_prefill_inputs(
             inputs, cache_position, attention_mask, position_embed, token_type_ids=token_type_ids
         )
@@ -1085,6 +1090,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                 tensor_type="pt",
                 device=rbln_config.device_map["prefill"],
                 activate_profiler=rbln_config.activate_profiler,
+                timeout=rbln_config.timeout,
             ),
             *[
                 rebel.Runtime(
@@ -1092,6 +1098,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                     tensor_type="pt",
                     device=rbln_config.device_map[f"decoder_batch_{batch_size}"],
                     activate_profiler=rbln_config.activate_profiler,
+                    timeout=rbln_config.timeout,
                 )
                 for i, batch_size in enumerate(rbln_config.decoder_batch_sizes)
             ],
@@ -1190,6 +1197,11 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         if cache_position is None:
             logits = []
             inputs = inputs_embeds if inputs_embeds is not None else input_ids
+            # for only use forward
+            if generate_idx is None:
+                generate_idx = attention_mask.sum(dim=-1, keepdim=True).int()
+            if padded_cache_lengths is None:
+                padded_cache_lengths = torch.zeros_like(generate_idx)
             batch_size = inputs.shape[0]
             for b_idx in range(batch_size):
                 cache_position = torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0)
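The `# for only use forward` branch lets `RBLNDecoderOnlyModelForCausalLM` be called through a plain `forward()` (for example for scoring) without the bookkeeping tensors that `generate()` normally threads through: when `generate_idx` is missing it is derived from the attention mask, and `padded_cache_lengths` defaults to zeros. A small torch-only illustration of that derivation (the attention mask values are made up):

```python
import torch

# 1 = real token, 0 = padding; two prompts of length 3 and 5.
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])

# Same fallback the prefill path now applies when generate_idx is not supplied:
generate_idx = attention_mask.sum(dim=-1, keepdim=True).int()
padded_cache_lengths = torch.zeros_like(generate_idx)

print(generate_idx.squeeze(-1).tolist())  # [3, 5] -> per-sample prefill lengths
```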
optimum/rbln/transformers/models/gemma3/configuration_gemma3.py

@@ -23,22 +23,21 @@ from ..siglip.configuration_siglip import RBLNSiglipVisionModelConfig
 class RBLNGemma3ForCausalLMConfig(RBLNDecoderOnlyModelForCausalLMConfig):
     def __init__(
         self,
-        prefill_chunk_size: Optional[int] = None,
         use_position_ids: Optional[bool] = None,
         use_attention_mask: Optional[bool] = None,
+        image_prefill_chunk_size: Optional[int] = None,
         **kwargs: Dict[str, Any],
     ):
         # use_attention_mask and use_position_ids are always True for Gemma3
         use_attention_mask = use_attention_mask or True
         use_position_ids = use_position_ids or True
-        prefill_chunk_size = prefill_chunk_size or 256
 
         super().__init__(
-            prefill_chunk_size=prefill_chunk_size,
             use_attention_mask=use_attention_mask,
             use_position_ids=use_position_ids,
             **kwargs,
         )
+        self.image_prefill_chunk_size = image_prefill_chunk_size
 
         npu = self.npu or rebel.get_npu_name()
         if npu == "RBLN-CA02":
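With this change `RBLNGemma3ForCausalLMConfig` no longer forces `prefill_chunk_size` to 256; instead it records an optional `image_prefill_chunk_size` of its own. A hedged sketch of the resulting behavior (values are illustrative, and constructing the config requires the RBLN SDK since `__init__` queries the NPU name via `rebel`):

```python
from optimum.rbln import RBLNGemma3ForCausalLMConfig

# prefill_chunk_size now falls back to the parent default instead of a forced 256;
# image_prefill_chunk_size is kept as its own attribute.
cfg = RBLNGemma3ForCausalLMConfig(image_prefill_chunk_size=256)

print(cfg.use_attention_mask, cfg.use_position_ids)  # both coerced to True for Gemma3
print(cfg.image_prefill_chunk_size)                  # 256
```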