optimum-rbln 0.8.2a1__py3-none-any.whl → 0.8.2a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of optimum-rbln might be problematic.

Files changed (34)
  1. optimum/rbln/__init__.py +8 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +16 -1
  4. optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +3 -0
  5. optimum/rbln/diffusers/modeling_diffusers.py +3 -4
  6. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +1 -0
  7. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1 -0
  8. optimum/rbln/diffusers/models/autoencoders/vq_model.py +1 -0
  9. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +1 -1
  10. optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +10 -2
  11. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +4 -30
  12. optimum/rbln/modeling.py +2 -3
  13. optimum/rbln/modeling_base.py +17 -13
  14. optimum/rbln/transformers/__init__.py +8 -0
  15. optimum/rbln/transformers/models/__init__.py +2 -0
  16. optimum/rbln/transformers/models/clip/configuration_clip.py +12 -1
  17. optimum/rbln/transformers/models/clip/modeling_clip.py +123 -28
  18. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +13 -1
  19. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +2 -3
  20. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +107 -249
  21. optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +18 -1
  22. optimum/rbln/transformers/models/qwen3/__init__.py +16 -0
  23. optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +71 -0
  24. optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +377 -0
  25. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +275 -0
  26. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +2 -0
  27. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +2 -0
  28. optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -0
  29. optimum/rbln/utils/hub.py +8 -47
  30. optimum/rbln/utils/runtime_utils.py +28 -2
  31. {optimum_rbln-0.8.2a1.dist-info → optimum_rbln-0.8.2a3.dist-info}/METADATA +1 -1
  32. {optimum_rbln-0.8.2a1.dist-info → optimum_rbln-0.8.2a3.dist-info}/RECORD +34 -30
  33. {optimum_rbln-0.8.2a1.dist-info → optimum_rbln-0.8.2a3.dist-info}/WHEEL +0 -0
  34. {optimum_rbln-0.8.2a1.dist-info → optimum_rbln-0.8.2a3.dist-info}/licenses/LICENSE +0 -0
@@ -16,6 +16,7 @@ from typing import TYPE_CHECKING, Optional, Tuple, Union
 
 import torch
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPVisionConfig, CLIPVisionModel
+from transformers.modeling_outputs import BaseModelOutputWithPooling
 from transformers.models.clip.modeling_clip import CLIPTextModelOutput, CLIPVisionModelOutput
 
 from ....configuration_utils import RBLNCompileConfig
@@ -111,12 +112,27 @@ class RBLNCLIPTextModelWithProjection(RBLNCLIPTextModel):
 
 
 class _VisionEncoder(torch.nn.Module):
-    def __init__(self, enc: CLIPVisionModel):
+    def __init__(
+        self,
+        enc: CLIPVisionModel,
+        interpolate_pos_encoding: bool,
+        output_hidden_states: bool,
+        output_attentions: bool,
+    ):
         super().__init__()
         self.enc = enc
+        self.interpolate_pos_encoding = interpolate_pos_encoding
+        self.output_hidden_states = output_hidden_states
+        self.output_attentions = output_attentions
 
     def forward(self, inp):
-        enc_out = self.enc(inp, output_hidden_states=True, return_dict=False)
+        enc_out = self.enc(
+            inp,
+            output_hidden_states=self.output_hidden_states,
+            interpolate_pos_encoding=self.interpolate_pos_encoding,
+            output_attentions=self.output_attentions,
+            return_dict=False,
+        )
         return enc_out
 
 
@@ -130,7 +146,12 @@ class RBLNCLIPVisionModel(RBLNModel):
 
     @classmethod
     def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPVisionModelConfig) -> torch.nn.Module:
-        return _VisionEncoder(model).eval()
+        wrapper_cfg = {
+            "interpolate_pos_encoding": rbln_config.interpolate_pos_encoding,
+            "output_hidden_states": rbln_config.output_hidden_states,
+            "output_attentions": rbln_config.output_attentions,
+        }
+        return _VisionEncoder(model, **wrapper_cfg).eval()
 
     @classmethod
     def update_rbln_config_using_pipe(
@@ -155,6 +176,12 @@ class RBLNCLIPVisionModel(RBLNModel):
         if rbln_config.image_size is None:
             raise ValueError("`rbln_image_size` should be specified!")
 
+        if rbln_config.output_attentions is None:
+            rbln_config.output_attentions = getattr(model_config, "output_attentions", False)
+
+        if rbln_config.output_hidden_states is None:
+            rbln_config.output_hidden_states = getattr(model_config, "output_hidden_states", False)
+
         rbln_compile_config = RBLNCompileConfig(
             input_info=[
                 (
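The output_attentions / output_hidden_states / interpolate_pos_encoding flags are now baked into the compiled graph, so they have to be decided at export time. A minimal export sketch, assuming the usual optimum-rbln from_pretrained(export=True, rbln_config=...) entry point; the checkpoint id and flag values are placeholders, not recommendations:

from optimum.rbln import RBLNCLIPVisionModel

# Hypothetical export: compile a CLIP vision tower with hidden states enabled.
# The rbln_config keys mirror the RBLNCLIPVisionModelConfig fields in this diff;
# if output_attentions / output_hidden_states are omitted they now default to
# the values found on the Hugging Face model config.
model = RBLNCLIPVisionModel.from_pretrained(
    "openai/clip-vit-base-patch32",        # placeholder checkpoint
    export=True,
    rbln_config={
        "image_size": 224,
        "output_hidden_states": True,
        "output_attentions": False,
        "interpolate_pos_encoding": False,
    },
)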
@@ -176,27 +203,76 @@ class RBLNCLIPVisionModel(RBLNModel):
     def forward(
         self,
         pixel_values: Optional[torch.FloatTensor] = None,
-        return_dict: bool = None,
+        return_dict: bool = True,
+        output_attentions: bool = None,
+        output_hidden_states: bool = None,
+        interpolate_pos_encoding: bool = False,
         **kwargs,
-    ) -> Union[Tuple, CLIPVisionModelOutput]:
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
         if len(kwargs) > 0 and any(value is not None for value in kwargs.values()):
             logger.warning(
                 f"Currently, optimum-rbln does not support kwargs {kwargs.keys()} for {self.__class__.__name__}."
             )
+
+        output_attentions = output_attentions if output_attentions is not None else self.rbln_config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.rbln_config.output_hidden_states
+        )
+
+        if output_attentions != self.rbln_config.output_attentions:
+            raise ValueError(
+                f"Variable output_attentions {output_attentions} is not equal to rbln_config.output_attentions {self.rbln_config.output_attentions} "
+                f"Please compile again with the correct argument."
+            )
+
+        if output_hidden_states != self.rbln_config.output_hidden_states:
+            raise ValueError(
+                f"Variable output_hidden_states {output_hidden_states} is not equal to rbln_config.output_hidden_states {self.rbln_config.output_hidden_states} "
+                f"Please compile again with the correct argument."
+            )
+
+        if interpolate_pos_encoding != self.rbln_config.interpolate_pos_encoding:
+            raise ValueError(
+                f"Variable interpolate_pos_encoding {interpolate_pos_encoding} is not equal to rbln_config.interpolate_pos_encoding {self.rbln_config.interpolate_pos_encoding} "
+                f"Please compile again with the correct argument."
+            )
+
         output = super().forward(pixel_values, return_dict=return_dict)
         return output
 
     def _prepare_output(self, output, return_dict):
         # Prepare model output based on return_dict flag.
         # This method can be overridden by subclasses to provide task-specific output handling.
+        last_hidden_state = output.pop(0)
+        pooler_output = output.pop(0)
+        vision_config = self.config.vision_config if hasattr(self.config, "vision_config") else self.config
+
+        if self.rbln_config.output_hidden_states:
+            hidden_states = ()
+            num_hidden_layers = vision_config.num_hidden_layers
+            for _ in range(num_hidden_layers + 1):
+                hidden_states += (output.pop(0),)
+        else:
+            hidden_states = None
+
+        if self.rbln_config.output_attentions:
+            attentions = ()
+            num_hidden_layers = vision_config.num_hidden_layers
+            for _ in range(num_hidden_layers):
+                attentions += (output.pop(0),)
+        else:
+            attentions = None
 
         if not return_dict:
-            return (output,) if not isinstance(output, (tuple, list)) else output
+            return tuple(
+                item for item in (last_hidden_state, pooler_output, hidden_states, attentions) if item is not None
+            )
         else:
-            return CLIPVisionModelOutput(
-                image_embeds=output[0],
-                last_hidden_state=output[1],
-                hidden_states=output[2:],
+            return BaseModelOutputWithPooling(
+                last_hidden_state=last_hidden_state,
+                pooler_output=pooler_output,
+                hidden_states=hidden_states,
+                attentions=attentions,
             )
 
 
@@ -208,21 +284,40 @@ class RBLNCLIPVisionModelWithProjection(RBLNCLIPVisionModel):
     multimodal embedding alignment tasks.
     """
 
-    def forward(
-        self,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        **kwargs,
-    ) -> Union[Tuple, CLIPVisionModelOutput]:
-        if len(kwargs) > 0 and any(kwargs.values()):
-            logger.warning(f"Currently, optimum-rbln does not support kwargs {kwargs.keys()} for {self.__class__}.")
-
-        output = super().forward(pixel_values)
-        image_embeds = output[0]
-        last_hidden_state = output[1]
-        hidden_states = output[2:]
-
-        return CLIPVisionModelOutput(
-            image_embeds=image_embeds,
-            last_hidden_state=last_hidden_state,
-            hidden_states=hidden_states,
-        )
+    def _prepare_output(self, output, return_dict):
+        # Prepare model output based on return_dict flag.
+        # This method can be overridden by subclasses to provide task-specific output handling.
+
+        image_embeds = output.pop(0) if isinstance(output, (tuple, list)) else output
+        last_hidden_state = output.pop(0)
+
+        vision_config = self.config.vision_config if hasattr(self.config, "vision_config") else self.config
+
+        if self.rbln_config.output_hidden_states:
+            hidden_states = ()
+            num_hidden_layers = vision_config.num_hidden_layers
+            for _ in range(num_hidden_layers + 1):
+                hidden_states += (output.pop(0),)
+        else:
+            hidden_states = None
+
+        if self.rbln_config.output_attentions:
+            attentions = ()
+            num_hidden_layers = vision_config.num_hidden_layers
+            for _ in range(num_hidden_layers):
+                attentions += (output.pop(0),)
+        else:
+            attentions = None
+
+        if not return_dict:
+            return tuple(
+                item for item in (image_embeds, last_hidden_state, hidden_states, attentions) if item is not None
+            )
+
+        else:
+            return CLIPVisionModelOutput(
+                image_embeds=image_embeds,
+                last_hidden_state=last_hidden_state,
+                hidden_states=hidden_states,
+                attentions=attentions,
+            )
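With the reworked _prepare_output, the base vision model now returns a BaseModelOutputWithPooling (and the projection variant a CLIPVisionModelOutput) whose optional fields are populated only if the matching flag was enabled at compile time; passing a different flag at call time raises the ValueError shown above. An inference sketch under the same assumptions as the export example, with an illustrative input shape:

import torch

pixel_values = torch.randn(1, 3, 224, 224)  # placeholder batch of one image
out = model(pixel_values, return_dict=True, output_hidden_states=True)

out.last_hidden_state   # (1, num_patches + 1, hidden_size)
out.pooler_output       # (1, hidden_size)
len(out.hidden_states)  # num_hidden_layers + 1 entries, per the unpacking above
out.attentions          # None unless output_attentions=True was compiled in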
@@ -352,8 +352,11 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         if position_embed is not None:
             position_embed = torch.nn.functional.pad(position_embed, (0, 0, 0, padding_size))
 
+        if token_type_ids is not None:
+            token_type_ids = torch.nn.functional.pad(token_type_ids, (0, padding_size), value=-1)
+
         # Overwrite position_ids and padded_cache_lengths
-        position_ids = None
+        position_ids = cache_position.clone()
         padded_cache_lengths = 0
 
         return (
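Padding token_type_ids with -1 keeps the padded tail distinguishable from real token-type values (0 for text, 1 for image in Gemma3-style inputs), presumably so the prefill logic can ignore it when locating image spans. A standalone sketch of what that single pad call does, with illustrative shapes:

import torch

# A 5-token prompt padded out to a prefill chunk of 8.
token_type_ids = torch.tensor([[0, 0, 1, 1, 0]])
padding_size = 3
padded = torch.nn.functional.pad(token_type_ids, (0, padding_size), value=-1)
print(padded)  # tensor([[ 0,  0,  1,  1,  0, -1, -1, -1]])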
@@ -365,6 +368,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
             position_embed,
             padded_cache_lengths,
             query_length,
+            token_type_ids,
         )
 
     def prefill_forward(
@@ -393,6 +397,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
             position_embed,
             padded_cache_lengths,
             query_length,
+            token_type_ids,
         ) = self._prepare_prefill_inputs(
             inputs, cache_position, attention_mask, position_embed, token_type_ids=token_type_ids
         )
@@ -1085,6 +1090,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                 tensor_type="pt",
                 device=rbln_config.device_map["prefill"],
                 activate_profiler=rbln_config.activate_profiler,
+                timeout=rbln_config.timeout,
             ),
             *[
                 rebel.Runtime(
@@ -1092,6 +1098,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                     tensor_type="pt",
                     device=rbln_config.device_map[f"decoder_batch_{batch_size}"],
                     activate_profiler=rbln_config.activate_profiler,
+                    timeout=rbln_config.timeout,
                 )
                 for i, batch_size in enumerate(rbln_config.decoder_batch_sizes)
             ],
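Both the prefill runtime and every decoder runtime now receive rbln_config.timeout when they are constructed. A hedged loading sketch; the auto class, path, and value are illustrative, and the only point is that the timeout travels through rbln_config into each rebel.Runtime:

from optimum.rbln import RBLNAutoModelForCausalLM  # assumed auto class

model = RBLNAutoModelForCausalLM.from_pretrained(
    "path/to/compiled-model",      # placeholder: an already-compiled artifact
    export=False,
    rbln_config={"timeout": 120},  # forwarded to rebel.Runtime per this diff
)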
@@ -1190,6 +1197,11 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         if cache_position is None:
             logits = []
             inputs = inputs_embeds if inputs_embeds is not None else input_ids
+            # forward-only path (no generate loop)
+            if generate_idx is None:
+                generate_idx = attention_mask.sum(dim=-1, keepdim=True).int()
+            if padded_cache_lengths is None:
+                padded_cache_lengths = torch.zeros_like(generate_idx)
             batch_size = inputs.shape[0]
             for b_idx in range(batch_size):
                 cache_position = torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0)
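When the model is called through a plain forward pass rather than generate(), generate_idx and padded_cache_lengths may be None; the new code derives them from the attention mask. The derivation in isolation (illustrative tensors only):

import torch

# Two sequences of true lengths 3 and 5 in a right-padded batch.
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])
generate_idx = attention_mask.sum(dim=-1, keepdim=True).int()  # tensor([[3], [5]])
padded_cache_lengths = torch.zeros_like(generate_idx)          # tensor([[0], [0]])
# Each row's prefill cache positions then run from 0 to generate_idx[b] - 1.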
@@ -23,22 +23,21 @@ from ..siglip.configuration_siglip import RBLNSiglipVisionModelConfig
 class RBLNGemma3ForCausalLMConfig(RBLNDecoderOnlyModelForCausalLMConfig):
     def __init__(
         self,
-        prefill_chunk_size: Optional[int] = None,
         use_position_ids: Optional[bool] = None,
         use_attention_mask: Optional[bool] = None,
+        image_prefill_chunk_size: Optional[int] = None,
         **kwargs: Dict[str, Any],
     ):
         # use_attention_mask and use_position_ids are always True for Gemma3
         use_attention_mask = use_attention_mask or True
         use_position_ids = use_position_ids or True
-        prefill_chunk_size = prefill_chunk_size or 256
 
         super().__init__(
-            prefill_chunk_size=prefill_chunk_size,
             use_attention_mask=use_attention_mask,
             use_position_ids=use_position_ids,
             **kwargs,
         )
+        self.image_prefill_chunk_size = image_prefill_chunk_size
 
         npu = self.npu or rebel.get_npu_name()
         if npu == "RBLN-CA02":
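The Gemma3 config no longer forces prefill_chunk_size to 256; instead it stores an optional image_prefill_chunk_size alongside the existing flags. A construction sketch, assuming the config class is importable from optimum.rbln like the other RBLN*Config classes; the values are illustrative:

from optimum.rbln import RBLNGemma3ForCausalLMConfig  # assumed import path

config = RBLNGemma3ForCausalLMConfig(
    image_prefill_chunk_size=256,  # illustrative; simply stored on the config
    prefill_chunk_size=128,        # now passed through **kwargs to the parent
)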