optimum-rbln 0.9.1__py3-none-any.whl → 0.9.2a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of optimum-rbln might be problematic.

Files changed (36)
  1. optimum/rbln/__version__.py +2 -2
  2. optimum/rbln/configuration_utils.py +54 -7
  3. optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +30 -14
  4. optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +11 -8
  5. optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +23 -13
  6. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +10 -6
  7. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +14 -10
  8. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +14 -7
  9. optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +9 -11
  10. optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +35 -3
  11. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +21 -22
  12. optimum/rbln/transformers/models/clip/modeling_clip.py +4 -0
  13. optimum/rbln/transformers/models/colpali/colpali_architecture.py +2 -2
  14. optimum/rbln/transformers/models/colpali/configuration_colpali.py +17 -1
  15. optimum/rbln/transformers/models/colpali/modeling_colpali.py +72 -79
  16. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +2 -2
  17. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +11 -3
  18. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +58 -43
  19. optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +27 -3
  20. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +22 -15
  21. optimum/rbln/transformers/models/llava/configuration_llava.py +16 -2
  22. optimum/rbln/transformers/models/llava/modeling_llava.py +106 -49
  23. optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +11 -13
  24. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +232 -342
  25. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +6 -11
  26. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +11 -1
  27. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +22 -0
  28. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +11 -1
  29. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +22 -0
  30. optimum/rbln/transformers/models/siglip/modeling_siglip.py +3 -14
  31. optimum/rbln/transformers/utils/rbln_runtime_wrapper.py +79 -0
  32. optimum/rbln/utils/submodule.py +21 -5
  33. {optimum_rbln-0.9.1.dist-info → optimum_rbln-0.9.2a1.dist-info}/METADATA +2 -2
  34. {optimum_rbln-0.9.1.dist-info → optimum_rbln-0.9.2a1.dist-info}/RECORD +36 -35
  35. {optimum_rbln-0.9.1.dist-info → optimum_rbln-0.9.2a1.dist-info}/WHEEL +0 -0
  36. {optimum_rbln-0.9.1.dist-info → optimum_rbln-0.9.2a1.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/llava/modeling_llava.py
@@ -12,18 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import importlib
 import inspect
 from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, Union
 
 import torch
 from transformers import AutoModelForImageTextToText, LlavaForConditionalGeneration, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPooling
+from transformers.modeling_utils import no_init_weights
 from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast
 
 from ....configuration_utils import RBLNCompileConfig, RBLNModelConfig
 from ....modeling import RBLNModel
 from ....utils.logging import get_logger
 from ...modeling_outputs import RBLNDecoderOnlyOutput
+from ...utils.rbln_runtime_wrapper import LoopProcessor
 
 
 logger = get_logger(__name__)
@@ -32,20 +35,32 @@ if TYPE_CHECKING:
     from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PretrainedConfig
 
 
-class LoopVisionTower:
-    def __init__(self, vision_tower: RBLNModel) -> None:
-        self.vision_tower = vision_tower
+class LoopVisionTower(LoopProcessor):
+    def __init__(self, vision_tower):
+        # FIXME: need to know RBLNModel or RuntimeWrapper
+        if hasattr(vision_tower.model, "runtime"):
+            super().__init__(model=vision_tower)
+        else:
+            super().__init__(model=vision_tower.model[0])
 
-    def forward(self, pixel_values, image_sizes: Optional[torch.Tensor] = None, **kwargs):
-        outputs = []
-        for i in range(pixel_values.shape[0]):
-            outputs.append(
-                self.vision_tower(
-                    pixel_values[i : i + 1], image_sizes[i : i + 1] if image_sizes is not None else None, **kwargs
-                )
-            )
+        self.rbln_config = vision_tower.rbln_config
+
+    def _get_batch_size(self, pixel_values, **kwargs):
+        return pixel_values.shape[0]
+
+    def _prepare_inputs_for_iteration(self, index, common_inputs, pixel_values, **kwargs):
+        pixel_values_item = pixel_values[index : index + 1]
+        if "image_sizes" in kwargs and kwargs["image_sizes"] is not None:
+            ret_val = [pixel_values_item, kwargs["image_sizes"][index : index + 1]]
+        else:
+            ret_val = [pixel_values_item]
+
+        out_buffer = [tensor[index : index + 1] for tensor in kwargs["out"]] if "out" in kwargs else None
+        return (ret_val, {"out": out_buffer})
 
-        if hasattr(self.vision_tower.rbln_config, "max_image_size"):
+    def _process_outputs(self, outputs: list, **kwargs) -> "BaseModelOutputWithPooling":
+        # when use another Wrapper
+        if hasattr(self.rbln_config, "max_image_size"):
             last_hidden_states = [output.last_hidden_state for output in outputs]
             last_hidden_states = torch.cat(last_hidden_states, dim=1)
             hidden_states = tuple(
@@ -55,49 +70,37 @@ class LoopVisionTower:
                 )
                 for layer_idx in range(len(outputs[0].hidden_states))
             )
-
         else:
-            last_hidden_states = [output.last_hidden_state for output in outputs]
-            last_hidden_states = torch.cat(last_hidden_states, dim=0)
-            hidden_states = [output.hidden_states for output in outputs]
-            hidden_states = tuple(
-                torch.cat(tuple((hidden_states[n][i] for n in range(pixel_values.shape[0]))), dim=0)
-                for i in range(len(hidden_states[0]))
-            )
+            output = kwargs["out"]
+            last_hidden_states = output[0]
+
+            if not output[2:]:
+                hidden_states = None
+            else:
+                hidden_states = tuple(output[2:])
 
         return BaseModelOutputWithPooling(
             last_hidden_state=last_hidden_states,
+            pooler_output=None,
             hidden_states=hidden_states,
         )
 
-    def __call__(self, *args: Any, **kwds: Any) -> Any:
-        return self.forward(*args, **kwds)
 
-    def __repr__(self) -> str:
-        return repr(self.vision_tower)
+class LoopProjector(LoopProcessor):
+    def __init__(self, multi_modal_projector: "RBLNModel"):
+        super().__init__(model=multi_modal_projector)
 
+    def _get_batch_size(self, image_feature, **kwargs):
+        return image_feature.shape[0]
 
-class LoopProjector:
-    def __init__(self, multi_modal_projector) -> None:
-        self.multi_modal_projector = multi_modal_projector
+    def _prepare_inputs_for_iteration(self, index, common_inputs, image_feature, **kwargs):
+        image_feature_item = image_feature[index : index + 1]
+        out_buffer = [tensor[index : index + 1] for tensor in kwargs["out"]]
+        return ([image_feature_item], {"out": out_buffer})
 
-    def forward(self, *args, **kwargs):
-        # Loop instead of batch
-        image_feature = args[0]
-
-        outputs = []
-        for i in range(image_feature.shape[0]):
-            outputs.append(self.multi_modal_projector(image_feature[i : i + 1]))
-
-        # FIXME:: This can be optimized using out= API of rbln runtime.
-        outputs = torch.cat(outputs, dim=0)
-        return outputs
-
-    def __call__(self, *args: Any, **kwds: Any) -> Any:
-        return self.forward(*args, **kwds)
-
-    def __repr__(self) -> str:
-        return repr(self.multi_modal_projector)
+    def _process_outputs(self, outputs: list, **kwargs):
+        output = kwargs["out"]
+        return output[0]
 
 
 class RBLNLlavaForConditionalGeneration(RBLNModel):
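Both wrappers above now delegate their per-item looping to the new LoopProcessor base class added in optimum/rbln/transformers/utils/rbln_runtime_wrapper.py (+79 lines), which is not shown in this diff. The following is a minimal sketch of the contract those overrides imply (_get_batch_size, _prepare_inputs_for_iteration, _process_outputs); the driver loop and the handling of common_inputs are assumptions, not the actual implementation:

from typing import Any


# Hypothetical sketch of the LoopProcessor contract; the real class in
# rbln_runtime_wrapper.py may structure the loop and common_inputs differently.
class LoopProcessor:
    def __init__(self, model) -> None:
        self.model = model

    def _get_batch_size(self, *args, **kwargs) -> int:
        # Subclasses report how many single-item iterations to run.
        raise NotImplementedError

    def _prepare_inputs_for_iteration(self, index, common_inputs, *args, **kwargs):
        # Subclasses slice the batched inputs (and any pre-allocated `out`
        # buffers) down to one item and return (positional_args, keyword_args).
        raise NotImplementedError

    def _process_outputs(self, outputs: list, **kwargs) -> Any:
        # Subclasses reassemble the per-item results into one batched output.
        raise NotImplementedError

    def forward(self, *args, **kwargs) -> Any:
        outputs = []
        for index in range(self._get_batch_size(*args, **kwargs)):
            iter_args, iter_kwargs = self._prepare_inputs_for_iteration(index, None, *args, **kwargs)
            outputs.append(self.model(*iter_args, **iter_kwargs))
        return self._process_outputs(outputs, **kwargs)

    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        return self.forward(*args, **kwargs)

    def __repr__(self) -> str:
        return repr(self.model)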
@@ -170,6 +173,23 @@ class RBLNLlavaForConditionalGeneration(RBLNModel):
     def can_generate(self):
         return True
 
+    @classmethod
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+
+        with no_init_weights():
+            model_cls_name = model.model.language_model.__class__.__name__
+            causal_model_cls_name = model_cls_name.replace("Model", "ForCausalLM")
+            causal_model_cls = getattr(importlib.import_module("transformers"), causal_model_cls_name)
+            new_language_model = causal_model_cls(model.model.language_model.config)
+
+        new_language_model.lm_head = model.lm_head
+        new_language_model.model = model.model.language_model
+        model.model.language_model = new_language_model
+        model.lm_head = None
+        del model.lm_head
+        return model
+
     def __post_init__(self, **kwargs):
         self.vision_tower = LoopVisionTower(self.rbln_submodules[0])
         self.language_model = self.rbln_submodules[1]
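The rewrapping above relies on the transformers naming convention in which a backbone class named <Arch>Model has a <Arch>ForCausalLM counterpart, so the detached lm_head can be reattached to a proper causal-LM wrapper before compilation. A minimal illustration of the class lookup, using LlamaModel as a hypothetical example:

import importlib

# Hypothetical example: map a backbone class name to its causal-LM counterpart
# and resolve it from the transformers namespace, as done in get_pytorch_model.
model_cls_name = "LlamaModel"
causal_model_cls_name = model_cls_name.replace("Model", "ForCausalLM")  # "LlamaForCausalLM"
causal_model_cls = getattr(importlib.import_module("transformers"), causal_model_cls_name)
print(causal_model_cls.__name__)  # LlamaForCausalLM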
@@ -201,7 +221,7 @@ class RBLNLlavaForConditionalGeneration(RBLNModel):
         # support for pixtral that needs padding
         if hasattr(rbln_config.vision_tower, "max_image_size"):
             num_positions = (
-                rbln_config.vision_tower.batch_size
+                rbln_config.batch_size
                 * (rbln_config.vision_tower.max_image_size[0] // model_config.vision_config.patch_size)
                 * (rbln_config.vision_tower.max_image_size[1] // model_config.vision_config.patch_size)
             )
@@ -217,7 +237,11 @@ class RBLNLlavaForConditionalGeneration(RBLNModel):
         input_info = [
             (
                 "image_features",
-                [rbln_config.batch_size, selected_image_feature_dim, model_config.vision_config.hidden_size],
+                [
+                    1,
+                    selected_image_feature_dim,
+                    model_config.vision_config.hidden_size,
+                ],
                 "float32",
             )
         ]
@@ -290,7 +314,31 @@ class RBLNLlavaForConditionalGeneration(RBLNModel):
             raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
 
         kwargs = {k: v for k, v in kwargs.items() if v is not None}
-        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True, **kwargs)
+
+        # prepare out buffer for pre-allocation
+        if hasattr(self.rbln_config.vision_tower, "max_image_size"):
+            vision_out_size = [
+                pixel_values.shape[0],
+                (self.rbln_config.vision_tower.max_image_size[0] // self.config.vision_config.patch_size)
+                * (self.rbln_config.vision_tower.max_image_size[1] // self.config.vision_config.patch_size),
+                self.config.vision_config.hidden_size,
+            ]
+            pooler_out_size = None
+        else:
+            vision_out_size = [
+                pixel_values.shape[0],
+                (self.config.vision_config.image_size // self.config.vision_config.patch_size) ** 2 + 1,
+                self.config.vision_config.hidden_size,
+            ]
+            pooler_out_size = [pixel_values.shape[0], self.config.vision_config.hidden_size]
+
+        vision_out_buffer = []
+        for i in range(self.config.vision_config.num_hidden_layers + 2):
+            vision_out_buffer.append(torch.empty(size=vision_out_size, dtype=torch.float32, device="cpu"))
+        if pooler_out_size is not None:
+            vision_out_buffer.insert(1, torch.empty(size=pooler_out_size, dtype=torch.float32, device="cpu"))
+
+        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True, out=vision_out_buffer, **kwargs)
 
         if isinstance(vision_feature_layer, int):
             selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
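For context, the buffer count and ordering follow the indexing used in LoopVisionTower._process_outputs (output[0] and output[2:]): num_hidden_layers + 2 full-size tensors cover the last hidden state plus the embeddings output and one hidden state per layer, and an extra pooler_output slot is inserted at index 1 on the non-padded (CLIP-style) path. A rough sketch of the assumed layout, with an illustrative layer count:

# Assumed layout of the pre-allocated `out` buffer list (inferred, not verified):
#   out[0]  -> last_hidden_state
#   out[1]  -> pooler_output (only when pooler_out_size is not None)
#   out[2:] -> hidden_states (embeddings output, then one tensor per encoder layer)
num_hidden_layers = 24                      # illustrative value, e.g. a CLIP ViT-L tower
full_size_buffers = num_hidden_layers + 2   # last_hidden_state + (num_hidden_layers + 1) hidden states
total_buffers = full_size_buffers + 1       # 27 once the pooler_output slot is inserted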
@@ -311,15 +359,24 @@ class RBLNLlavaForConditionalGeneration(RBLNModel):
             )
             num_padding_patches = max_patches - num_real_patches
 
+            projector_out_size = [1, max_patches, self.config.text_config.hidden_size]
+            projector_out_buffer = [torch.empty(size=projector_out_size, dtype=torch.float32, device="cpu")]
+
             padding_tensor = torch.zeros(
                 (selected_image_feature.shape[0], num_padding_patches, selected_image_feature.shape[2]),
                 dtype=selected_image_feature.dtype,
             )
             padded_feature = torch.cat([selected_image_feature, padding_tensor], dim=1)
-            padded_projected_feature = self.multi_modal_projector(padded_feature)
+            padded_projected_feature = self.multi_modal_projector(padded_feature, out=projector_out_buffer)
             image_features = padded_projected_feature[:, :num_real_patches, :]
         else:
-            image_features = self.multi_modal_projector(selected_image_feature)
+            projector_out_size = [
+                pixel_values.shape[0] * pixel_values.shape[1],
+                (self.config.vision_config.image_size // self.config.vision_config.patch_size) ** 2,
+                self.config.text_config.hidden_size,
+            ]
+            projector_out_buffer = [torch.empty(size=projector_out_size, dtype=torch.float32, device="cpu")]
+            image_features = self.multi_modal_projector(selected_image_feature, out=projector_out_buffer)
 
         return image_features
 
optimum/rbln/transformers/models/llava_next/configuration_llava_next.py
@@ -16,7 +16,6 @@ from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 from ....utils.logging import get_logger
-from ...models.clip import RBLNCLIPVisionModelConfig
 
 
 logger = get_logger(__name__)
@@ -55,17 +54,16 @@ class RBLNLlavaNextForConditionalGenerationConfig(RBLNModelConfig):
         if not isinstance(self.batch_size, int) or self.batch_size < 0:
             raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
 
-        self.vision_tower = self.init_submodule_config(
-            RBLNCLIPVisionModelConfig,
-            vision_tower,
-        )
+        if self.batch_size != 1:
+            logger.warning("Ignore batch_size for LlavaNext vision tower. It will be set to 1.")
 
-        if self.vision_tower.output_hidden_states is False:
-            raise ValueError(
-                f"LlavaNext requires output_hidden_states to be True, but found output_hidden_states={self.vision_tower.output_hidden_states}. "
-                f"Please compile again with the correct argument."
-            )
-        else:
-            self.vision_tower.output_hidden_states = True
+        self.vision_tower = self.initialize_submodule_config(
+            submodule_config=vision_tower,
+            batch_size=1,  # vision_tower batch_size is always 1 in LlavaNext
+            output_hidden_states=True,  # LlavaNext requires output_hidden_states to be True
+            force_kwargs=True,
+        )
 
-        self.language_model = language_model
+        self.language_model = self.initialize_submodule_config(
+            submodule_config=language_model,
+        )
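A hedged usage sketch of the reworked config: batch_size now only constrains the language model, while the vision tower is always forced to batch_size=1 with output_hidden_states=True. The submodule keyword values below (max_seq_len, tensor_parallel_size) are illustrative, not taken from this diff:

from optimum.rbln import RBLNLlavaNextForConditionalGenerationConfig

# Illustrative configuration; any batch_size passed for the vision tower is
# ignored and replaced with 1, with output_hidden_states forced to True.
rbln_config = RBLNLlavaNextForConditionalGenerationConfig(
    batch_size=1,
    language_model={"max_seq_len": 8192, "tensor_parallel_size": 4},
)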