optimum-rbln 0.7.4a5__py3-none-any.whl → 0.7.4a7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +16 -0
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/modeling_base.py +22 -3
- optimum/rbln/transformers/__init__.py +16 -0
- optimum/rbln/transformers/models/__init__.py +24 -0
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +67 -41
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +198 -99
- optimum/rbln/transformers/models/idefics3/__init__.py +16 -0
- optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +51 -0
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +459 -0
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +6 -0
- optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +19 -0
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +68 -0
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +608 -0
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +214 -0
- optimum/rbln/utils/runtime_utils.py +33 -2
- optimum/rbln/utils/submodule.py +10 -1
- {optimum_rbln-0.7.4a5.dist-info → optimum_rbln-0.7.4a7.dist-info}/METADATA +1 -1
- {optimum_rbln-0.7.4a5.dist-info → optimum_rbln-0.7.4a7.dist-info}/RECORD +21 -14
- {optimum_rbln-0.7.4a5.dist-info → optimum_rbln-0.7.4a7.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.7.4a5.dist-info → optimum_rbln-0.7.4a7.dist-info}/licenses/LICENSE +0 -0
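The headline change in this release is compiled-inference support for two new multimodal families, Idefics3 and Qwen2.5-VL, plus decoder-only refactoring that lets vision-language wrappers drive the text decoder with inputs_embeds and precomputed rotary embeddings. Before the per-file hunks, a minimal usage sketch for the newly exported classes; the checkpoint ID and the `export=True` compile flag follow optimum conventions but are illustrative assumptions, not part of this diff:

    # Hypothetical sketch (not from the diff); assumes an RBLN SDK environment.
    from optimum.rbln import RBLNQwen2_5_VLForConditionalGeneration

    model = RBLNQwen2_5_VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2.5-VL-3B-Instruct",  # placeholder checkpoint id
        export=True,                    # compile the PyTorch checkpoint for the NPU
    )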
optimum/rbln/__init__.py
CHANGED
@@ -74,6 +74,10 @@ _import_structure = {
     "RBLNGemmaForCausalLMConfig",
     "RBLNGPT2LMHeadModel",
     "RBLNGPT2LMHeadModelConfig",
+    "RBLNIdefics3VisionTransformer",
+    "RBLNIdefics3ForConditionalGeneration",
+    "RBLNIdefics3ForConditionalGenerationConfig",
+    "RBLNIdefics3VisionTransformerConfig",
     "RBLNLlamaForCausalLM",
     "RBLNLlamaForCausalLMConfig",
     "RBLNLlavaNextForConditionalGeneration",
@@ -86,6 +90,10 @@ _import_structure = {
     "RBLNPhiForCausalLMConfig",
     "RBLNQwen2ForCausalLM",
     "RBLNQwen2ForCausalLMConfig",
+    "RBLNQwen2_5_VisionTransformerPretrainedModel",
+    "RBLNQwen2_5_VisionTransformerPretrainedModelConfig",
+    "RBLNQwen2_5_VLForConditionalGeneration",
+    "RBLNQwen2_5_VLForConditionalGenerationConfig",
     "RBLNResNetForImageClassification",
     "RBLNResNetForImageClassificationConfig",
     "RBLNRobertaForMaskedLM",
@@ -277,6 +285,10 @@ if TYPE_CHECKING:
        RBLNGemmaForCausalLMConfig,
        RBLNGPT2LMHeadModel,
        RBLNGPT2LMHeadModelConfig,
+       RBLNIdefics3ForConditionalGeneration,
+       RBLNIdefics3ForConditionalGenerationConfig,
+       RBLNIdefics3VisionTransformer,
+       RBLNIdefics3VisionTransformerConfig,
        RBLNLlamaForCausalLM,
        RBLNLlamaForCausalLMConfig,
        RBLNLlavaNextForConditionalGeneration,
@@ -287,6 +299,10 @@ if TYPE_CHECKING:
        RBLNMistralForCausalLMConfig,
        RBLNPhiForCausalLM,
        RBLNPhiForCausalLMConfig,
+       RBLNQwen2_5_VisionTransformerPretrainedModel,
+       RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
+       RBLNQwen2_5_VLForConditionalGeneration,
+       RBLNQwen2_5_VLForConditionalGenerationConfig,
        RBLNQwen2ForCausalLM,
        RBLNQwen2ForCausalLMConfig,
        RBLNResNetForImageClassification,
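Both registration points touched above follow the transformers-style lazy-module pattern: names listed in `_import_structure` resolve on first attribute access, while the `TYPE_CHECKING` branch exists only for static analyzers. A simplified sketch of that mechanism (the real package delegates to transformers' `_LazyModule` helper; module names here are illustrative):

    # Simplified sketch of PEP 562 lazy exports (not the package's actual code).
    import importlib
    from typing import TYPE_CHECKING

    _import_structure = {".transformers": ["RBLNIdefics3ForConditionalGeneration"]}

    if TYPE_CHECKING:
        from .transformers import RBLNIdefics3ForConditionalGeneration  # static analysis only
    else:
        def __getattr__(name):
            # Resolve the symbol from its submodule the first time it is accessed.
            for module, names in _import_structure.items():
                if name in names:
                    return getattr(importlib.import_module(module, __name__), name)
            raise AttributeError(name)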
optimum/rbln/__version__.py
CHANGED
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE

-__version__ = version = '0.7.4a5'
-__version_tuple__ = version_tuple = (0, 7, 4, 'a5')
+__version__ = version = '0.7.4a7'
+__version_tuple__ = version_tuple = (0, 7, 4, 'a7')
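The prerelease tag survives into `version_tuple`, so downstream code can gate on it with plain tuple ordering. An illustrative check (not part of the diff):

    # Illustrative version gate; the import path is the file shown above.
    from optimum.rbln.__version__ import __version__, version_tuple

    assert __version__ == "0.7.4a7"
    if version_tuple >= (0, 7, 4):   # (0, 7, 4, 'a7') compares greater than (0, 7, 4)
        pass  # the Idefics3 / Qwen2.5-VL classes added in this release are available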
optimum/rbln/modeling_base.py
CHANGED
@@ -314,10 +314,15 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
             )

         except rebel.core.exception.RBLNRuntimeError as e:
-
-                f"
+            error_msg = (
+                f"\nFailed to create RBLN runtime: {str(e)}\n\n"
+                f"If you only need to compile the model without loading it to NPU, you can use:\n"
+                f"  from_pretrained(..., rbln_create_runtimes=False) or\n"
+                f"  from_pretrained(..., rbln_config={{..., 'create_runtimes': False}})\n\n"
+                f"To check your NPU status, run the 'rbln-stat' command in your terminal.\n"
+                f"Make sure your NPU is properly installed and operational."
             )
-
+            raise rebel.core.exception.RBLNRuntimeError(error_msg) from e

         return cls(
             models,
@@ -423,6 +428,20 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
     def to(self, *args, **kwargs):
         return self

+    def parameters(self):
+        """
+        Provides a dummy parameter generator for compatibility.
+
+        This method mimics the interface of torch.nn.Module.parameters()
+        specifically for code that uses `next(model.parameters())` to infer
+        the device or dtype. It yields a single dummy tensor on CPU with float32 dtype.
+
+        Warning:
+            This does NOT yield the actual model parameters used by the RBLN runtime.
+            Code relying on iterating through all model parameters will not work as expected.
+        """
+        yield torch.tensor([1.0], dtype=torch.float32, device=torch.device("cpu"))
+
     def __call__(self, *args, **kwargs):
         return self.forward(*args, **kwargs)
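Two quality-of-life changes here: runtime-creation failures now re-raise with actionable guidance, and the new `parameters()` stub keeps `next(model.parameters())`-style device/dtype probing alive. A sketch of the compile-only path the error message recommends (the checkpoint name is a placeholder):

    # Hypothetical sketch: compile on a host without a reachable NPU.
    from optimum.rbln import RBLNLlamaForCausalLM

    model = RBLNLlamaForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",   # placeholder checkpoint id
        export=True,
        rbln_create_runtimes=False,   # compile only; skip NPU runtime creation
    )

    dummy = next(model.parameters())  # served by the new stub
    print(dummy.device, dummy.dtype)  # cpu torch.float32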
optimum/rbln/transformers/__init__.py
CHANGED
@@ -68,6 +68,10 @@ _import_structure = {
     "RBLNGemmaForCausalLMConfig",
     "RBLNGPT2LMHeadModel",
     "RBLNGPT2LMHeadModelConfig",
+    "RBLNIdefics3VisionTransformer",
+    "RBLNIdefics3ForConditionalGeneration",
+    "RBLNIdefics3ForConditionalGenerationConfig",
+    "RBLNIdefics3VisionTransformerConfig",
     "RBLNLlamaForCausalLM",
     "RBLNLlamaForCausalLMConfig",
     "RBLNLlavaNextForConditionalGeneration",
@@ -80,6 +84,10 @@ _import_structure = {
     "RBLNPhiForCausalLMConfig",
     "RBLNQwen2ForCausalLM",
     "RBLNQwen2ForCausalLMConfig",
+    "RBLNQwen2_5_VisionTransformerPretrainedModel",
+    "RBLNQwen2_5_VisionTransformerPretrainedModelConfig",
+    "RBLNQwen2_5_VLForConditionalGeneration",
+    "RBLNQwen2_5_VLForConditionalGenerationConfig",
     "RBLNT5EncoderModel",
     "RBLNT5EncoderModelConfig",
     "RBLNT5ForConditionalGeneration",
@@ -165,6 +173,10 @@ if TYPE_CHECKING:
        RBLNGemmaForCausalLMConfig,
        RBLNGPT2LMHeadModel,
        RBLNGPT2LMHeadModelConfig,
+       RBLNIdefics3ForConditionalGeneration,
+       RBLNIdefics3ForConditionalGenerationConfig,
+       RBLNIdefics3VisionTransformer,
+       RBLNIdefics3VisionTransformerConfig,
        RBLNLlamaForCausalLM,
        RBLNLlamaForCausalLMConfig,
        RBLNLlavaNextForConditionalGeneration,
@@ -175,6 +187,10 @@ if TYPE_CHECKING:
        RBLNMistralForCausalLMConfig,
        RBLNPhiForCausalLM,
        RBLNPhiForCausalLMConfig,
+       RBLNQwen2_5_VisionTransformerPretrainedModel,
+       RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
+       RBLNQwen2_5_VLForConditionalGeneration,
+       RBLNQwen2_5_VLForConditionalGenerationConfig,
        RBLNQwen2ForCausalLM,
        RBLNQwen2ForCausalLMConfig,
        RBLNT5EncoderModel,
optimum/rbln/transformers/models/__init__.py
CHANGED
@@ -56,6 +56,12 @@ _import_structure = {
         "RBLNCLIPVisionModelWithProjection",
         "RBLNCLIPVisionModelWithProjectionConfig",
     ],
+    "qwen2_5_vl": [
+        "RBLNQwen2_5_VisionTransformerPretrainedModel",
+        "RBLNQwen2_5_VisionTransformerPretrainedModelConfig",
+        "RBLNQwen2_5_VLForConditionalGeneration",
+        "RBLNQwen2_5_VLForConditionalGenerationConfig",
+    ],
     "decoderonly": [
         "RBLNDecoderOnlyModelForCausalLM",
         "RBLNDecoderOnlyModelForCausalLMConfig",
@@ -67,6 +73,12 @@ _import_structure = {
     "exaone": ["RBLNExaoneForCausalLM", "RBLNExaoneForCausalLMConfig"],
     "gemma": ["RBLNGemmaForCausalLM", "RBLNGemmaForCausalLMConfig"],
     "gpt2": ["RBLNGPT2LMHeadModel", "RBLNGPT2LMHeadModelConfig"],
+    "idefics3": [
+        "RBLNIdefics3VisionTransformer",
+        "RBLNIdefics3ForConditionalGeneration",
+        "RBLNIdefics3ForConditionalGenerationConfig",
+        "RBLNIdefics3VisionTransformerConfig",
+    ],
     "llama": ["RBLNLlamaForCausalLM", "RBLNLlamaForCausalLMConfig"],
     "llava_next": ["RBLNLlavaNextForConditionalGeneration", "RBLNLlavaNextForConditionalGenerationConfig"],
     "midm": ["RBLNMidmLMHeadModel", "RBLNMidmLMHeadModelConfig"],
@@ -138,12 +150,24 @@ if TYPE_CHECKING:
     from .exaone import RBLNExaoneForCausalLM, RBLNExaoneForCausalLMConfig
     from .gemma import RBLNGemmaForCausalLM, RBLNGemmaForCausalLMConfig
     from .gpt2 import RBLNGPT2LMHeadModel, RBLNGPT2LMHeadModelConfig
+    from .idefics3 import (
+        RBLNIdefics3ForConditionalGeneration,
+        RBLNIdefics3ForConditionalGenerationConfig,
+        RBLNIdefics3VisionTransformer,
+        RBLNIdefics3VisionTransformerConfig,
+    )
     from .llama import RBLNLlamaForCausalLM, RBLNLlamaForCausalLMConfig
     from .llava_next import RBLNLlavaNextForConditionalGeneration, RBLNLlavaNextForConditionalGenerationConfig
     from .midm import RBLNMidmLMHeadModel, RBLNMidmLMHeadModelConfig
     from .mistral import RBLNMistralForCausalLM, RBLNMistralForCausalLMConfig
     from .phi import RBLNPhiForCausalLM, RBLNPhiForCausalLMConfig
     from .qwen2 import RBLNQwen2ForCausalLM, RBLNQwen2ForCausalLMConfig
+    from .qwen2_5_vl import (
+        RBLNQwen2_5_VisionTransformerPretrainedModel,
+        RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
+        RBLNQwen2_5_VLForConditionalGeneration,
+        RBLNQwen2_5_VLForConditionalGenerationConfig,
+    )
     from .t5 import (
         RBLNT5EncoderModel,
         RBLNT5EncoderModelConfig,
optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py
CHANGED
@@ -13,7 +13,7 @@
 # limitations under the License.

 import math
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Union

 import torch
 from torch import nn
@@ -184,6 +184,7 @@ class DecoderOnlyWrapper(nn.Module):

     def convert_to_rbln_causal_lm(self, causal_lm: PreTrainedModel, max_seq_len: int):
         new_layers = []
+
         for layer in causal_lm.model.layers:
             if self.attn_impl == "eager":
                 new_self_attn = DecoderOnlyAttention(
@@ -201,6 +202,7 @@ class DecoderOnlyWrapper(nn.Module):

             new_layer = DecoderOnlyLayer(layer, new_self_attn)
             new_layers.append(new_layer)
+
         new_model = DecoderOnlyModel(
             causal_lm.model,
             new_layers,
@@ -220,6 +222,53 @@
         self._phase = phase
         self.causal_lm.phase = phase

+    def forward_common(
+        self,
+        input_ids_or_inputs_embeds: torch.Tensor,
+        cache_position: torch.Tensor,
+        attention_mask: torch.Tensor,
+        query_position: torch.Tensor,
+        block_tables: torch.Tensor,
+        rotary_emb: Union[nn.Module, torch.Tensor],
+        *past_key_values: List[torch.Tensor],
+    ):
+        if input_ids_or_inputs_embeds.ndim == 2:
+            input_ids = input_ids_or_inputs_embeds
+            inputs_embeds = None
+        elif input_ids_or_inputs_embeds.ndim == 3:
+            input_ids = None
+            inputs_embeds = input_ids_or_inputs_embeds
+        else:
+            raise NotImplementedError(f"Unknown ndim of input : {input_ids_or_inputs_embeds.ndim}")
+
+        if len(past_key_values) != 2 * self.num_hidden_layers:
+            raise ValueError(
+                f"Different past_key_values to model's config. {len(past_key_values)} != {2 * self.num_hidden_layers}"
+            )
+
+        # [key, value] * n_layer -> ( (key, value) ) * n_layer
+        # cache shape : batch, n_heads, 1, max_seq_len, head_dim
+        _past_key_values = []
+        for i in range(self.config.num_hidden_layers):
+            key_states = past_key_values[i * 2]
+            value_states = past_key_values[i * 2 + 1]
+            past_key_value = [key_states, value_states]
+            _past_key_values.append(past_key_value)
+        past_key_values = _past_key_values
+
+        logit = self.causal_lm(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            query_position=query_position,
+            past_key_values=past_key_values,
+            rotary_emb=rotary_emb,
+            block_tables=block_tables,
+        )
+
+        return logit
+
     def forward(self, *args):
         if self.phase == "decode":
             if self.use_attention_mask:
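The new `forward_common` centralizes logic previously inlined in `forward`: it dispatches on input rank (2-D `input_ids` vs. 3-D `inputs_embeds`), validates that the flat cache argument list holds exactly `2 * num_hidden_layers` tensors, and regroups it into per-layer `[key, value]` pairs. A standalone sketch of that regrouping, with illustrative shapes:

    # Illustrative regrouping of a flat KV-cache argument list (shapes assumed).
    import torch

    num_hidden_layers = 2
    # Flat layout [k0, v0, k1, v1]; cache shape: batch, n_heads, 1, max_seq_len, head_dim
    flat = [torch.zeros(1, 8, 1, 128, 64) for _ in range(2 * num_hidden_layers)]

    assert len(flat) == 2 * num_hidden_layers
    per_layer = [[flat[2 * i], flat[2 * i + 1]] for i in range(num_hidden_layers)]
    assert len(per_layer) == num_hidden_layers and len(per_layer[0]) == 2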
@@ -262,43 +311,16 @@
         else:
             raise ValueError(f"Unknown phase: {self.phase}")

-        if input_ids_or_inputs_embeds.ndim == 2:
-            input_ids = input_ids_or_inputs_embeds
-            inputs_embeds = None
-        elif input_ids_or_inputs_embeds.ndim == 3:
-            input_ids = None
-            inputs_embeds = input_ids_or_inputs_embeds
-        else:
-            raise NotImplementedError(f"Unknown ndim of input : {input_ids_or_inputs_embeds.ndim}")
-
-        if len(past_key_values) != 2 * self.num_hidden_layers:
-            raise ValueError(
-                f"Different past_key_values to model's config. {len(past_key_values)} != {2 * self.num_hidden_layers}"
-            )
-
-        # [key, value] * n_layer -> ( (key, value) ) * n_layer
-        # cache shape : batch, n_heads, 1, max_seq_len, head_dim
-        _past_key_values = []
-        for i in range(self.config.num_hidden_layers):
-            key_states = past_key_values[i * 2]
-            value_states = past_key_values[i * 2 + 1]
-            past_key_value = [key_states, value_states]
-            _past_key_values.append(past_key_value)
-        past_key_values = _past_key_values
-
-        logit = self.causal_lm(
-            input_ids=input_ids,
-            inputs_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            cache_position=cache_position,
-            query_position=query_position,
-            past_key_values=past_key_values,
-            rotary_emb=self.rotary_emb,
-            block_tables=block_tables,
+        return self.forward_common(
+            input_ids_or_inputs_embeds,
+            cache_position,
+            attention_mask,
+            query_position,
+            block_tables,
+            self.rotary_emb,
+            *past_key_values,
         )
-
-        return logit
-

 class DecoderOnlyForCausalLM(nn.Module):
     """A specialized wrapper for Causal Language Models optimized for RBLN compilation.
@@ -322,12 +344,13 @@ class DecoderOnlyForCausalLM(nn.Module):
         _phase: Current processing phase ("prefill" or "decode")
     """

-    def __init__(self, causal_lm: PreTrainedModel, model):
+    def __init__(self, causal_lm: PreTrainedModel, model: nn.Module):
         super().__init__()
         self.config = causal_lm.config
         self._original_mod = causal_lm
         self.model = model
         self._phase = "prefill"
+        self.lm_head = self._original_mod.lm_head

     @property
     def phase(self):
@@ -363,7 +386,7 @@ class DecoderOnlyForCausalLM(nn.Module):
         if self.phase == "prefill":
             hidden_states = hidden_states[:, query_position.to(torch.int).unsqueeze(0)]

-        logits = self._original_mod.lm_head(hidden_states)
+        logits = self.lm_head(hidden_states)
         return logits

@@ -455,8 +478,12 @@ class DecoderOnlyModel(nn.Module):

         # get cos,sin vector if needed
         if rotary_emb is not None:
-            cos, sin = rotary_emb(hidden_states, self.max_seq_len)  # dtype carrier, max_seq_len
-            cos, sin = slice_and_unsqueeze_cos_sin(cos, sin, cache_position)
+            if isinstance(rotary_emb, torch.Tensor):
+                cos = rotary_emb[0]
+                sin = rotary_emb[1]
+            else:
+                cos, sin = rotary_emb(hidden_states, self.max_seq_len)  # dtype carrier, max_seq_len
+                cos, sin = slice_and_unsqueeze_cos_sin(cos, sin, cache_position)
         else:
             batch_size = inputs_embeds.shape[0]
             if cache_position.shape[0] > 1:
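This hunk lets `rotary_emb` arrive either as a module (called with a dtype-carrier tensor and `max_seq_len`, then sliced by `cache_position`) or as a precomputed tensor whose index 0 is cos and index 1 is sin, which is how a vision-language wrapper can hand position embeddings straight to the decoder. A sketch of the tensor convention, with illustrative shapes:

    # Illustrative (cos, sin) packing matching the isinstance(rotary_emb, torch.Tensor) branch.
    import torch

    seq_len, head_dim = 128, 64
    angles = torch.outer(torch.arange(seq_len, dtype=torch.float32), torch.ones(head_dim))
    rotary_tensor = torch.stack([angles.cos(), angles.sin()])  # index 0: cos, index 1: sin

    cos, sin = rotary_tensor[0], rotary_tensor[1]
    assert cos.shape == sin.shape == (seq_len, head_dim)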
@@ -833,7 +860,6 @@ def rotate_half(x):

 def apply_rotary_pos_emb(q, k, cos, sin):
     """Applies Rotary Position Embedding to the query and key tensors."""
-
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
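For reference, the rotation shown above can be sanity-checked numerically; this sketch mirrors `rotate_half`/`apply_rotary_pos_emb` with illustrative shapes:

    # Zero rotation angle (cos=1, sin=0) must leave q unchanged.
    import torch

    def rotate_half(x):
        x1, x2 = x.chunk(2, dim=-1)   # split head_dim in two; rotate (x1, x2) -> (-x2, x1)
        return torch.cat((-x2, x1), dim=-1)

    q = torch.randn(1, 8, 4, 64)      # batch, heads, seq, head_dim
    cos = torch.ones(1, 1, 4, 64)
    sin = torch.zeros(1, 1, 4, 64)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    assert torch.equal(q_embed, q)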