optimum-rbln 0.7.4a5__py3-none-any.whl → 0.7.4a6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +8 -0
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/modeling_base.py +22 -3
- optimum/rbln/transformers/__init__.py +8 -0
- optimum/rbln/transformers/models/__init__.py +12 -0
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +65 -41
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +192 -99
- optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +19 -0
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +68 -0
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +608 -0
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +214 -0
- optimum/rbln/utils/runtime_utils.py +33 -2
- {optimum_rbln-0.7.4a5.dist-info → optimum_rbln-0.7.4a6.dist-info}/METADATA +1 -1
- {optimum_rbln-0.7.4a5.dist-info → optimum_rbln-0.7.4a6.dist-info}/RECORD +16 -12
- {optimum_rbln-0.7.4a5.dist-info → optimum_rbln-0.7.4a6.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.7.4a5.dist-info → optimum_rbln-0.7.4a6.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/__init__.py
CHANGED
@@ -86,6 +86,10 @@ _import_structure = {
|
|
86
86
|
"RBLNPhiForCausalLMConfig",
|
87
87
|
"RBLNQwen2ForCausalLM",
|
88
88
|
"RBLNQwen2ForCausalLMConfig",
|
89
|
+
"RBLNQwen2_5_VisionTransformerPretrainedModel",
|
90
|
+
"RBLNQwen2_5_VisionTransformerPretrainedModelConfig",
|
91
|
+
"RBLNQwen2_5_VLForConditionalGeneration",
|
92
|
+
"RBLNQwen2_5_VLForConditionalGenerationConfig",
|
89
93
|
"RBLNResNetForImageClassification",
|
90
94
|
"RBLNResNetForImageClassificationConfig",
|
91
95
|
"RBLNRobertaForMaskedLM",
|
@@ -287,6 +291,10 @@ if TYPE_CHECKING:
|
|
287
291
|
RBLNMistralForCausalLMConfig,
|
288
292
|
RBLNPhiForCausalLM,
|
289
293
|
RBLNPhiForCausalLMConfig,
|
294
|
+
RBLNQwen2_5_VisionTransformerPretrainedModel,
|
295
|
+
RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
|
296
|
+
RBLNQwen2_5_VLForConditionalGeneration,
|
297
|
+
RBLNQwen2_5_VLForConditionalGenerationConfig,
|
290
298
|
RBLNQwen2ForCausalLM,
|
291
299
|
RBLNQwen2ForCausalLMConfig,
|
292
300
|
RBLNResNetForImageClassification,
|
optimum/rbln/__version__.py
CHANGED
@@ -17,5 +17,5 @@ __version__: str
|
|
17
17
|
__version_tuple__: VERSION_TUPLE
|
18
18
|
version_tuple: VERSION_TUPLE
|
19
19
|
|
20
|
-
__version__ = version = '0.7.4a5'
|
21
|
-
__version_tuple__ = version_tuple = (0, 7, 4)
|
20
|
+
__version__ = version = '0.7.4a6'
|
21
|
+
__version_tuple__ = version_tuple = (0, 7, 4, 'a6')
|
optimum/rbln/modeling_base.py
CHANGED
@@ -314,10 +314,15 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
|
|
314
314
|
)
|
315
315
|
|
316
316
|
except rebel.core.exception.RBLNRuntimeError as e:
|
317
|
-
|
318
|
-
f"
|
317
|
+
error_msg = (
|
318
|
+
f"\nFailed to create RBLN runtime: {str(e)}\n\n"
|
319
|
+
f"If you only need to compile the model without loading it to NPU, you can use:\n"
|
320
|
+
f" from_pretrained(..., rbln_create_runtimes=False) or\n"
|
321
|
+
f" from_pretrained(..., rbln_config={{..., 'create_runtimes': False}})\n\n"
|
322
|
+
f"To check your NPU status, run the 'rbln-stat' command in your terminal.\n"
|
323
|
+
f"Make sure your NPU is properly installed and operational."
|
319
324
|
)
|
320
|
-
|
325
|
+
raise rebel.core.exception.RBLNRuntimeError(error_msg) from e
|
321
326
|
|
322
327
|
return cls(
|
323
328
|
models,
|
@@ -423,6 +428,20 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
|
|
423
428
|
def to(self, *args, **kwargs):
|
424
429
|
return self
|
425
430
|
|
431
|
+
def parameters(self):
|
432
|
+
"""
|
433
|
+
Provides a dummy parameter generator for compatibility.
|
434
|
+
|
435
|
+
This method mimics the interface of torch.nn.Module.parameters()
|
436
|
+
specifically for code that uses `next(model.parameters())` to infer
|
437
|
+
the device or dtype. It yields a single dummy tensor on CPU with float32 dtype.
|
438
|
+
|
439
|
+
Warning:
|
440
|
+
This does NOT yield the actual model parameters used by the RBLN runtime.
|
441
|
+
Code relying on iterating through all model parameters will not work as expected.
|
442
|
+
"""
|
443
|
+
yield torch.tensor([1.0], dtype=torch.float32, device=torch.device("cpu"))
|
444
|
+
|
426
445
|
def __call__(self, *args, **kwargs):
|
427
446
|
return self.forward(*args, **kwargs)
|
428
447
|
|
optimum/rbln/transformers/__init__.py
CHANGED
@@ -80,6 +80,10 @@ _import_structure = {
|
|
80
80
|
"RBLNPhiForCausalLMConfig",
|
81
81
|
"RBLNQwen2ForCausalLM",
|
82
82
|
"RBLNQwen2ForCausalLMConfig",
|
83
|
+
"RBLNQwen2_5_VisionTransformerPretrainedModel",
|
84
|
+
"RBLNQwen2_5_VisionTransformerPretrainedModelConfig",
|
85
|
+
"RBLNQwen2_5_VLForConditionalGeneration",
|
86
|
+
"RBLNQwen2_5_VLForConditionalGenerationConfig",
|
83
87
|
"RBLNT5EncoderModel",
|
84
88
|
"RBLNT5EncoderModelConfig",
|
85
89
|
"RBLNT5ForConditionalGeneration",
|
@@ -175,6 +179,10 @@ if TYPE_CHECKING:
|
|
175
179
|
RBLNMistralForCausalLMConfig,
|
176
180
|
RBLNPhiForCausalLM,
|
177
181
|
RBLNPhiForCausalLMConfig,
|
182
|
+
RBLNQwen2_5_VisionTransformerPretrainedModel,
|
183
|
+
RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
|
184
|
+
RBLNQwen2_5_VLForConditionalGeneration,
|
185
|
+
RBLNQwen2_5_VLForConditionalGenerationConfig,
|
178
186
|
RBLNQwen2ForCausalLM,
|
179
187
|
RBLNQwen2ForCausalLMConfig,
|
180
188
|
RBLNT5EncoderModel,
|
optimum/rbln/transformers/models/__init__.py
CHANGED
@@ -56,6 +56,12 @@ _import_structure = {
|
|
56
56
|
"RBLNCLIPVisionModelWithProjection",
|
57
57
|
"RBLNCLIPVisionModelWithProjectionConfig",
|
58
58
|
],
|
59
|
+
"qwen2_5_vl": [
|
60
|
+
"RBLNQwen2_5_VisionTransformerPretrainedModel",
|
61
|
+
"RBLNQwen2_5_VisionTransformerPretrainedModelConfig",
|
62
|
+
"RBLNQwen2_5_VLForConditionalGeneration",
|
63
|
+
"RBLNQwen2_5_VLForConditionalGenerationConfig",
|
64
|
+
],
|
59
65
|
"decoderonly": [
|
60
66
|
"RBLNDecoderOnlyModelForCausalLM",
|
61
67
|
"RBLNDecoderOnlyModelForCausalLMConfig",
|
@@ -144,6 +150,12 @@ if TYPE_CHECKING:
|
|
144
150
|
from .mistral import RBLNMistralForCausalLM, RBLNMistralForCausalLMConfig
|
145
151
|
from .phi import RBLNPhiForCausalLM, RBLNPhiForCausalLMConfig
|
146
152
|
from .qwen2 import RBLNQwen2ForCausalLM, RBLNQwen2ForCausalLMConfig
|
153
|
+
from .qwen2_5_vl import (
|
154
|
+
RBLNQwen2_5_VisionTransformerPretrainedModel,
|
155
|
+
RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
|
156
|
+
RBLNQwen2_5_VLForConditionalGeneration,
|
157
|
+
RBLNQwen2_5_VLForConditionalGenerationConfig,
|
158
|
+
)
|
147
159
|
from .t5 import (
|
148
160
|
RBLNT5EncoderModel,
|
149
161
|
RBLNT5EncoderModelConfig,
|
optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py
CHANGED
@@ -13,7 +13,7 @@
|
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
15
|
import math
|
16
|
-
from typing import List, Optional, Tuple
|
16
|
+
from typing import List, Optional, Tuple, Union
|
17
17
|
|
18
18
|
import torch
|
19
19
|
from torch import nn
|
@@ -220,6 +220,53 @@ class DecoderOnlyWrapper(nn.Module):
|
|
220
220
|
self._phase = phase
|
221
221
|
self.causal_lm.phase = phase
|
222
222
|
|
223
|
+
def forward_common(
|
224
|
+
self,
|
225
|
+
input_ids_or_inputs_embeds: torch.Tensor,
|
226
|
+
cache_position: torch.Tensor,
|
227
|
+
attention_mask: torch.Tensor,
|
228
|
+
query_position: torch.Tensor,
|
229
|
+
block_tables: torch.Tensor,
|
230
|
+
rotary_emb: Union[nn.Module, torch.Tensor],
|
231
|
+
*past_key_values: List[torch.Tensor],
|
232
|
+
):
|
233
|
+
if input_ids_or_inputs_embeds.ndim == 2:
|
234
|
+
input_ids = input_ids_or_inputs_embeds
|
235
|
+
inputs_embeds = None
|
236
|
+
elif input_ids_or_inputs_embeds.ndim == 3:
|
237
|
+
input_ids = None
|
238
|
+
inputs_embeds = input_ids_or_inputs_embeds
|
239
|
+
else:
|
240
|
+
raise NotImplementedError(f"Unknown ndim of input : {input_ids_or_inputs_embeds.ndim}")
|
241
|
+
|
242
|
+
if len(past_key_values) != 2 * self.num_hidden_layers:
|
243
|
+
raise ValueError(
|
244
|
+
f"Different past_key_values to model's config. {len(past_key_values)} != {2 * self.num_hidden_layers}"
|
245
|
+
)
|
246
|
+
|
247
|
+
# [key, value] * n_layer -> ( (key, value) ) * n_layer
|
248
|
+
# cache shape : batch, n_heads, 1, max_seq_len, head_dim
|
249
|
+
_past_key_values = []
|
250
|
+
for i in range(self.config.num_hidden_layers):
|
251
|
+
key_states = past_key_values[i * 2]
|
252
|
+
value_states = past_key_values[i * 2 + 1]
|
253
|
+
past_key_value = [key_states, value_states]
|
254
|
+
_past_key_values.append(past_key_value)
|
255
|
+
past_key_values = _past_key_values
|
256
|
+
|
257
|
+
logit = self.causal_lm(
|
258
|
+
input_ids=input_ids,
|
259
|
+
inputs_embeds=inputs_embeds,
|
260
|
+
attention_mask=attention_mask,
|
261
|
+
cache_position=cache_position,
|
262
|
+
query_position=query_position,
|
263
|
+
past_key_values=past_key_values,
|
264
|
+
rotary_emb=rotary_emb,
|
265
|
+
block_tables=block_tables,
|
266
|
+
)
|
267
|
+
|
268
|
+
return logit
|
269
|
+
|
223
270
|
def forward(self, *args):
|
224
271
|
if self.phase == "decode":
|
225
272
|
if self.use_attention_mask:
|
@@ -262,43 +309,16 @@ class DecoderOnlyWrapper(nn.Module):
|
|
262
309
|
else:
|
263
310
|
raise ValueError(f"Unknown phase: {self.phase}")
|
264
311
|
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
if len(past_key_values) != 2 * self.num_hidden_layers:
|
275
|
-
raise ValueError(
|
276
|
-
f"Different past_key_values to model's config. {len(past_key_values)} != {2 * self.num_hidden_layers}"
|
277
|
-
)
|
278
|
-
|
279
|
-
# [key, value] * n_layer -> ( (key, value) ) * n_layer
|
280
|
-
# cache shape : batch, n_heads, 1, max_seq_len, head_dim
|
281
|
-
_past_key_values = []
|
282
|
-
for i in range(self.config.num_hidden_layers):
|
283
|
-
key_states = past_key_values[i * 2]
|
284
|
-
value_states = past_key_values[i * 2 + 1]
|
285
|
-
past_key_value = [key_states, value_states]
|
286
|
-
_past_key_values.append(past_key_value)
|
287
|
-
past_key_values = _past_key_values
|
288
|
-
|
289
|
-
logit = self.causal_lm(
|
290
|
-
input_ids=input_ids,
|
291
|
-
inputs_embeds=inputs_embeds,
|
292
|
-
attention_mask=attention_mask,
|
293
|
-
cache_position=cache_position,
|
294
|
-
query_position=query_position,
|
295
|
-
past_key_values=past_key_values,
|
296
|
-
rotary_emb=self.rotary_emb,
|
297
|
-
block_tables=block_tables,
|
312
|
+
return self.forward_common(
|
313
|
+
input_ids_or_inputs_embeds,
|
314
|
+
cache_position,
|
315
|
+
attention_mask,
|
316
|
+
query_position,
|
317
|
+
block_tables,
|
318
|
+
self.rotary_emb,
|
319
|
+
*past_key_values,
|
298
320
|
)
|
299
321
|
|
300
|
-
return logit
|
301
|
-
|
302
322
|
|
303
323
|
class DecoderOnlyForCausalLM(nn.Module):
|
304
324
|
"""A specialized wrapper for Causal Language Models optimized for RBLN compilation.
|
@@ -322,12 +342,13 @@ class DecoderOnlyForCausalLM(nn.Module):
|
|
322
342
|
_phase: Current processing phase ("prefill" or "decode")
|
323
343
|
"""
|
324
344
|
|
325
|
-
def __init__(self, causal_lm: PreTrainedModel, model):
|
345
|
+
def __init__(self, causal_lm: PreTrainedModel, model: nn.Module):
|
326
346
|
super().__init__()
|
327
347
|
self.config = causal_lm.config
|
328
348
|
self._original_mod = causal_lm
|
329
349
|
self.model = model
|
330
350
|
self._phase = "prefill"
|
351
|
+
self.lm_head = self._original_mod.lm_head
|
331
352
|
|
332
353
|
@property
|
333
354
|
def phase(self):
|
@@ -363,7 +384,7 @@ class DecoderOnlyForCausalLM(nn.Module):
|
|
363
384
|
if self.phase == "prefill":
|
364
385
|
hidden_states = hidden_states[:, query_position.to(torch.int).unsqueeze(0)]
|
365
386
|
|
366
|
-
logits = self.
|
387
|
+
logits = self.lm_head(hidden_states)
|
367
388
|
return logits
|
368
389
|
|
369
390
|
|
@@ -455,8 +476,12 @@ class DecoderOnlyModel(nn.Module):
|
|
455
476
|
|
456
477
|
# get cos,sin vector if needed
|
457
478
|
if rotary_emb is not None:
|
458
|
-
|
459
|
-
|
479
|
+
if isinstance(rotary_emb, torch.Tensor):
|
480
|
+
cos = rotary_emb[0]
|
481
|
+
sin = rotary_emb[1]
|
482
|
+
else:
|
483
|
+
cos, sin = rotary_emb(hidden_states, self.max_seq_len) # dtype carrier, max_seq_len
|
484
|
+
cos, sin = slice_and_unsqueeze_cos_sin(cos, sin, cache_position)
|
460
485
|
else:
|
461
486
|
batch_size = inputs_embeds.shape[0]
|
462
487
|
if cache_position.shape[0] > 1:
|
@@ -833,7 +858,6 @@ def rotate_half(x):
|
|
833
858
|
|
834
859
|
def apply_rotary_pos_emb(q, k, cos, sin):
|
835
860
|
"""Applies Rotary Position Embedding to the query and key tensors."""
|
836
|
-
|
837
861
|
q_embed = (q * cos) + (rotate_half(q) * sin)
|
838
862
|
k_embed = (k * cos) + (rotate_half(k) * sin)
|
839
863
|
return q_embed, k_embed
|