optimum-rbln 0.8.1rc0__py3-none-any.whl → 0.8.2__py3-none-any.whl
- optimum/rbln/__init__.py +58 -9
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +24 -5
- optimum/rbln/diffusers/configurations/models/__init__.py +1 -1
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +5 -3
- optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +2 -2
- optimum/rbln/diffusers/configurations/models/{configuration_cosmos_transformer.py → configuration_transformer_cosmos.py} +7 -2
- optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +3 -3
- optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +10 -6
- optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +4 -4
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +2 -2
- optimum/rbln/diffusers/modeling_diffusers.py +4 -5
- optimum/rbln/diffusers/models/__init__.py +3 -13
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +1 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1 -0
- optimum/rbln/diffusers/models/autoencoders/vq_model.py +1 -0
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +1 -1
- optimum/rbln/diffusers/pipelines/__init__.py +1 -5
- optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +12 -4
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +4 -26
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +2 -2
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +2 -2
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -6
- optimum/rbln/modeling.py +4 -5
- optimum/rbln/modeling_base.py +18 -14
- optimum/rbln/ops/kv_cache_update.py +5 -0
- optimum/rbln/ops/linear.py +7 -0
- optimum/rbln/transformers/__init__.py +60 -0
- optimum/rbln/transformers/configuration_generic.py +4 -4
- optimum/rbln/transformers/modeling_attention_utils.py +252 -0
- optimum/rbln/transformers/modeling_generic.py +1 -4
- optimum/rbln/transformers/models/__init__.py +45 -30
- optimum/rbln/transformers/models/bart/bart_architecture.py +2 -7
- optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +2 -2
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +1 -5
- optimum/rbln/transformers/models/clip/configuration_clip.py +14 -3
- optimum/rbln/transformers/models/clip/modeling_clip.py +123 -28
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +1 -4
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +2 -2
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +2 -10
- optimum/rbln/transformers/models/decoderonly/__init__.py +2 -2
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +214 -45
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +323 -454
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +579 -362
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +17 -42
- optimum/rbln/transformers/models/gemma/__init__.py +2 -2
- optimum/rbln/transformers/models/gemma/configuration_gemma.py +9 -1
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +3 -44
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +22 -1
- optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +21 -9
- optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +9 -63
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +200 -292
- optimum/rbln/transformers/models/gpt2/__init__.py +2 -2
- optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +31 -3
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +19 -24
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +18 -1
- optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +2 -2
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -9
- optimum/rbln/transformers/models/llama/__init__.py +2 -2
- optimum/rbln/transformers/models/llama/configuration_llama.py +9 -1
- optimum/rbln/transformers/models/llama/modeling_llama.py +22 -1
- optimum/rbln/transformers/models/llava/__init__.py +16 -0
- optimum/rbln/transformers/models/llava/configuration_llava.py +54 -0
- optimum/rbln/transformers/models/llava/modeling_llava.py +419 -0
- optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +20 -3
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +6 -16
- optimum/rbln/transformers/models/midm/midm_architecture.py +14 -22
- optimum/rbln/transformers/models/mistral/__init__.py +2 -2
- optimum/rbln/transformers/models/mistral/configuration_mistral.py +9 -1
- optimum/rbln/transformers/models/mistral/mistral_architecture.py +1 -1
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +26 -3
- optimum/rbln/transformers/models/opt/__init__.py +2 -2
- optimum/rbln/transformers/models/opt/configuration_opt.py +8 -1
- optimum/rbln/transformers/models/opt/modeling_opt.py +41 -1
- optimum/rbln/transformers/models/opt/opt_architecture.py +16 -25
- optimum/rbln/transformers/models/pegasus/__init__.py +17 -0
- optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +34 -0
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +69 -0
- optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +161 -0
- optimum/rbln/transformers/models/phi/__init__.py +2 -2
- optimum/rbln/transformers/models/phi/configuration_phi.py +9 -1
- optimum/rbln/transformers/models/phi/modeling_phi.py +10 -1
- optimum/rbln/transformers/models/phi/phi_architecture.py +16 -22
- optimum/rbln/transformers/models/pixtral/__init__.py +16 -0
- optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +43 -0
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +315 -0
- optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +73 -0
- optimum/rbln/transformers/models/qwen2/__init__.py +2 -2
- optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +9 -1
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +27 -1
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +3 -3
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -15
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +1 -4
- optimum/rbln/transformers/models/qwen3/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +71 -0
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +133 -0
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +31 -0
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -12
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +3 -1
- optimum/rbln/transformers/models/siglip/__init__.py +2 -6
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +2 -2
- optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +2 -2
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -5
- optimum/rbln/transformers/models/whisper/configuration_whisper.py +3 -12
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +8 -2
- optimum/rbln/transformers/models/xlm_roberta/__init__.py +2 -8
- optimum/rbln/utils/depreacate_utils.py +16 -0
- optimum/rbln/utils/hub.py +8 -47
- optimum/rbln/utils/runtime_utils.py +31 -5
- {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/METADATA +1 -1
- {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/RECORD +120 -103
- {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/qwen3/modeling_qwen3.py ADDED
@@ -0,0 +1,133 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from transformers import PretrainedConfig
+
+from ....utils import logging
+from ...models.decoderonly import (
+    RBLNDecoderOnlyModel,
+    RBLNDecoderOnlyModelForCausalLM,
+    RBLNDecoderOnlyModelForCausalLMConfig,
+)
+from .qwen3_architecture import Qwen3Wrapper
+
+
+logger = logging.get_logger(__name__)
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
+
+class RBLNQwen3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
+    """
+    The Qwen3 Model transformer with a language modeling head (linear layer) on top.
+    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    A class to convert and run pre-trained transformers based Qwen3ForCausalLM model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers Qwen3ForCausalLM model into a RBLN transformer model by:
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+    **Configuration:**
+    This model uses [`RBLNQwen3ForCausalLMConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNQwen3ForCausalLMConfig`] or a dictionary conforming to its structure.
+    See the [`RBLNQwen3ForCausalLMConfig`] class for all available configuration options.
+    Examples:
+        ```python
+        from optimum.rbln import RBLNQwen3ForCausalLM
+        # Simple usage using rbln_* arguments
+        # `max_seq_len` is automatically inferred from the model config
+        model = RBLNQwen3ForCausalLM.from_pretrained(
+            "Qwen/Qwen3-4B",
+            export=True,
+            rbln_batch_size=1,
+            rbln_tensor_parallel_size=4,
+        )
+        # Using a config dictionary
+        rbln_config = {
+            "batch_size": 1,
+            "max_seq_len": 40_960,
+            "tensor_parallel_size": 4,
+            "kvcache_partition_len": 8192,
+        }
+        model = RBLNQwen3ForCausalLM.from_pretrained(
+            "Qwen/Qwen3-4B",
+            export=True,
+            rbln_config=rbln_config
+        )
+        # Using a RBLNQwen3ForCausalLMConfig instance (recommended for type checking)
+        from optimum.rbln import RBLNQwen3ForCausalLMConfig
+        config = RBLNQwen3ForCausalLMConfig(
+            batch_size=1,
+            max_seq_len=40_960,
+            tensor_parallel_size=4,
+            kvcache_partition_len=8192,
+        )
+        model = RBLNQwen3ForCausalLM.from_pretrained(
+            "Qwen/Qwen3-4B",
+            export=True,
+            rbln_config=config
+        )
+        ```
+    """
+
+    _decoder_wrapper_cls = Qwen3Wrapper
+
+    @classmethod
+    def _update_sliding_window_config(
+        cls, model_config: PretrainedConfig, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig
+    ):
+        # https://github.com/huggingface/transformers/issues/35896
+        # There seems to be a bug in transformers(v4.52.4). Therefore, similar to when attn_implementation is eager,
+        # we set all layers to use sliding window in this version. This should be updated once the bug is fixed.
+
+        rbln_config.cache_impl = "sliding_window"
+        rbln_config.sliding_window = model_config.sliding_window
+        rbln_config.sliding_window_layers = list(range(model_config.num_hidden_layers))
+        return rbln_config
+
+    def forward(self, *args, **kwargs):
+        kwargs["return_dict"] = True
+        return super().forward(*args, **kwargs)
+
+
+class RBLNQwen3Model(RBLNDecoderOnlyModel):
+    """
+    The bare Qwen3 Model outputting raw hidden-states without any specific head on top.
+    This model inherits from [`RBLNDecoderOnlyModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    A class to convert and run pre-trained transformers based Qwen3Model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers Qwen3Model into a RBLN transformer model by:
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+    **Configuration:**
+    This model uses [`RBLNQwen3ModelConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNQwen3ModelConfig`] or a dictionary conforming to its structure.
+    See the [`RBLNQwen3ModelConfig`] class for all available configuration options.
+    Examples:
+        ```python
+        from optimum.rbln import RBLNQwen3Model
+        # Simple usage using rbln_* arguments
+        # `max_seq_len` is automatically inferred from the model config
+        model = RBLNQwen3Model.from_pretrained(
+            "Qwen/Qwen3-Embedding-4B",
+            export=True,
+            rbln_batch_size=1,
+            rbln_max_seq_len=40_960,
+            rbln_tensor_parallel_size=4,
+            rbln_kvcache_partition_len=8192,
+        )
+        ```
+    """
+
+    _decoder_wrapper_cls = Qwen3Wrapper
+    _use_rotary_emb = True
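The docstrings above double as the usage documentation for the new Qwen3 support. For completeness, a minimal end-to-end inference sketch; hedged: the tokenizer setup and `generate` call follow the standard `transformers` API and are not part of this diff.

```python
from transformers import AutoTokenizer
from optimum.rbln import RBLNQwen3ForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
model = RBLNQwen3ForCausalLM.from_pretrained(
    "Qwen/Qwen3-4B",
    export=True,  # compile the checkpoint for the RBLN NPU
    rbln_batch_size=1,
    rbln_tensor_parallel_size=4,
)
inputs = tokenizer("What is an NPU?", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```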
optimum/rbln/transformers/models/qwen3/qwen3_architecture.py ADDED
@@ -0,0 +1,31 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ..decoderonly.decoderonly_architecture import DecoderOnlyAttention, DecoderOnlyWrapper
+
+
+class Qwen3Wrapper(DecoderOnlyWrapper):
+    def get_rbln_attn_class(self):
+        return Qwen3Attention
+
+
+class Qwen3Attention(DecoderOnlyAttention):
+    def __post_init__(self):
+        self.k_proj = self._original_mod.k_proj
+        self.v_proj = self._original_mod.v_proj
+        self.q_proj = self._original_mod.q_proj
+        self.o_proj = self._original_mod.o_proj
+        self.q_norm = self._original_mod.q_norm
+        self.k_norm = self._original_mod.k_norm
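The only Qwen3-specific wiring is `__post_init__` binding the extra `q_norm`/`k_norm` submodules: unlike Qwen2, Qwen3 applies RMSNorm to the query and key heads before rotary embeddings. A hedged sketch of the computation those bound modules take part in; shapes and the helper name are illustrative, not from this diff.

```python
import torch

def project_qk(x: torch.Tensor, attn, head_dim: int):
    # Qwen3-style QK-norm: normalize over head_dim after reshaping to heads,
    # before rotary embeddings are applied (attn is a Qwen3Attention above).
    bsz, seq, _ = x.shape
    q = attn.q_norm(attn.q_proj(x).view(bsz, seq, -1, head_dim))
    k = attn.k_norm(attn.k_proj(x).view(bsz, seq, -1, head_dim))
    return q, k
```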
optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py CHANGED
@@ -12,9 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any,
-
-import rebel
+from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 from ....utils.logging import get_logger
@@ -31,7 +29,7 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
         dec_max_seq_len: Optional[int] = None,
         use_attention_mask: Optional[bool] = None,
         pad_token_id: Optional[int] = None,
-        **kwargs:
+        **kwargs: Any,
     ):
         """
         Args:
@@ -39,7 +37,6 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
             enc_max_seq_len (Optional[int]): Maximum sequence length for the encoder.
             dec_max_seq_len (Optional[int]): Maximum sequence length for the decoder.
             use_attention_mask (Optional[bool]): Whether to use attention masks during inference.
-                This is automatically set to True for RBLN-CA02 devices.
             pad_token_id (Optional[int]): The ID of the padding token in the vocabulary.
             **kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
@@ -55,12 +52,5 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
         self.dec_max_seq_len = dec_max_seq_len
 
         self.use_attention_mask = use_attention_mask
-        npu = self.npu or rebel.get_npu_name()
-        if npu == "RBLN-CA02":
-            if self.use_attention_mask is False:
-                logger.warning("Attention mask should be used with RBLN-CA02. Setting use_attention_mask to True.")
-            self.use_attention_mask = True
-        else:
-            self.use_attention_mask = self.use_attention_mask or False
 
         self.pad_token_id = pad_token_id
optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py CHANGED
@@ -38,7 +38,7 @@ if TYPE_CHECKING:
 class RBLNRuntimeEncoder(RBLNPytorchRuntime):
     mandatory_members = ["main_input_name"]
 
-    def forward(self, *args: List[torch.Tensor], **kwargs:
+    def forward(self, *args: List[torch.Tensor], **kwargs: torch.Tensor):
         output = super().forward(*args, **kwargs)
         return BaseModelOutput(last_hidden_state=output)
 
@@ -327,12 +327,14 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
                 tensor_type="pt",
                 device=rbln_config.device_map["encoder"],
                 activate_profiler=rbln_config.activate_profiler,
+                timeout=rbln_config.timeout,
             ),
             rebel.Runtime(
                 compiled_models[1],
                 tensor_type="pt",
                 device=rbln_config.device_map["decoder"],
                 activate_profiler=rbln_config.activate_profiler,
+                timeout=rbln_config.timeout,
             ),
         ]
 
optimum/rbln/transformers/models/siglip/__init__.py CHANGED
@@ -12,9 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .configuration_siglip import
-
-)
-from .modeling_siglip import (
-    RBLNSiglipVisionModel,
-)
+from .configuration_siglip import RBLNSiglipVisionModelConfig
+from .modeling_siglip import RBLNSiglipVisionModel
optimum/rbln/transformers/models/siglip/modeling_siglip.py CHANGED
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, Optional, Tuple, Union
 
 import torch
 from transformers import SiglipVisionConfig, SiglipVisionModel
@@ -126,7 +126,7 @@ class RBLNSiglipVisionModel(RBLNModel):
         output_attentions: bool = None,
         output_hidden_states: bool = None,
         interpolate_pos_encoding: bool = False,
-        **kwargs:
+        **kwargs: Any,
     ) -> Union[Tuple, BaseModelOutputWithPooling]:
         if len(kwargs) > 0 and any(value is not None for value in kwargs.values()):
             logger.warning(
optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Any,
+from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 
@@ -17,7 +17,7 @@ class RBLNTimeSeriesTransformerForPredictionConfig(RBLNModelConfig):
         enc_max_seq_len: Optional[int] = None,
         dec_max_seq_len: Optional[int] = None,
         num_parallel_samples: Optional[int] = None,
-        **kwargs:
+        **kwargs: Any,
     ):
         """
         Args:
optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py CHANGED
@@ -30,11 +30,7 @@ from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, Union
 import rebel
 import torch
 from rebel.compile_context import CompileContext
-from transformers import
-    PretrainedConfig,
-    TimeSeriesTransformerForPrediction,
-    TimeSeriesTransformerModel,
-)
+from transformers import PretrainedConfig, TimeSeriesTransformerForPrediction, TimeSeriesTransformerModel
 from transformers.modeling_outputs import ModelOutput, SampleTSPredictionOutput, Seq2SeqTSModelOutput
 from transformers.modeling_utils import no_init_weights
 
@@ -331,12 +327,14 @@ class RBLNTimeSeriesTransformerForPrediction(RBLNModel):
                 tensor_type="pt",
                 device=rbln_config.device_map["encoder"],
                 activate_profiler=rbln_config.activate_profiler,
+                timeout=rbln_config.timeout,
             ),
             rebel.Runtime(
                 compiled_models[1],
                 tensor_type="pt",
                 device=rbln_config.device_map["decoder"],
                 activate_profiler=rbln_config.activate_profiler,
+                timeout=rbln_config.timeout,
             ),
         ]
 
optimum/rbln/transformers/models/whisper/configuration_whisper.py CHANGED
@@ -12,9 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any
-
-import rebel
+from typing import Any
 
 from ....configuration_utils import RBLNModelConfig
 from ....utils.logging import get_logger
@@ -38,14 +36,13 @@ class RBLNWhisperForConditionalGenerationConfig(RBLNModelConfig):
         use_attention_mask: bool = None,
         enc_max_seq_len: int = None,
         dec_max_seq_len: int = None,
-        **kwargs:
+        **kwargs: Any,
     ):
         """
         Args:
             batch_size (int, optional): The batch size for inference. Defaults to 1.
             token_timestamps (bool, optional): Whether to output token timestamps during generation. Defaults to False.
             use_attention_mask (bool, optional): Whether to use attention masks during inference. This is automatically
-                set to True for RBLN-CA02 devices.
             enc_max_seq_len (int, optional): Maximum sequence length for the encoder.
             dec_max_seq_len (int, optional): Maximum sequence length for the decoder.
             **kwargs: Additional arguments passed to the parent RBLNModelConfig.
@@ -64,10 +61,4 @@ class RBLNWhisperForConditionalGenerationConfig(RBLNModelConfig):
         self.dec_max_seq_len = dec_max_seq_len
 
         self.use_attention_mask = use_attention_mask
-
-        if npu == "RBLN-CA02":
-            if self.use_attention_mask is False:
-                logger.warning("Attention mask should be used with RBLN-CA02. Setting use_attention_mask to True.")
-            self.use_attention_mask = True
-        else:
-            self.use_attention_mask = self.use_attention_mask or False
+        self.use_attention_mask = self.use_attention_mask or False
optimum/rbln/transformers/models/whisper/modeling_whisper.py CHANGED
@@ -46,7 +46,7 @@ if TYPE_CHECKING:
 class RBLNRuntimeEncoder(RBLNPytorchRuntime):
     mandatory_members = ["main_input_name"]
 
-    def forward(self, *args: List[torch.Tensor], **kwargs:
+    def forward(self, *args: List[torch.Tensor], **kwargs: torch.Tensor):
         output = super().forward(*args, **kwargs)
         return BaseModelOutput(last_hidden_state=output)
 
@@ -73,6 +73,7 @@ class RBLNRuntimeDecoder(RBLNPytorchRuntime):
         decoder_input_ids: torch.Tensor = None,
         decoder_attention_mask: torch.Tensor = None,
         cache_position: torch.Tensor = None,
+        block_tables: torch.Tensor = None,
     ):
         inputs_bsz = decoder_input_ids.shape[0]
         padded_bsz = self.batch_size - inputs_bsz
@@ -89,11 +90,14 @@ class RBLNRuntimeDecoder(RBLNPytorchRuntime):
             )
             decoder_attention_mask[b_idx, : decoding_step + 1] = 1
 
+        if block_tables is None:
+            block_tables = self.default_block_tables
+
         outputs = super().forward(
             decoder_input_ids,
             decoder_attention_mask if self.use_attention_mask else None,
             cache_position,
-            block_tables=
+            block_tables=block_tables,
         )
 
         if isinstance(outputs, torch.Tensor):
@@ -345,12 +349,14 @@ class RBLNWhisperForConditionalGeneration(RBLNModel, RBLNWhisperGenerationMixin)
                 tensor_type="pt",
                 device=rbln_config.device_map["encoder"],
                 activate_profiler=rbln_config.activate_profiler,
+                timeout=rbln_config.timeout,
             ),
             rebel.Runtime(
                 compiled_models[1],
                 tensor_type="pt",
                 device=rbln_config.device_map["decoder"],
                 activate_profiler=rbln_config.activate_profiler,
+                timeout=rbln_config.timeout,
             ),
         ]
 
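Both runtime constructions above now forward a `timeout` value from the RBLN config; the same change is applied to the seq2seq and time-series models. A hedged usage sketch: `rbln_timeout` follows the documented `rbln_*` kwarg convention, so treat the exact kwarg name as an assumption rather than an API guarantee.

```python
from optimum.rbln import RBLNWhisperForConditionalGeneration

model = RBLNWhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-small",
    export=True,
    rbln_timeout=120,  # assumed to be forwarded to each rebel.Runtime as timeout=
)
```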
optimum/rbln/transformers/models/xlm_roberta/__init__.py CHANGED
@@ -12,14 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .configuration_xlm_roberta import
-
-    RBLNXLMRobertaModelConfig,
-)
-from .modeling_xlm_roberta import (
-    RBLNXLMRobertaForSequenceClassification,
-    RBLNXLMRobertaModel,
-)
+from .configuration_xlm_roberta import RBLNXLMRobertaForSequenceClassificationConfig, RBLNXLMRobertaModelConfig
+from .modeling_xlm_roberta import RBLNXLMRobertaForSequenceClassification, RBLNXLMRobertaModel
 
 
 __all__ = [
optimum/rbln/utils/depreacate_utils.py ADDED
@@ -0,0 +1,16 @@
+from typing import Optional
+
+import rebel
+
+from .logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def warn_deprecated_npu(npu: Optional[str] = None):
+    npu = npu or rebel.get_npu_name()
+    if npu == "RBLN-CA02":
+        logger.warning_once(
+            "Support for the RBLN-CA02 device is provided only up to optimum-rbln v0.8.0 and has reached end of life.",
+        )
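This new helper centralizes the RBLN-CA02 handling that was deleted from the seq2seq and Whisper configuration classes above: instead of silently forcing `use_attention_mask` per config, the library now emits a one-time end-of-life warning. A short usage sketch (call sites are illustrative; the import path matches the new file):

```python
from optimum.rbln.utils.depreacate_utils import warn_deprecated_npu

warn_deprecated_npu()             # resolves the NPU via rebel.get_npu_name()
warn_deprecated_npu("RBLN-CA02")  # logs the end-of-life warning once
```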
optimum/rbln/utils/hub.py CHANGED
@@ -12,59 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
 from pathlib import Path
 from typing import List, Optional, Union
 
-from huggingface_hub import HfApi,
-
-
-class PushToHubMixin:
-    def push_to_hub(
-        self,
-        save_directory: str,
-        repository_id: str,
-        private: Optional[bool] = None,
-        use_auth_token: Union[bool, str] = True,
-    ) -> str:
-        huggingface_token = _get_huggingface_token(use_auth_token)
-        api = HfApi()
-
-        api.create_repo(
-            token=huggingface_token,
-            repo_id=repository_id,
-            exist_ok=True,
-            private=private,
-        )
-        for path, subdirs, files in os.walk(save_directory):
-            for name in files:
-                local_file_path = os.path.join(path, name)
-                _, hub_file_path = os.path.split(local_file_path)
-                # FIXME: when huggingface_hub fixes the return of upload_file
-                try:
-                    api.upload_file(
-                        token=huggingface_token,
-                        repo_id=f"{repository_id}",
-                        path_or_fileobj=os.path.join(os.getcwd(), local_file_path),
-                        path_in_repo=hub_file_path,
-                    )
-                except KeyError:
-                    pass
-                except NameError:
-                    pass
+from huggingface_hub import HfApi, get_token, hf_hub_download
 
 
 def pull_compiled_model_from_hub(
     model_id: Union[str, Path],
     subfolder: str,
-
+    token: Union[bool, str],
     revision: Optional[str],
     cache_dir: Optional[str],
     force_download: bool,
     local_files_only: bool,
 ) -> Path:
     """Pull model files from the HuggingFace Hub."""
-    huggingface_token = _get_huggingface_token(
+    huggingface_token = _get_huggingface_token(token)
     repo_files = list(
         map(
             Path,
@@ -87,7 +51,7 @@ def pull_compiled_model_from_hub(
             repo_id=model_id,
             filename=filename,
             subfolder=subfolder,
-
+            token=token,
             revision=revision,
             cache_dir=cache_dir,
             force_download=force_download,
@@ -113,10 +77,7 @@ def validate_files(
         raise FileExistsError(f"Multiple rbln_config.json files found in {location}. This is not expected.")
 
 
-def _get_huggingface_token(
-    if isinstance(
-        return
-
-        return HfFolder.get_token()
-    else:
-        raise ValueError("`use_auth_token` must be provided to interact with the HuggingFace Hub.")
+def _get_huggingface_token(token: Union[bool, str]) -> str:
+    if isinstance(token, str):
+        return token
+    return get_token()
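Token resolution is now delegated to `huggingface_hub.get_token` instead of the removed `HfFolder` path, and a non-string token no longer raises. A hedged sketch of the resulting behavior (token values are placeholders):

```python
from optimum.rbln.utils.hub import _get_huggingface_token

assert _get_huggingface_token("hf_example") == "hf_example"  # explicit string token wins
_get_huggingface_token(True)  # falls back to the token stored by `hf auth login`, or None
```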
optimum/rbln/utils/runtime_utils.py CHANGED
@@ -12,13 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import re
 import threading
-from typing import Any,
+from typing import Any, List, Optional, Union
 
 import rebel
 import torch
 
 
+def normalize_npu(npu: str) -> str:
+    """Normalize the NPU string by removing the form factor."""
+    match = re.match(r"(RBLN-CA|RBLN-CR)(\d+)", npu)
+    if match:
+        prefix, num = match.groups()
+        if len(num) == 1:
+            # Convert "RBLN-CAx" → "RBLN-CA0"
+            # (e.g., "RBLN-CA2" -> "RBLN-CA0")
+            npu = f"{prefix}0"
+        elif len(num) == 2:
+            # Strip form factor (e.g., "RBLN-CA15" → "RBLN-CA1")
+            npu = f"{prefix}{num[:-1]}"
+    return npu
+
+
 def tp_and_devices_are_ok(
     tensor_parallel_size: Optional[int] = None,
     device: Optional[Union[int, List[int]]] = None,
@@ -58,7 +74,7 @@ def tp_and_devices_are_ok(
     if npu is not None:
         for device_id in device:
             npu_name = rebel.get_npu_name(device_id)
-            if npu_name != npu:
+            if normalize_npu(npu_name) != normalize_npu(npu):
                 return f"Device {device_id} ({npu_name}) is not on the same NPU as {npu}."
 
     return None
@@ -78,7 +94,7 @@ class RBLNPytorchRuntime:
     def __call__(self, *args: Any, **kwds: Any) -> Any:
        return self.forward(*args, **kwds)
 
-    def forward(self, *args: List["torch.Tensor"], **kwargs:
+    def forward(self, *args: List["torch.Tensor"], **kwargs: "torch.Tensor"):
         # filtering useless args or kwarg such as None.
         args = list(filter(lambda arg: isinstance(arg, torch.Tensor), args))
         kwargs = dict(filter(lambda kwarg: isinstance(kwarg[1], torch.Tensor) or kwarg[0] == "out", kwargs.items()))
@@ -126,7 +142,7 @@ class UnavailableRuntime:
         """Returns an iterator with self as the only item."""
         return iter([self])
 
-    def forward(self, *args: List["torch.Tensor"], **kwargs:
+    def forward(self, *args: List["torch.Tensor"], **kwargs: "torch.Tensor"):
         """Raises a detailed RuntimeError explaining why inference cannot be performed."""
         raise RuntimeError(
             "Cannot perform inference: RBLN runtime is not available.\n\n"
@@ -147,13 +163,20 @@ class ContextRblnConfig:
     _local = threading.local()
 
     def __init__(
-        self,
+        self,
+        device=None,
+        device_map=None,
+        create_runtimes=None,
+        optimize_host_mem=None,
+        activate_profiler=None,
+        timeout=None,
     ):
         self.device = device
         self.device_map = device_map
         self.create_runtimes = create_runtimes
         self.optimize_host_mem = optimize_host_mem
         self.activate_profiler = activate_profiler
+        self.timeout = timeout
 
     def __enter__(self):
         self._local.device = self.device
@@ -161,6 +184,7 @@ class ContextRblnConfig:
         self._local.create_runtimes = self.create_runtimes
         self._local.optimize_host_memory = self.optimize_host_mem
         self._local.activate_profiler = self.activate_profiler
+        self._local.timeout = self.timeout
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
@@ -169,6 +193,7 @@ class ContextRblnConfig:
         self._local.create_runtimes = None
         self._local.optimize_host_memory = None
         self._local.activate_profiler = None
+        self._local.timeout = None
 
     @classmethod
     def get_current_context(cls):
@@ -178,4 +203,5 @@ class ContextRblnConfig:
             "create_runtimes": getattr(cls._local, "create_runtimes", None),
             "optimize_host_memory": getattr(cls._local, "optimize_host_memory", None),
             "activate_profiler": getattr(cls._local, "activate_profiler", None),
+            "timeout": getattr(cls._local, "timeout", None),
         }
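`normalize_npu` makes the device-compatibility check in `tp_and_devices_are_ok` compare NPU generations rather than exact part numbers. Its behavior, derived directly from the code above:

```python
normalize_npu("RBLN-CA02")  # -> "RBLN-CA0" (form-factor digit stripped)
normalize_npu("RBLN-CA12")  # -> "RBLN-CA1"
normalize_npu("RBLN-CA2")   # -> "RBLN-CA0" (single digit maps to generation 0)
normalize_npu("RBLN-CR18")  # -> "RBLN-CR1"
```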
{optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: optimum-rbln
-Version: 0.8.1rc0
+Version: 0.8.2
 Summary: Optimum RBLN is the interface between the HuggingFace Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
 Project-URL: Homepage, https://rebellions.ai
 Project-URL: Documentation, https://docs.rbln.ai