optimum-rbln 0.8.2a7__py3-none-any.whl → 0.8.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of optimum-rbln might be problematic.
- optimum/rbln/__init__.py +36 -9
- optimum/rbln/__version__.py +16 -3
- optimum/rbln/configuration_utils.py +20 -4
- optimum/rbln/diffusers/__init__.py +7 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +3 -3
- optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +4 -4
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +2 -2
- optimum/rbln/diffusers/modeling_diffusers.py +1 -1
- optimum/rbln/diffusers/models/__init__.py +3 -13
- optimum/rbln/diffusers/pipelines/__init__.py +11 -5
- optimum/rbln/diffusers/pipelines/auto_pipeline.py +237 -0
- optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +11 -6
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +14 -18
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -1
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -6
- optimum/rbln/modeling.py +3 -2
- optimum/rbln/modeling_base.py +29 -4
- optimum/rbln/ops/attn.py +158 -0
- optimum/rbln/ops/flash_attn.py +166 -0
- optimum/rbln/transformers/__init__.py +28 -0
- optimum/rbln/transformers/configuration_generic.py +6 -4
- optimum/rbln/transformers/modeling_generic.py +13 -8
- optimum/rbln/transformers/modeling_outputs.py +37 -0
- optimum/rbln/transformers/models/__init__.py +35 -16
- optimum/rbln/transformers/models/auto/__init__.py +2 -0
- optimum/rbln/transformers/models/auto/modeling_auto.py +14 -0
- optimum/rbln/transformers/models/bart/bart_architecture.py +1 -3
- optimum/rbln/transformers/models/bart/configuration_bart.py +2 -0
- optimum/rbln/transformers/models/bert/bert_architecture.py +16 -0
- optimum/rbln/transformers/models/bert/modeling_bert.py +8 -4
- optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +2 -2
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +7 -6
- optimum/rbln/transformers/models/clip/configuration_clip.py +3 -3
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +1 -4
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +2 -2
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +2 -10
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +43 -174
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +102 -93
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +450 -0
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +88 -0
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +297 -987
- optimum/rbln/transformers/models/depth_anything/__init__.py +16 -0
- optimum/rbln/transformers/models/depth_anything/configuration_depth_anything.py +24 -0
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +25 -0
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -4
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +9 -0
- optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +14 -3
- optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +217 -0
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +64 -258
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +2 -0
- optimum/rbln/transformers/models/grounding_dino/__init__.py +10 -0
- optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +86 -0
- optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +507 -0
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +1032 -0
- optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +2 -2
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -9
- optimum/rbln/transformers/models/llama/modeling_llama.py +12 -3
- optimum/rbln/transformers/models/llava/configuration_llava.py +2 -2
- optimum/rbln/transformers/models/llava/modeling_llava.py +53 -14
- optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +2 -2
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +6 -16
- optimum/rbln/transformers/models/opt/modeling_opt.py +2 -30
- optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +4 -0
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +2 -0
- optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +1 -3
- optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +2 -2
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +1 -4
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +3 -3
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +6 -15
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +4 -7
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +77 -3
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +1 -4
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +19 -2
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +20 -1
- optimum/rbln/transformers/models/siglip/__init__.py +2 -6
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +2 -2
- optimum/rbln/transformers/models/swin/__init__.py +16 -0
- optimum/rbln/transformers/models/swin/configuration_swin.py +42 -0
- optimum/rbln/transformers/models/swin/modeling_swin.py +341 -0
- optimum/rbln/transformers/models/t5/configuration_t5.py +2 -0
- optimum/rbln/transformers/models/t5/t5_architecture.py +8 -1
- optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +2 -2
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -14
- optimum/rbln/transformers/models/whisper/configuration_whisper.py +10 -2
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +20 -1
- optimum/rbln/transformers/models/xlm_roberta/__init__.py +2 -8
- optimum/rbln/transformers/utils/rbln_quantization.py +365 -65
- optimum/rbln/utils/runtime_utils.py +3 -3
- optimum/rbln/utils/submodule.py +10 -4
- {optimum_rbln-0.8.2a7.dist-info → optimum_rbln-0.8.3.dist-info}/METADATA +1 -1
- {optimum_rbln-0.8.2a7.dist-info → optimum_rbln-0.8.3.dist-info}/RECORD +105 -89
- {optimum_rbln-0.8.2a7.dist-info → optimum_rbln-0.8.3.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.8.2a7.dist-info → optimum_rbln-0.8.3.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/idefics3/configuration_idefics3.py

```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any,
+from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 
@@ -39,7 +39,7 @@ class RBLNIdefics3ForConditionalGenerationConfig(RBLNModelConfig):
         batch_size: Optional[int] = None,
         vision_model: Optional[RBLNModelConfig] = None,
         text_model: Optional[RBLNModelConfig] = None,
-        **kwargs:
+        **kwargs: Any,
    ):
        """
        Args:
```
optimum/rbln/transformers/models/idefics3/modeling_idefics3.py

```diff
@@ -34,17 +34,11 @@ from transformers.models.idefics3.modeling_idefics3 import Idefics3CausalLMOutpu
 from ....configuration_utils import RBLNCompileConfig, RBLNModelConfig
 from ....modeling import RBLNModel
 from ....utils.runtime_utils import RBLNPytorchRuntime
-from
-    RBLNDecoderOnlyForCausalLMOutput,
-)
+from ...modeling_outputs import RBLNDecoderOnlyOutput
 
 
 if TYPE_CHECKING:
-    from transformers import (
-        AutoFeatureExtractor,
-        AutoProcessor,
-        AutoTokenizer,
-    )
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer
 
 
 class RBLNRuntimeVisionModel(RBLNPytorchRuntime):
@@ -494,7 +488,7 @@ class RBLNIdefics3ForConditionalGeneration(RBLNModel):
         if not return_dict:
             return logits, generate_idx
         else:
-            return
+            return RBLNDecoderOnlyOutput(
                 logits=logits,
                 generate_idx=generate_idx,
             )
```
optimum/rbln/transformers/models/llama/modeling_llama.py

```diff
@@ -85,11 +85,20 @@ class RBLNLlamaForCausalLM(RBLNDecoderOnlyModelForCausalLM):
 
 class RBLNLlamaModel(RBLNDecoderOnlyModel):
     """
-    The Llama Model transformer
+    The Llama Model transformer outputting raw hidden-states without any specific head on top.
     This model inherits from [`RBLNDecoderOnlyModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
 
-    A class to convert and run pre-trained transformers based LlamaModel
-    It implements the methods to convert a pre-trained transformers LlamaModel
+    A class to convert and run pre-trained transformers based LlamaModel on RBLN devices.
+    It implements the methods to convert a pre-trained transformers LlamaModel into a RBLN transformer model by:
+
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+
+    **Configuration:**
+    This model uses [`RBLNLlamaModelConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNLlamaModelConfig`] or a dictionary conforming to its structure.
+
+    See the [`RBLNLlamaModelConfig`] class for all available configuration options.
     """
 
     _decoder_wrapper_cls = LlamaWrapper
```
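The rewritten `RBLNLlamaModel` docstring points readers at `RBLNLlamaModelConfig` but, unlike the Qwen3 hunks later in this diff, carries no usage example. Below is a minimal sketch of the documented flow, assuming the same `rbln_*` keyword convention the Qwen3 docstrings use; the checkpoint id and parallelism values are illustrative, not taken from this release:

```python
# Hedged sketch only; mirrors the rbln_* argument style shown in the Qwen3
# docstrings added by this release. Checkpoint id and sizes are assumptions.
from optimum.rbln import RBLNLlamaModel

model = RBLNLlamaModel.from_pretrained(
    "meta-llama/Llama-3.1-8B",   # illustrative checkpoint
    export=True,                 # convert and compile to an RBLN graph
    rbln_batch_size=1,
    rbln_tensor_parallel_size=4,
)
model.save_pretrained("compiled-llama")  # persist the compiled artifact
```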
optimum/rbln/transformers/models/llava/configuration_llava.py

```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any,
+from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 
@@ -33,7 +33,7 @@ class RBLNLlavaForConditionalGenerationConfig(RBLNModelConfig):
         batch_size: Optional[int] = None,
         vision_tower: Optional[RBLNModelConfig] = None,
         language_model: Optional[RBLNModelConfig] = None,
-        **kwargs:
+        **kwargs: Any,
    ):
        """
        Args:
```
optimum/rbln/transformers/models/llava/modeling_llava.py

````diff
@@ -16,30 +16,20 @@ import inspect
 from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, Union
 
 import torch
-from transformers import (
-    AutoModelForImageTextToText,
-    LlavaForConditionalGeneration,
-    PretrainedConfig,
-    PreTrainedModel,
-)
+from transformers import AutoModelForImageTextToText, LlavaForConditionalGeneration, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast
 
 from ....configuration_utils import RBLNCompileConfig, RBLNModelConfig
 from ....modeling import RBLNModel
 from ....utils.logging import get_logger
-from
+from ...modeling_outputs import RBLNDecoderOnlyOutput
 
 
 logger = get_logger(__name__)
 
 if TYPE_CHECKING:
-    from transformers import (
-        AutoFeatureExtractor,
-        AutoProcessor,
-        AutoTokenizer,
-        PretrainedConfig,
-    )
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PretrainedConfig
 
 
 class LoopVisionTower:
@@ -111,6 +101,55 @@ class LoopProjector:
 
 
 class RBLNLlavaForConditionalGeneration(RBLNModel):
+    """
+    RBLNLlavaForConditionalGeneration is a multi-modal model that combines vision and language processing capabilities,
+    optimized for RBLN NPUs. It is designed for conditional generation tasks that involve both image and text inputs.
+    This model inherits from [`RBLNModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    Important Note:
+        This model includes a Large Language Model (LLM) as a submodule. For optimal performance, it is highly recommended to use
+        tensor parallelism for the language model. This can be achieved by using the `rbln_config` parameter in the
+        `from_pretrained` method. Refer to the `from_pretrained` documentation and the RBLNLlavaForConditionalGeneration class for details.
+    Examples:
+        ```python
+        from optimum.rbln import RBLNLlavaForConditionalGeneration
+        model = RBLNLlavaForConditionalGeneration.from_pretrained(
+            "llava-hf/llava-1.5-7b-hf",
+            export=True,
+            rbln_config={
+                "vision_tower": {"output_hidden_states": True},
+                "language_model": {
+                    "tensor_parallel_size": 4,
+                    "use_inputs_embeds": True,  # In Llava, language model must use inputs_embeds as input.
+                },
+            },
+        )
+        model.save_pretrained("compiled-llava-1.5-7b-hf")
+
+        # Using a RBLNLlavaForConditionalGenerationConfig instance (recommended for type checking)
+        from optimum.rbln import RBLNLlavaForConditionalGenerationConfig
+        vision_config = RBLNCLIPVisionModelConfig(
+            batch_size=1,
+            output_hidden_states=True
+        )
+        language_model_config = RBLNLlamaForCausalLMConfig(
+            batch_size=1,
+            max_seq_len=4096,
+            use_inputs_embeds=True,
+            tensor_parallel_size=4
+        )
+        llava_config = RBLNLlavaForConditionalGenerationConfig(
+            batch_size=1,
+            vision_tower=vision_config,
+            language_model=language_model_config
+        )
+        model = RBLNLlavaForConditionalGeneration.from_pretrained(
+            "llava-hf/llava-1.5-7b-hf",
+            export=True,
+            rbln_config=llava_config
+        )
+        ```
+    """
+
     auto_model_class = AutoModelForImageTextToText
     _rbln_submodules = [
         {"name": "vision_tower"},
@@ -374,7 +413,7 @@ class RBLNLlavaForConditionalGeneration(RBLNModel):
         if not return_dict:
             return logits, generate_idx
         else:
-            return
+            return RBLNDecoderOnlyOutput(
                 logits=logits,
                 generate_idx=generate_idx,
             )
````
optimum/rbln/transformers/models/llava_next/configuration_llava_next.py

```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any,
+from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 from ....utils.logging import get_logger
@@ -38,7 +38,7 @@ class RBLNLlavaNextForConditionalGenerationConfig(RBLNModelConfig):
         batch_size: Optional[int] = None,
         vision_tower: Optional[RBLNModelConfig] = None,
         language_model: Optional[RBLNModelConfig] = None,
-        **kwargs:
+        **kwargs: Any,
    ):
        """
        Args:
```
optimum/rbln/transformers/models/llava_next/modeling_llava_next.py

```diff
@@ -18,29 +18,19 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
 
 import numpy as np
 import torch
-from transformers import (
-    AutoModelForVision2Seq,
-    LlavaNextForConditionalGeneration,
-    PretrainedConfig,
-    PreTrainedModel,
-)
+from transformers import AutoModelForVision2Seq, LlavaNextForConditionalGeneration, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 
 from ....configuration_utils import RBLNCompileConfig, RBLNModelConfig
 from ....modeling import RBLNModel
 from ....utils.logging import get_logger
-from ..decoderonly.modeling_decoderonly import
+from ..decoderonly.modeling_decoderonly import RBLNDecoderOnlyOutput
 
 
 logger = get_logger(__name__)
 
 if TYPE_CHECKING:
-    from transformers import (
-        AutoFeatureExtractor,
-        AutoProcessor,
-        AutoTokenizer,
-        PretrainedConfig,
-    )
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PretrainedConfig
 
 
 class LoopVisionTower:
@@ -258,7 +248,7 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
 
     def _update_model_kwargs_for_generation(
         self,
-        outputs:
+        outputs: RBLNDecoderOnlyOutput,
         model_kwargs: Dict[str, Any],
         **kwargs,
     ) -> Dict[str, Any]:
@@ -359,7 +349,7 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
         generate_idx: Optional[torch.Tensor] = None,
         batch_idx: Optional[int] = None,
         **kwargs,
-    ) -> Union[Tuple,
+    ) -> Union[Tuple, RBLNDecoderOnlyOutput]:
         vision_feature_layer = (
             vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
         )
@@ -418,7 +408,7 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
             cache_position=cache_position,
         )
         logits = output.logits
-        return
+        return RBLNDecoderOnlyOutput(logits=logits, generate_idx=generate_idx)
 
     # Almost copied from : https://github.com/huggingface/transformers/blob/6b550462139655d488d4c663086a63e98713c6b9/src/transformers/models/llava_next/modeling_llava_next.py
     def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
```
optimum/rbln/transformers/models/opt/modeling_opt.py

```diff
@@ -70,24 +70,10 @@ class RBLNOPTForCausalLM(RBLNDecoderOnlyModelForCausalLM):
 
     @classmethod
     def wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
-        wrapper_cfg = {
-            "max_seq_len": rbln_config.max_seq_len,
-            "attn_impl": rbln_config.attn_impl,
-            "kvcache_partition_len": rbln_config.kvcache_partition_len,
-            "kvcache_block_size": rbln_config.kvcache_block_size,
-            "use_rotary_emb": cls._use_rotary_emb,
-            "use_attention_mask": rbln_config.use_attention_mask,
-            "use_position_ids": rbln_config.use_position_ids,
-            "use_inputs_embeds": rbln_config.use_inputs_embeds,
-            "cache_impl": rbln_config.cache_impl,
-            "sliding_window": rbln_config.sliding_window,
-            "sliding_window_layers": rbln_config.sliding_window_layers,
-        }
-
         for i in range(len(model.model.decoder.layers)):
             model.model.decoder.layers[i] = cls.modify_opt_decoder_layer(model.model.decoder.layers[i])
 
-        return cls._decoder_wrapper_cls(model,
+        return cls._decoder_wrapper_cls(model, rbln_config=rbln_config, use_rotary_emb=cls._use_rotary_emb).eval()
 
 
 class RBLNOPTModel(RBLNDecoderOnlyModel):
@@ -110,21 +96,7 @@ class RBLNOPTModel(RBLNDecoderOnlyModel):
 
     @classmethod
     def wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
-        wrapper_cfg = {
-            "max_seq_len": rbln_config.max_seq_len,
-            "attn_impl": rbln_config.attn_impl,
-            "kvcache_partition_len": rbln_config.kvcache_partition_len,
-            "kvcache_block_size": rbln_config.kvcache_block_size,
-            "use_rotary_emb": cls._use_rotary_emb,
-            "use_attention_mask": rbln_config.use_attention_mask,
-            "use_position_ids": rbln_config.use_position_ids,
-            "use_inputs_embeds": rbln_config.use_inputs_embeds,
-            "cache_impl": rbln_config.cache_impl,
-            "sliding_window": rbln_config.sliding_window,
-            "sliding_window_layers": rbln_config.sliding_window_layers,
-        }
-
         for i in range(len(model.decoder.layers)):
             model.decoder.layers[i] = cls.modify_opt_decoder_layer(model.decoder.layers[i])
 
-        return cls._decoder_wrapper_cls(model,
+        return cls._decoder_wrapper_cls(model, rbln_config=rbln_config, use_rotary_emb=cls._use_rotary_emb).eval()
```
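Both OPT hunks delete a hand-assembled `wrapper_cfg` dict and hand the whole `rbln_config` to the wrapper instead. Below is a sketch of what that implies on the wrapper side, assuming the wrapper simply stores the config and reads flags from it on demand, as the Qwen2.5-VL architecture hunk further down does with `self.rbln_config.use_attention_mask`; the class and method here are illustrative stand-ins, not the library's actual `DecoderOnlyWrapper`:

```python
import torch

# Illustrative stand-in for the refactored wrapper: one config object
# replaces eleven copied-out fields (max_seq_len, attn_impl, cache_impl, ...).
class WrapperSketch(torch.nn.Module):
    def __init__(self, model, rbln_config, use_rotary_emb):
        super().__init__()
        self.model = model
        self.rbln_config = rbln_config        # single source of truth
        self.use_rotary_emb = use_rotary_emb  # still passed explicitly

    def forward(self, *args):
        args = list(args)
        # Flags are read from the config at call time instead of being
        # snapshotted into a dict when the wrapper is constructed.
        mask = args.pop(0) if self.rbln_config.use_attention_mask else None
        return self.model(*args, attention_mask=mask)
```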
optimum/rbln/transformers/models/pegasus/configuration_pegasus.py

```diff
@@ -24,6 +24,8 @@ class RBLNPegasusModelConfig(RBLNTransformerEncoderForFeatureExtractionConfig):
     RBLN-optimized PEGASUS models for feature extraction tasks.
     """
 
+    rbln_model_input_names = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"]
+
 
 class RBLNPegasusForConditionalGenerationConfig(RBLNModelForSeq2SeqLMConfig):
     """
@@ -32,3 +34,5 @@ class RBLNPegasusForConditionalGenerationConfig(RBLNModelForSeq2SeqLMConfig):
     This configuration class stores the configuration parameters specific to
     RBLN-optimized PEGASUS models for conditional text generation tasks.
     """
+
+    support_paged_attention = True
```
optimum/rbln/transformers/models/pegasus/modeling_pegasus.py

```diff
@@ -39,6 +39,8 @@ class RBLNPegasusModel(RBLNTransformerEncoderForFeatureExtraction):
     on RBLN devices, optimized for feature extraction use cases.
     """
 
+    rbln_model_input_names = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"]
+
 
 class RBLNPegasusForConditionalGeneration(RBLNModelForSeq2SeqLM):
     """
```
optimum/rbln/transformers/models/pegasus/pegasus_architecture.py

```diff
@@ -16,9 +16,7 @@ from typing import Tuple
 
 import torch
 from torch import nn
-from transformers.modeling_attn_mask_utils import (
-    _prepare_4d_attention_mask,
-)
+from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
 from transformers.utils import logging
 
 from ..seq2seq.seq2seq_architecture import (
```
optimum/rbln/transformers/models/pixtral/configuration_pixtral.py

```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any,
+from typing import Any, Optional, Tuple
 
 from ....configuration_utils import RBLNModelConfig
 
@@ -23,7 +23,7 @@ class RBLNPixtralVisionModelConfig(RBLNModelConfig):
         max_image_size: Tuple = None,
         batch_size: Optional[int] = None,
         output_hidden_states: Optional[bool] = None,
-        **kwargs:
+        **kwargs: Any,
    ):
        """
        Args:
```
optimum/rbln/transformers/models/pixtral/modeling_pixtral.py

```diff
@@ -21,10 +21,7 @@ import torch.nn as nn
 from transformers import PixtralVisionConfig, PixtralVisionModel
 from transformers.modeling_outputs import BaseModelOutput
 from transformers.modeling_utils import no_init_weights
-from transformers.models.pixtral.modeling_pixtral import (
-    PixtralRMSNorm,
-    PixtralRotaryEmbedding,
-)
+from transformers.models.pixtral.modeling_pixtral import PixtralRMSNorm, PixtralRotaryEmbedding
 
 from ....configuration_utils import RBLNCompileConfig, RBLNModelConfig
 from ....modeling import RBLNModel
```
optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py

```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any,
+from typing import Any, List, Optional, Union
 
 from ....configuration_utils import RBLNModelConfig
 from ..decoderonly.configuration_decoderonly import RBLNDecoderOnlyModelForCausalLMConfig
@@ -33,7 +33,7 @@ class RBLNQwen2_5_VLForConditionalGenerationConfig(RBLNDecoderOnlyModelForCausal
         self,
         visual: Optional[RBLNModelConfig] = None,
         use_inputs_embeds: bool = True,
-        **kwargs:
+        **kwargs: Any,
     ):
         super().__init__(use_inputs_embeds=use_inputs_embeds, **kwargs)
         if not self.use_inputs_embeds:
@@ -53,7 +53,7 @@ class RBLNQwen2_5_VisionTransformerPretrainedModelConfig(RBLNModelConfig):
     mechanisms for processing images and videos.
     """
 
-    def __init__(self, max_seq_lens: Union[int, List[int]] = None, **kwargs:
+    def __init__(self, max_seq_lens: Union[int, List[int]] = None, **kwargs: Any):
        """
        Args:
            max_seq_lens (Optional[Union[int, List[int]]]): Maximum sequence lengths for Vision
```
optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py

```diff
@@ -17,12 +17,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, Union
 
 import torch
-from transformers import (
-    AutoModelForVision2Seq,
-    PretrainedConfig,
-    PreTrainedModel,
-    Qwen2_5_VLForConditionalGeneration,
-)
+from transformers import AutoModelForVision2Seq, PretrainedConfig, PreTrainedModel, Qwen2_5_VLForConditionalGeneration
 from transformers.modeling_utils import no_init_weights
 from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
     Qwen2_5_VisionPatchEmbed,
@@ -34,7 +29,8 @@ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
 from ....configuration_utils import RBLNCompileConfig
 from ....modeling import RBLNModel
 from ....utils.logging import get_logger
-from
+from ...modeling_outputs import RBLNDecoderOnlyOutput
+from ..decoderonly.modeling_decoderonly import RBLNDecoderOnlyModelForCausalLM
 from .configuration_qwen2_5_vl import (
     RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
     RBLNQwen2_5_VLForConditionalGenerationConfig,
@@ -45,12 +41,7 @@ from .qwen2_5_vl_architecture import Qwen2_5_VisionTransformerWrapper, Qwen2_5_V
 logger = get_logger(__name__)
 
 if TYPE_CHECKING:
-    from transformers import (
-        AutoFeatureExtractor,
-        AutoProcessor,
-        AutoTokenizer,
-        PretrainedConfig,
-    )
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PretrainedConfig
 
 
 class RBLNQwen2_5_VisionTransformerPretrainedModel(RBLNModel):
@@ -595,7 +586,7 @@ class RBLNQwen2_5_VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
         generate_idx: Optional[torch.Tensor] = None,
         return_dict: Optional[bool] = None,
         **kwargs,
-    ) ->
+    ) -> RBLNDecoderOnlyOutput:
         # Prefill
         if cache_position is None:
             inputs_embeds, position_embed, rope_deltas = self._preprocess_prefill(
@@ -637,7 +628,7 @@ class RBLNQwen2_5_VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
         if not return_dict:
             return logits, generate_idx
         else:
-            return
+            return RBLNDecoderOnlyOutput(
                 logits=logits,
                 generate_idx=generate_idx,
             )
```
optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py

```diff
@@ -4,10 +4,7 @@ from typing import Tuple
 import torch
 import torch.nn as nn
 
-from ..decoderonly.decoderonly_architecture import (
-    DecoderOnlyWrapper,
-    apply_rotary_pos_emb,
-)
+from ..decoderonly.decoderonly_architecture import DecoderOnlyWrapper, apply_rotary_pos_emb
 
 
 class Qwen2_5_VisionTransformerWrapper(nn.Module):
@@ -159,15 +156,15 @@ class Qwen2_5_VLVisionWindowAttention(nn.Module):
 class Qwen2_5_VL_LanguageModelWrapper(DecoderOnlyWrapper):
     def prepare_forward_args(self, *args):
         args = list(args)
-        input_ids = None if self.use_inputs_embeds else args.pop(0)
-        inputs_embeds = args.pop(0) if self.use_inputs_embeds else None
+        input_ids = None if self.rbln_config.use_inputs_embeds else args.pop(0)
+        inputs_embeds = args.pop(0) if self.rbln_config.use_inputs_embeds else None
         cache_position = args.pop(0)
         global_block_tables = args.pop(0)
         local_block_tables = None
         position_embeds = args.pop(0)
         query_position = args.pop(0) if self.phase == "prefill" else None
         position_ids = None
-        attention_mask = args.pop(0) if self.use_attention_mask else None
+        attention_mask = args.pop(0) if self.rbln_config.use_attention_mask else None
         past_key_values = args
 
         if len(past_key_values) != 2 * self.num_hidden_layers:
```
optimum/rbln/transformers/models/qwen3/modeling_qwen3.py

````diff
@@ -28,12 +28,60 @@ from .qwen3_architecture import Qwen3Wrapper
 logger = logging.get_logger(__name__)
 
 if TYPE_CHECKING:
-    from transformers import (
-        PretrainedConfig,
-    )
+    from transformers import PretrainedConfig
 
 
 class RBLNQwen3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
+    """
+    The Qwen3 Model transformer with a language modeling head (linear layer) on top.
+    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    A class to convert and run pre-trained transformers based Qwen3ForCausalLM model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers Qwen3ForCausalLM model into a RBLN transformer model by:
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+    **Configuration:**
+    This model uses [`RBLNQwen3ForCausalLMConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNQwen3ForCausalLMConfig`] or a dictionary conforming to its structure.
+    See the [`RBLNQwen3ForCausalLMConfig`] class for all available configuration options.
+    Examples:
+        ```python
+        from optimum.rbln import RBLNQwen3ForCausalLM
+        # Simple usage using rbln_* arguments
+        # `max_seq_len` is automatically inferred from the model config
+        model = RBLNQwen3ForCausalLM.from_pretrained(
+            "Qwen/Qwen3-4B",
+            export=True,
+            rbln_batch_size=1,
+            rbln_tensor_parallel_size=4,
+        )
+        # Using a config dictionary
+        rbln_config = {
+            "batch_size": 1,
+            "max_seq_len": 40_960,
+            "tensor_parallel_size": 4,
+            "kvcache_partition_len": 8192,
+        }
+        model = RBLNQwen3ForCausalLM.from_pretrained(
+            "Qwen/Qwen3-4B",
+            export=True,
+            rbln_config=rbln_config
+        )
+        # Using a RBLNQwen3ForCausalLMConfig instance (recommended for type checking)
+        from optimum.rbln import RBLNQwen3ForCausalLMConfig
+        config = RBLNQwen3ForCausalLMConfig(
+            batch_size=1,
+            max_seq_len=40_960,
+            tensor_parallel_size=4,
+            kvcache_partition_len=8192,
+        )
+        model = RBLNQwen3ForCausalLM.from_pretrained(
+            "Qwen/Qwen3-4B",
+            export=True,
+            rbln_config=config
+        )
+        ```
+    """
+
     _decoder_wrapper_cls = Qwen3Wrapper
 
     @classmethod
@@ -55,5 +103,31 @@ class RBLNQwen3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
 
 
 class RBLNQwen3Model(RBLNDecoderOnlyModel):
+    """
+    The bare Qwen3 Model outputting raw hidden-states without any specific head on top.
+    This model inherits from [`RBLNDecoderOnlyModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    A class to convert and run pre-trained transformers based Qwen3Model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers Qwen3Model into a RBLN transformer model by:
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+    **Configuration:**
+    This model uses [`RBLNQwen3ModelConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNQwen3ModelConfig`] or a dictionary conforming to its structure.
+    See the [`RBLNQwen3ModelConfig`] class for all available configuration options.
+    Examples:
+        ```python
+        from optimum.rbln import RBLNQwen3Model
+        # Simple usage using rbln_* arguments
+        # `max_seq_len` is automatically inferred from the model config
+        model = RBLNQwen3Model.from_pretrained(
+            "Qwen/Qwen3-Embedding-4B",
+            export=True,
+            rbln_batch_size=1,
+            rbln_max_seq_len=40_960,
+            rbln_tensor_parallel_size=4,
+            rbln_kvcache_partition_len=8192,
+        )
+    """
+
     _decoder_wrapper_cls = Qwen3Wrapper
     _use_rotary_emb = True
````
optimum/rbln/transformers/models/qwen3/qwen3_architecture.py

```diff
@@ -13,10 +13,7 @@
 # limitations under the License.
 
 
-from ..decoderonly.decoderonly_architecture import (
-    DecoderOnlyAttention,
-    DecoderOnlyWrapper,
-)
+from ..decoderonly.decoderonly_architecture import DecoderOnlyAttention, DecoderOnlyWrapper
 
 
 class Qwen3Wrapper(DecoderOnlyWrapper):
```
optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py

```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any,
+from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 from ....utils.logging import get_logger
@@ -22,6 +22,8 @@ logger = get_logger()
 
 
 class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
+    support_paged_attention = None
+
     def __init__(
         self,
         batch_size: Optional[int] = None,
@@ -29,7 +31,9 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
         dec_max_seq_len: Optional[int] = None,
         use_attention_mask: Optional[bool] = None,
         pad_token_id: Optional[int] = None,
-
+        kvcache_num_blocks: Optional[int] = None,
+        kvcache_block_size: Optional[int] = None,
+        **kwargs: Any,
     ):
         """
         Args:
@@ -38,6 +42,10 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
             dec_max_seq_len (Optional[int]): Maximum sequence length for the decoder.
             use_attention_mask (Optional[bool]): Whether to use attention masks during inference.
             pad_token_id (Optional[int]): The ID of the padding token in the vocabulary.
+            kvcache_num_blocks (Optional[int]): The total number of blocks to allocate for the
+                PagedAttention KV cache for the SelfAttention. Defaults to batch_size.
+            kvcache_block_size (Optional[int]): Sets the size (in number of tokens) of each block
+                in the PagedAttention KV cache for the SelfAttention. Defaults to dec_max_seq_len.
             **kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
         Raises:
@@ -54,3 +62,12 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
         self.use_attention_mask = use_attention_mask
 
         self.pad_token_id = pad_token_id
+
+        if self.support_paged_attention:
+            self.kvcache_num_blocks = kvcache_num_blocks
+            self.kvcache_block_size = kvcache_block_size
+        else:
+            if kvcache_num_blocks is not None or kvcache_block_size is not None:
+                raise ValueError(
+                    "You cannot set kvcache_num_blocks or kvcache_block_size as paged attention is not supported for the model."
+                )
```
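Read together with the Pegasus hunks above (`support_paged_attention = True`), the new class attribute acts as an opt-in gate: seq2seq configs that support paged attention accept the `kvcache_*` knobs, while all others reject them at construction time. A self-contained sketch of the gate, distilled from these hunks; the trimmed class bodies are illustrative, not the real implementations:

```python
from typing import Any, Optional

# Distilled from the diff above: RBLNModelForSeq2SeqLMConfig gates the new
# kvcache_* arguments on a class-level support_paged_attention flag.
class Seq2SeqConfigSketch:
    support_paged_attention = None  # subclasses opt in explicitly

    def __init__(
        self,
        kvcache_num_blocks: Optional[int] = None,
        kvcache_block_size: Optional[int] = None,
        **kwargs: Any,
    ):
        if self.support_paged_attention:
            self.kvcache_num_blocks = kvcache_num_blocks
            self.kvcache_block_size = kvcache_block_size
        elif kvcache_num_blocks is not None or kvcache_block_size is not None:
            raise ValueError(
                "You cannot set kvcache_num_blocks or kvcache_block_size "
                "as paged attention is not supported for the model."
            )

class PegasusConfigSketch(Seq2SeqConfigSketch):
    support_paged_attention = True  # PEGASUS opts in, as in this release

PegasusConfigSketch(kvcache_block_size=512)      # accepted
try:
    Seq2SeqConfigSketch(kvcache_block_size=512)  # rejected: no opt-in
except ValueError as err:
    print(err)
```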