optimum-rbln 0.8.1rc0__py3-none-any.whl → 0.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +58 -9
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +24 -5
- optimum/rbln/diffusers/configurations/models/__init__.py +1 -1
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +5 -3
- optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +2 -2
- optimum/rbln/diffusers/configurations/models/{configuration_cosmos_transformer.py → configuration_transformer_cosmos.py} +7 -2
- optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +2 -2
- optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +3 -3
- optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +10 -6
- optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +4 -4
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +2 -2
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +2 -2
- optimum/rbln/diffusers/modeling_diffusers.py +4 -5
- optimum/rbln/diffusers/models/__init__.py +3 -13
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +1 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1 -0
- optimum/rbln/diffusers/models/autoencoders/vq_model.py +1 -0
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +1 -1
- optimum/rbln/diffusers/pipelines/__init__.py +1 -5
- optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +12 -4
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +4 -26
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +2 -2
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +2 -2
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -6
- optimum/rbln/modeling.py +4 -5
- optimum/rbln/modeling_base.py +18 -14
- optimum/rbln/ops/kv_cache_update.py +5 -0
- optimum/rbln/ops/linear.py +7 -0
- optimum/rbln/transformers/__init__.py +60 -0
- optimum/rbln/transformers/configuration_generic.py +4 -4
- optimum/rbln/transformers/modeling_attention_utils.py +252 -0
- optimum/rbln/transformers/modeling_generic.py +1 -4
- optimum/rbln/transformers/models/__init__.py +45 -30
- optimum/rbln/transformers/models/bart/bart_architecture.py +2 -7
- optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +2 -2
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +1 -5
- optimum/rbln/transformers/models/clip/configuration_clip.py +14 -3
- optimum/rbln/transformers/models/clip/modeling_clip.py +123 -28
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +1 -4
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +2 -2
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +2 -10
- optimum/rbln/transformers/models/decoderonly/__init__.py +2 -2
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +214 -45
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +323 -454
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +579 -362
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +17 -42
- optimum/rbln/transformers/models/gemma/__init__.py +2 -2
- optimum/rbln/transformers/models/gemma/configuration_gemma.py +9 -1
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +3 -44
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +22 -1
- optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +21 -9
- optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +9 -63
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +200 -292
- optimum/rbln/transformers/models/gpt2/__init__.py +2 -2
- optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +31 -3
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +19 -24
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +18 -1
- optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +2 -2
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -9
- optimum/rbln/transformers/models/llama/__init__.py +2 -2
- optimum/rbln/transformers/models/llama/configuration_llama.py +9 -1
- optimum/rbln/transformers/models/llama/modeling_llama.py +22 -1
- optimum/rbln/transformers/models/llava/__init__.py +16 -0
- optimum/rbln/transformers/models/llava/configuration_llava.py +54 -0
- optimum/rbln/transformers/models/llava/modeling_llava.py +419 -0
- optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +20 -3
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +6 -16
- optimum/rbln/transformers/models/midm/midm_architecture.py +14 -22
- optimum/rbln/transformers/models/mistral/__init__.py +2 -2
- optimum/rbln/transformers/models/mistral/configuration_mistral.py +9 -1
- optimum/rbln/transformers/models/mistral/mistral_architecture.py +1 -1
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +26 -3
- optimum/rbln/transformers/models/opt/__init__.py +2 -2
- optimum/rbln/transformers/models/opt/configuration_opt.py +8 -1
- optimum/rbln/transformers/models/opt/modeling_opt.py +41 -1
- optimum/rbln/transformers/models/opt/opt_architecture.py +16 -25
- optimum/rbln/transformers/models/pegasus/__init__.py +17 -0
- optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +34 -0
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +69 -0
- optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +161 -0
- optimum/rbln/transformers/models/phi/__init__.py +2 -2
- optimum/rbln/transformers/models/phi/configuration_phi.py +9 -1
- optimum/rbln/transformers/models/phi/modeling_phi.py +10 -1
- optimum/rbln/transformers/models/phi/phi_architecture.py +16 -22
- optimum/rbln/transformers/models/pixtral/__init__.py +16 -0
- optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +43 -0
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +315 -0
- optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +73 -0
- optimum/rbln/transformers/models/qwen2/__init__.py +2 -2
- optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +9 -1
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +27 -1
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +3 -3
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -15
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +1 -4
- optimum/rbln/transformers/models/qwen3/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +71 -0
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +133 -0
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +31 -0
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -12
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +3 -1
- optimum/rbln/transformers/models/siglip/__init__.py +2 -6
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +2 -2
- optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +2 -2
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -5
- optimum/rbln/transformers/models/whisper/configuration_whisper.py +3 -12
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +8 -2
- optimum/rbln/transformers/models/xlm_roberta/__init__.py +2 -8
- optimum/rbln/utils/depreacate_utils.py +16 -0
- optimum/rbln/utils/hub.py +8 -47
- optimum/rbln/utils/runtime_utils.py +31 -5
- {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/METADATA +1 -1
- {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/RECORD +120 -103
- {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/modeling_attention_utils.py
@@ -0,0 +1,252 @@
+import math
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+
+from optimum.rbln.transformers.models.decoderonly.configuration_decoderonly import (
+    RBLNDecoderOnlyModelForCausalLMConfig,
+)
+
+from ..utils.logging import get_logger
+
+
+logger = get_logger()
+
+if TYPE_CHECKING:
+    from rebel import RBLNCompiledModel
+    from transformers import PretrainedConfig
+
+
+DEFAULT_FLASH_ATTN_PARTITION_LENGTH = 16_384
+DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH = 32_768
+MIN_FLASH_ATTN_MAX_SEQ_LEN = 8_192
+MIN_FLASH_ATTN_PARTITION_LENGTH = 4_096
+MAX_FLASH_ATTN_PARTITION_LENGTH = 32_768
+MAX_SLIDING_WINDOW_SIZE = 32_768
+
+
+def set_default_values(
+    attn_impl: Optional[str] = None,
+    kvcache_partition_len: Optional[int] = None,
+    kvcache_block_size: Optional[int] = None,
+    max_seq_len: Optional[int] = None,
+) -> Tuple[str, int, int]:
+    if attn_impl is None:
+        attn_impl = "eager"
+
+    if kvcache_partition_len is not None:
+        if attn_impl == "eager":
+            attn_impl = "flash_attn"
+            logger.warning(
+                "A non-null `kvcache_partition_len` was provided, but `attn_impl` was not explicitly set or "
+                "was set to 'eager'. Since KV cache partitioning is only supported with flash attention, "
+                "`attn_impl` has been automatically switched to 'flash_attn'."
+            )
+
+    if kvcache_partition_len is None and attn_impl == "flash_attn":
+        kvcache_partition_len = DEFAULT_FLASH_ATTN_PARTITION_LENGTH
+
+    if kvcache_block_size is None:
+        if attn_impl == "eager":
+            kvcache_block_size = max_seq_len
+        else:
+            kvcache_block_size = kvcache_partition_len
+
+    return attn_impl, kvcache_partition_len, kvcache_block_size
+
+
+def validate_attention_method(attn_impl: str, kvcache_partition_len: int, kvcache_block_size: int, max_seq_len: int):
+    if attn_impl not in ["eager", "flash_attn"]:
+        raise ValueError(f"Unknown `attn_impl`: {attn_impl}. (Available: 'eager', 'flash_attn')")
+
+    ## Checking constraints...
+    # Constraint of eager attention:
+    # - `max_seq_len` <= 32k
+
+    # Constraints of flash attention:
+    # 1. `max_seq_len` should be a multiple of `partition_len`.
+    # 2. 4k <= `partition_len` <= 32k.
+    # 3. `max_seq_len` should be larger than 8k.
+    if attn_impl == "eager" and max_seq_len > DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH:
+        raise ValueError(
+            f"`max_seq_len` is set to {max_seq_len}, "
+            f"which exceeds the limit of {DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH} for 'eager' attention. "
+            f"Please reduce the `max_seq_len` to {DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH} or lower, "
+            "or consider switching `attn_impl` to 'flash_attn' for larger sequence lengths."
+        )
+
+    if attn_impl == "flash_attn":
+        if max_seq_len // kvcache_partition_len < 2 or max_seq_len % kvcache_partition_len != 0:
+            raise ValueError(
+                f"`max_seq_len` ({max_seq_len}) must be a multiple of `kvcache_partition_len` ({kvcache_partition_len}) "
+                f"when using 'flash_attn'. Please adjust either value to meet this requirement."
+            )
+        elif not (MIN_FLASH_ATTN_PARTITION_LENGTH <= kvcache_partition_len <= MAX_FLASH_ATTN_PARTITION_LENGTH):
+            raise ValueError(
+                f"`kvcache_partition_len` ({kvcache_partition_len}) is out of the supported range for 'flash_attn' "
+                f"({MIN_FLASH_ATTN_PARTITION_LENGTH} <= `kvcache_partition_len` <= {MAX_FLASH_ATTN_PARTITION_LENGTH}). "
+                f"Please provide a valid value within this range."
+            )
+        elif max_seq_len < MIN_FLASH_ATTN_MAX_SEQ_LEN:
+            raise ValueError(
+                f"`max_seq_len` ({max_seq_len}) is too small for 'flash_attn'. The minimum "
+                f"supported value is {MIN_FLASH_ATTN_MAX_SEQ_LEN}. Please increase `max_seq_len` to meet "
+                "this requirement, or consider switching `attn_impl` to 'eager' for shorter lengths."
+            )
+
+    if kvcache_block_size is not None:
+        if attn_impl == "flash_attn" and kvcache_partition_len != kvcache_block_size:
+            raise ValueError(
+                f"When using 'flash_attn', the `kvcache_block_size` ({kvcache_block_size}) "
+                f"must always be set equal to the `kvcache_partition_len` ({kvcache_partition_len})."
+            )
+        elif attn_impl == "eager" and kvcache_block_size != max_seq_len:
+            raise ValueError(
+                f"When using 'eager' attention, the `kvcache_block_size` ({kvcache_block_size}) "
+                f"must always be set equal to the `max_seq_len` ({max_seq_len})."
+            )
+
+
+def validate_sliding_window(rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
+    if rbln_config.sliding_window > MAX_SLIDING_WINDOW_SIZE - rbln_config.prefill_chunk_size:
+        raise ValueError(
+            f"Sliding window size ({rbln_config.sliding_window}) must be less than "
+            f"32768 - prefill_chunk_size ({32768 - rbln_config.prefill_chunk_size})"
+        )
+
+    if rbln_config.cache_impl == "sliding_window" and rbln_config.use_attention_mask:
+        raise ValueError("`use_attention_mask` must be set to False when `cache_impl` is set to 'sliding_window'.")
+
+
+class RBLNDecoderOnlyFlashAttentionMixin:
+    @classmethod
+    def get_maximum_num_blocks(
+        cls,
+        config: "PretrainedConfig",
+        tensor_parallel_size: int,
+        kvcache_block_size: int,
+        nbits_per_param: Optional[int] = None,
+        n_model_params: Optional[int] = None,
+        kernel_size: Optional[int] = None,
+        buffer: Optional[int] = None,
+        num_runtimes: int = 2,
+    ) -> int:
+        # We are finding max_n_blocks(x) that satisfies the following equation:
+
+        # available_dram - kernel_size - buffer
+        # - num_layers * 2 * tensor_parallel_size
+        #   * align_2MB(
+        #       x
+        #       * block_size
+        #       * align_64(head_dim)
+        #       * math.ceil(num_key_value_heads / tensor_parallel_size)
+        #       * 2
+        #   ) > 0
+
+        # This inequality can be rewritten as follows:
+
+        # a - c * align_2MB(b * x) > 0
+        # where
+        #   a = available_dram - kernel_size - buffer
+        #   b = block_size * align_64(head_dim) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
+        #   c = num_layers * 2 * tensor_parallel_size
+
+        # We can rewrite the inequality as follows:
+        #   k > align_2MB(b * x)
+        # where
+        #   k = a / c
+
+        # After that, we can derive the following equation:
+        #   x = floor(2**21 / b * floor((k - 1) / 2**21))
+
+        def align(x: int, nbytes: int) -> int:
+            return int(math.ceil(x / nbytes) * nbytes)
+
+        def align_2MB(x: int) -> int:
+            return align(x, 2**21)
+
+        num_attention_heads = getattr(config, "n_head", None) or getattr(config, "num_attention_heads")
+        num_layers = getattr(config, "n_layer", None) or getattr(config, "num_hidden_layers")
+        head_dim = getattr(config, "head_dim", None) or config.hidden_size // num_attention_heads
+        vocab_size = config.vocab_size
+        hidden_size = getattr(config, "n_embd", None) or getattr(config, "hidden_size")
+        num_key_value_heads = getattr(config, "num_key_value_heads", None) or num_attention_heads
+
+        # TODO(jongho): Update if target npu is REBEL.
+        ATOM_DRAM_NBYTES = 16 * 2**30
+        ATOM_SYS_DRAM_NBYTES = 288 * 2**20
+        available_dram = tensor_parallel_size * (ATOM_DRAM_NBYTES - ATOM_SYS_DRAM_NBYTES)
+
+        if kernel_size is None:
+            if n_model_params is None:
+                raise ValueError("`n_model_params` should be specified to estimate the kernel memory.")
+            # Get estimated kernel size (approximated)
+            lm_heads_params = align(vocab_size, 64) * hidden_size
+            lm_heads_nbytes = (
+                align_2MB(lm_heads_params * nbits_per_param // 8 / tensor_parallel_size) * tensor_parallel_size
+            )
+            params = n_model_params - lm_heads_params
+            layer_nbytes = (
+                align_2MB(params * nbits_per_param // 8 / num_layers / tensor_parallel_size)
+                * num_layers
+                * tensor_parallel_size
+            )
+            kernel_size = layer_nbytes + lm_heads_nbytes
+        elif n_model_params is not None:
+            raise ValueError("Both `n_model_params` and `kernel_size` cannot be specified.")
+
+        available_dram -= kernel_size
+
+        if buffer is None:
+            # TODO: Accurate buffer estimation
+            buffer_per_runtime_per_core = 2**28  # 256MB per runtime
+            buffer_per_core = buffer_per_runtime_per_core * num_runtimes  # 1 for prefill, 1 for decoder
+            buffer = buffer_per_core * tensor_parallel_size
+        available_dram -= buffer
+
+        b = kvcache_block_size * align(head_dim, 64) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
+        c = num_layers * 2 * tensor_parallel_size
+        k = available_dram / c
+        max_n_blocks = math.floor(2**21 / b * math.floor((k - 1) / 2**21))
+
+        return max_n_blocks
+
+    @classmethod
+    def maybe_suggest_kvcache_num_blocks(
+        cls,
+        compiled_models: Dict[str, "RBLNCompiledModel"],
+        model_config: "PretrainedConfig",
+        rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
+    ) -> None:
+        # Get the actual memory allocation of each node by key
+        alloc_memory_per_node_by_key: Dict[str, List[int]] = compiled_models["prefill"].get_alloc_per_node_by_key()
+        alloc_memory_by_key: Dict[str, int] = {
+            key: sum(memory_per_node) for key, memory_per_node in alloc_memory_per_node_by_key.items()
+        }
+        for batch_size in rbln_config.decoder_batch_sizes:
+            for key, memory_per_node in (
+                compiled_models[f"decoder_batch_{batch_size}"].get_alloc_per_node_by_key().items()
+            ):
+                alloc_memory_by_key[key] += sum(memory_per_node)
+        alloc_memory_by_key.pop("PortRecur", None)  # Old compiler's kv-cache key
+        alloc_memory_by_key.pop("DramTensor", None)  # kv-cache
+        kernel_size = alloc_memory_by_key.pop("Kernel")  # model weight
+
+        # Get the maximum number of blocks that can be allocated
+        buffer = sum(alloc_memory_by_key.values())
+        max_num_blocks = cls.get_maximum_num_blocks(
+            config=model_config,
+            tensor_parallel_size=rbln_config.tensor_parallel_size,
+            kvcache_block_size=rbln_config.kvcache_block_size,
+            kernel_size=kernel_size,
+            buffer=buffer,
+        )
+
+        # Since our estimation logic is not always accurate,
+        # users can set `kvcache_num_blocks` to `max_num_blocks`.
+        # If the memory is not enough, the model will fail to compile.
+        if rbln_config.kvcache_num_blocks < max_num_blocks:
+            logger.warning(
+                f"Current `kvcache_num_blocks` setting is {rbln_config.kvcache_num_blocks}. "
+                "Our analysis indicates that additional memory is available for more blocks. "
+                f"Consider increasing `kvcache_num_blocks` to {max_num_blocks} for potentially improved performance. "
+                "Please be advised that our memory estimation algorithm has limitations, "
+                "and increasing this value may not guarantee successful model compilation."
+            )
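The following usage sketch is not part of the diff. It shows how the two validation helpers above compose; the import path is inferred from the file list, the numeric values simply exercise the documented constraints, and the last snippet sanity-checks the closed-form block-count formula derived in the comments of get_maximum_num_blocks.

import math

from optimum.rbln.transformers.modeling_attention_utils import (
    set_default_values,
    validate_attention_method,
)

# Flash attention with no explicit partitioning falls back to the defaults:
# partition length 16_384, and a block size equal to the partition length.
attn_impl, partition_len, block_size = set_default_values(attn_impl="flash_attn", max_seq_len=65_536)
assert (attn_impl, partition_len, block_size) == ("flash_attn", 16_384, 16_384)

# 65_536 is a multiple of 16_384 (and at least 2x), the partition lies in
# [4_096, 32_768], and max_seq_len >= 8_192, so this passes silently.
validate_attention_method(attn_impl, partition_len, block_size, max_seq_len=65_536)

# 'eager' attention is capped at 32_768 tokens, so this raises ValueError.
try:
    validate_attention_method("eager", None, 40_000, max_seq_len=40_000)
except ValueError as exc:
    print(exc)

# Sanity check of the derivation in get_maximum_num_blocks: the largest x with
# k > align_2MB(b * x) is x = floor(2**21 / b * floor((k - 1) / 2**21)).
def align_2mb(n: int) -> int:
    return math.ceil(n / 2**21) * 2**21

b, k = 2**20, 10 * 2**30  # arbitrary per-block bytes and per-layer DRAM budget
x = math.floor(2**21 / b * math.floor((k - 1) / 2**21))
assert k > align_2mb(b * x) and k <= align_2mb(b * (x + 1))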
optimum/rbln/transformers/modeling_generic.py
@@ -34,10 +34,7 @@ from transformers import (
     AutoModelForTextEncoding,
     PretrainedConfig,
 )
-from transformers.modeling_outputs import (
-    BaseModelOutput,
-    QuestionAnsweringModelOutput,
-)
+from transformers.modeling_outputs import BaseModelOutput, QuestionAnsweringModelOutput
 
 from ..configuration_utils import RBLNCompileConfig
 from ..modeling import RBLNModel
optimum/rbln/transformers/__init__.py
@@ -84,6 +84,8 @@ _import_structure = {
         "RBLNQwen2_5_VLForConditionalGenerationConfig",
     ],
     "decoderonly": [
+        "RBLNDecoderOnlyModelConfig",
+        "RBLNDecoderOnlyModel",
         "RBLNDecoderOnlyModelForCausalLM",
         "RBLNDecoderOnlyModelForCausalLMConfig",
     ],
@@ -92,27 +94,41 @@ _import_structure = {
         "RBLNDPTForDepthEstimationConfig",
     ],
     "exaone": ["RBLNExaoneForCausalLM", "RBLNExaoneForCausalLMConfig"],
-    "gemma": ["RBLNGemmaForCausalLM", "RBLNGemmaForCausalLMConfig"],
+    "gemma": ["RBLNGemmaForCausalLM", "RBLNGemmaForCausalLMConfig", "RBLNGemmaModel", "RBLNGemmaModelConfig"],
     "gemma3": [
         "RBLNGemma3ForCausalLM",
         "RBLNGemma3ForCausalLMConfig",
         "RBLNGemma3ForConditionalGeneration",
         "RBLNGemma3ForConditionalGenerationConfig",
     ],
-    "gpt2": ["RBLNGPT2LMHeadModel", "RBLNGPT2LMHeadModelConfig"],
+    "gpt2": ["RBLNGPT2LMHeadModel", "RBLNGPT2LMHeadModelConfig", "RBLNGPT2Model", "RBLNGPT2ModelConfig"],
     "idefics3": [
         "RBLNIdefics3VisionTransformer",
         "RBLNIdefics3ForConditionalGeneration",
         "RBLNIdefics3ForConditionalGenerationConfig",
         "RBLNIdefics3VisionTransformerConfig",
     ],
-    "llama": ["RBLNLlamaForCausalLM", "RBLNLlamaForCausalLMConfig"],
-    "opt": ["RBLNOPTForCausalLM", "RBLNOPTForCausalLMConfig"],
+    "llava": ["RBLNLlavaForConditionalGeneration", "RBLNLlavaForConditionalGenerationConfig"],
+    "llama": ["RBLNLlamaForCausalLM", "RBLNLlamaForCausalLMConfig", "RBLNLlamaModel", "RBLNLlamaModelConfig"],
+    "opt": ["RBLNOPTForCausalLM", "RBLNOPTForCausalLMConfig", "RBLNOPTModel", "RBLNOPTModelConfig"],
+    "pegasus": [
+        "RBLNPegasusForConditionalGeneration",
+        "RBLNPegasusModel",
+        "RBLNPegasusForConditionalGenerationConfig",
+        "RBLNPegasusModelConfig",
+    ],
     "llava_next": ["RBLNLlavaNextForConditionalGeneration", "RBLNLlavaNextForConditionalGenerationConfig"],
     "midm": ["RBLNMidmLMHeadModel", "RBLNMidmLMHeadModelConfig"],
-    "mistral": ["RBLNMistralForCausalLM", "RBLNMistralForCausalLMConfig"],
-    "phi": ["RBLNPhiForCausalLM", "RBLNPhiForCausalLMConfig"],
-    "qwen2": ["RBLNQwen2ForCausalLM", "RBLNQwen2ForCausalLMConfig"],
+    "pixtral": ["RBLNPixtralVisionModel", "RBLNPixtralVisionModelConfig"],
+    "mistral": [
+        "RBLNMistralForCausalLM",
+        "RBLNMistralForCausalLMConfig",
+        "RBLNMistralModel",
+        "RBLNMistralModelConfig",
+    ],
+    "phi": ["RBLNPhiForCausalLM", "RBLNPhiForCausalLMConfig", "RBLNPhiModel", "RBLNPhiModelConfig"],
+    "qwen2": ["RBLNQwen2ForCausalLM", "RBLNQwen2ForCausalLMConfig", "RBLNQwen2Model", "RBLNQwen2ModelConfig"],
+    "qwen3": ["RBLNQwen3ForCausalLM", "RBLNQwen3ForCausalLMConfig", "RBLNQwen3Model", "RBLNQwen3ModelConfig"],
     "resnet": ["RBLNResNetForImageClassification", "RBLNResNetForImageClassificationConfig"],
     "roberta": [
         "RBLNRobertaForMaskedLM",
@@ -146,10 +162,7 @@ _import_structure = {
 }
 
 if TYPE_CHECKING:
-    from .audio_spectrogram_transformer import (
-        RBLNASTForAudioClassification,
-        RBLNASTForAudioClassificationConfig,
-    )
+    from .audio_spectrogram_transformer import RBLNASTForAudioClassification, RBLNASTForAudioClassificationConfig
     from .auto import (
         RBLNAutoModel,
         RBLNAutoModelForAudioClassification,
@@ -197,50 +210,52 @@ if TYPE_CHECKING:
         RBLNCLIPVisionModelWithProjection,
         RBLNCLIPVisionModelWithProjectionConfig,
     )
-    from .colpali import (
-        RBLNColPaliForRetrieval,
-        RBLNColPaliForRetrievalConfig,
-    )
+    from .colpali import RBLNColPaliForRetrieval, RBLNColPaliForRetrievalConfig
     from .decoderonly import (
+        RBLNDecoderOnlyModel,
+        RBLNDecoderOnlyModelConfig,
         RBLNDecoderOnlyModelForCausalLM,
         RBLNDecoderOnlyModelForCausalLMConfig,
     )
-    from .distilbert import (
-        RBLNDistilBertForQuestionAnswering,
-        RBLNDistilBertForQuestionAnsweringConfig,
-    )
-    from .dpt import (
-        RBLNDPTForDepthEstimation,
-        RBLNDPTForDepthEstimationConfig,
-    )
+    from .distilbert import RBLNDistilBertForQuestionAnswering, RBLNDistilBertForQuestionAnsweringConfig
+    from .dpt import RBLNDPTForDepthEstimation, RBLNDPTForDepthEstimationConfig
     from .exaone import RBLNExaoneForCausalLM, RBLNExaoneForCausalLMConfig
-    from .gemma import RBLNGemmaForCausalLM, RBLNGemmaForCausalLMConfig
+    from .gemma import RBLNGemmaForCausalLM, RBLNGemmaForCausalLMConfig, RBLNGemmaModel, RBLNGemmaModelConfig
     from .gemma3 import (
         RBLNGemma3ForCausalLM,
         RBLNGemma3ForCausalLMConfig,
         RBLNGemma3ForConditionalGeneration,
         RBLNGemma3ForConditionalGenerationConfig,
     )
-    from .gpt2 import RBLNGPT2LMHeadModel, RBLNGPT2LMHeadModelConfig
+    from .gpt2 import RBLNGPT2LMHeadModel, RBLNGPT2LMHeadModelConfig, RBLNGPT2Model, RBLNGPT2ModelConfig
     from .idefics3 import (
         RBLNIdefics3ForConditionalGeneration,
         RBLNIdefics3ForConditionalGenerationConfig,
         RBLNIdefics3VisionTransformer,
         RBLNIdefics3VisionTransformerConfig,
     )
-    from .llama import RBLNLlamaForCausalLM, RBLNLlamaForCausalLMConfig
+    from .llama import RBLNLlamaForCausalLM, RBLNLlamaForCausalLMConfig, RBLNLlamaModel, RBLNLlamaModelConfig
+    from .llava import RBLNLlavaForConditionalGeneration, RBLNLlavaForConditionalGenerationConfig
     from .llava_next import RBLNLlavaNextForConditionalGeneration, RBLNLlavaNextForConditionalGenerationConfig
     from .midm import RBLNMidmLMHeadModel, RBLNMidmLMHeadModelConfig
-    from .mistral import RBLNMistralForCausalLM, RBLNMistralForCausalLMConfig
-    from .opt import RBLNOPTForCausalLM, RBLNOPTForCausalLMConfig
-    from .phi import RBLNPhiForCausalLM, RBLNPhiForCausalLMConfig
-    from .qwen2 import RBLNQwen2ForCausalLM, RBLNQwen2ForCausalLMConfig
+    from .mistral import RBLNMistralForCausalLM, RBLNMistralForCausalLMConfig, RBLNMistralModel, RBLNMistralModelConfig
+    from .opt import RBLNOPTForCausalLM, RBLNOPTForCausalLMConfig, RBLNOPTModel, RBLNOPTModelConfig
+    from .pegasus import (
+        RBLNPegasusForConditionalGeneration,
+        RBLNPegasusForConditionalGenerationConfig,
+        RBLNPegasusModel,
+        RBLNPegasusModelConfig,
+    )
+    from .phi import RBLNPhiForCausalLM, RBLNPhiForCausalLMConfig, RBLNPhiModel, RBLNPhiModelConfig
+    from .pixtral import RBLNPixtralVisionModel, RBLNPixtralVisionModelConfig
+    from .qwen2 import RBLNQwen2ForCausalLM, RBLNQwen2ForCausalLMConfig, RBLNQwen2Model, RBLNQwen2ModelConfig
     from .qwen2_5_vl import (
         RBLNQwen2_5_VisionTransformerPretrainedModel,
         RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
         RBLNQwen2_5_VLForConditionalGeneration,
         RBLNQwen2_5_VLForConditionalGenerationConfig,
     )
+    from .qwen3 import RBLNQwen3ForCausalLM, RBLNQwen3ForCausalLMConfig, RBLNQwen3Model, RBLNQwen3ModelConfig
     from .resnet import RBLNResNetForImageClassification, RBLNResNetForImageClassificationConfig
     from .roberta import (
         RBLNRobertaForMaskedLM,
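The backbone classes registered above (RBLNDecoderOnlyModel and the per-architecture *Model/*ModelConfig pairs) suggest usage along the following lines. This is a hedged sketch, not documentation from the diff: the from_pretrained(..., export=True) flow mirrors optimum-rbln's existing ForCausalLM classes, and the config kwargs shown (max_seq_len, tensor_parallel_size) are assumptions carried over from the causal-LM configs.

from optimum.rbln import RBLNLlamaModel, RBLNLlamaModelConfig

# Compile the bare Llama backbone (hidden states, no LM head) for RBLN NPUs.
model = RBLNLlamaModel.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # example checkpoint
    export=True,                 # compile from the PyTorch weights
    rbln_config=RBLNLlamaModelConfig(max_seq_len=8_192, tensor_parallel_size=4),  # assumed kwargs
)
model.save_pretrained("llama-2-7b-rbln")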
optimum/rbln/transformers/models/bart/bart_architecture.py
@@ -16,9 +16,7 @@ from typing import Tuple
 
 import torch
 from torch import nn
-from transformers.modeling_attn_mask_utils import (
-    _prepare_4d_attention_mask,
-)
+from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
 from transformers.utils import logging
 
 from ..seq2seq.seq2seq_architecture import (
@@ -56,10 +54,7 @@ class BartDecoderWrapper(Seq2SeqDecoderWrapper):
 
 
 class BartForConditionalGeneration(Seq2SeqForConditionalGeneration):
-
-
-    def __post_init__(self):
-        self.scaling = self.config.d_model**-0.5
+    pass
 
 
 class BartDecoder(Seq2SeqDecoder):
optimum/rbln/transformers/models/blip_2/configuration_blip_2.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 
@@ -62,7 +62,7 @@ class RBLNBlip2ForConditionalGenerationConfig(RBLNModelConfig):
         vision_model: Optional[RBLNModelConfig] = None,
         qformer: Optional[RBLNModelConfig] = None,
         language_model: Optional[RBLNModelConfig] = None,
-        **kwargs,
+        **kwargs: Any,
     ):
         """
         Args:
optimum/rbln/transformers/models/blip_2/modeling_blip_2.py
@@ -35,11 +35,7 @@ from ....modeling import RBLNModel
 logger = logging.get_logger(__name__)
 
 if TYPE_CHECKING:
-    from transformers import (
-        AutoFeatureExtractor,
-        AutoProcessor,
-        AutoTokenizer,
-    )
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer
 
 
 class LoopProjector:
optimum/rbln/transformers/models/clip/configuration_clip.py
@@ -12,13 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any,
+from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 
 
 class RBLNCLIPTextModelConfig(RBLNModelConfig):
-    def __init__(self, batch_size: Optional[int] = None, **kwargs):
+    def __init__(self, batch_size: Optional[int] = None, **kwargs: Any):
         """
         Args:
             batch_size (Optional[int]): The batch size for text processing. Defaults to 1.
@@ -43,7 +43,15 @@ class RBLNCLIPTextModelWithProjectionConfig(RBLNCLIPTextModelConfig):
 
 
 class RBLNCLIPVisionModelConfig(RBLNModelConfig):
-    def __init__(
+    def __init__(
+        self,
+        batch_size: Optional[int] = None,
+        image_size: Optional[int] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        **kwargs: Any,
+    ):
         """
         Args:
             batch_size (Optional[int]): The batch size for image processing. Defaults to 1.
@@ -60,6 +68,9 @@ class RBLNCLIPVisionModelConfig(RBLNModelConfig):
             raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
 
         self.image_size = image_size
+        self.interpolate_pos_encoding = interpolate_pos_encoding or False
+        self.output_hidden_states = output_hidden_states
+        self.output_attentions = output_attentions
 
     @property
     def image_width(self):