optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.4a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +12 -0
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +16 -6
- optimum/rbln/diffusers/__init__.py +12 -0
- optimum/rbln/diffusers/configurations/__init__.py +3 -0
- optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
- optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
- optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
- optimum/rbln/diffusers/modeling_diffusers.py +1 -1
- optimum/rbln/diffusers/models/__init__.py +17 -3
- optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
- optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
- optimum/rbln/diffusers/models/controlnet.py +17 -2
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +16 -2
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +16 -1
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +14 -1
- optimum/rbln/diffusers/models/unets/__init__.py +1 -0
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +18 -2
- optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
- optimum/rbln/diffusers/pipelines/__init__.py +4 -0
- optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -2
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +13 -4
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +13 -4
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +13 -4
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -4
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +1 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
- optimum/rbln/modeling.py +20 -45
- optimum/rbln/modeling_base.py +12 -8
- optimum/rbln/transformers/configuration_generic.py +0 -27
- optimum/rbln/transformers/modeling_attention_utils.py +242 -109
- optimum/rbln/transformers/modeling_generic.py +2 -61
- optimum/rbln/transformers/modeling_outputs.py +1 -0
- optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
- optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
- optimum/rbln/transformers/models/auto/auto_factory.py +1 -0
- optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
- optimum/rbln/transformers/models/bert/modeling_bert.py +86 -1
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +42 -15
- optimum/rbln/transformers/models/clip/modeling_clip.py +40 -2
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +2 -2
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +6 -45
- optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +0 -2
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +10 -1
- optimum/rbln/transformers/models/decoderonly/configuration_lora.py +1 -1
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +92 -43
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +207 -64
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +17 -9
- optimum/rbln/transformers/models/decoderonly/lora_architecture.py +1 -1
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +140 -46
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +17 -0
- optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
- optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +7 -1
- optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +42 -70
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +46 -31
- optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +1 -1
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +24 -9
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -5
- optimum/rbln/transformers/models/llava/modeling_llava.py +37 -25
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +3 -5
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -22
- optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +13 -1
- optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +2 -2
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -28
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +8 -9
- optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -7
- optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +1 -1
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +0 -20
- optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
- optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
- optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -4
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +36 -12
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +17 -1
- optimum/rbln/transformers/models/swin/modeling_swin.py +17 -4
- optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
- optimum/rbln/transformers/models/t5/t5_architecture.py +1 -1
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +25 -10
- optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
- optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +60 -8
- optimum/rbln/transformers/models/whisper/generation_whisper.py +48 -14
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +53 -0
- optimum/rbln/transformers/utils/rbln_quantization.py +9 -0
- optimum/rbln/utils/deprecation.py +213 -0
- optimum/rbln/utils/hub.py +14 -3
- optimum/rbln/utils/import_utils.py +7 -1
- optimum/rbln/utils/runtime_utils.py +32 -0
- optimum/rbln/utils/submodule.py +3 -1
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/METADATA +2 -2
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/RECORD +106 -99
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/WHEEL +1 -1
- optimum/rbln/utils/depreacate_utils.py +0 -16
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/entry_points.txt +0 -0
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/modeling_base.py
CHANGED
|
@@ -15,7 +15,6 @@
|
|
|
15
15
|
import importlib
|
|
16
16
|
import os
|
|
17
17
|
import shutil
|
|
18
|
-
from abc import ABC
|
|
19
18
|
from pathlib import Path
|
|
20
19
|
from tempfile import TemporaryDirectory
|
|
21
20
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
|
|
@@ -39,7 +38,7 @@ if TYPE_CHECKING:
|
|
|
39
38
|
logger = get_logger(__name__)
|
|
40
39
|
|
|
41
40
|
|
|
42
|
-
class PreTrainedModel
|
|
41
|
+
class PreTrainedModel: # noqa: F811
|
|
43
42
|
pass
|
|
44
43
|
|
|
45
44
|
|
|
@@ -63,7 +62,7 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
|
|
|
63
62
|
model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
|
|
64
63
|
subfolder: str = "",
|
|
65
64
|
rbln_compiled_models: Optional[rebel.RBLNCompiledModel] = None,
|
|
66
|
-
rbln_submodules: List["RBLNBaseModel"] =
|
|
65
|
+
rbln_submodules: Optional[List["RBLNBaseModel"]] = None,
|
|
67
66
|
**kwargs,
|
|
68
67
|
):
|
|
69
68
|
self.model = models
|
|
@@ -71,7 +70,6 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
|
|
|
71
70
|
self.rbln_config = rbln_config
|
|
72
71
|
if not rbln_config.is_frozen():
|
|
73
72
|
raise RuntimeError("`rbln_config` must be frozen. Please call `rbln_config.freeze()` first.")
|
|
74
|
-
|
|
75
73
|
self.compiled_models = rbln_compiled_models
|
|
76
74
|
|
|
77
75
|
# Registers the RBLN classes into the transformers AutoModel classes to avoid warnings when creating
|
|
@@ -107,6 +105,8 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
|
|
|
107
105
|
self.model_save_dir = model_save_dir
|
|
108
106
|
self.subfolder = subfolder
|
|
109
107
|
|
|
108
|
+
if rbln_submodules is None:
|
|
109
|
+
rbln_submodules = []
|
|
110
110
|
self.rbln_submodules = rbln_submodules
|
|
111
111
|
self.__post_init__(**kwargs)
|
|
112
112
|
|
|
@@ -182,7 +182,7 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
|
|
|
182
182
|
# passed from compile function
|
|
183
183
|
rbln_config: Optional[RBLNModelConfig] = None,
|
|
184
184
|
rbln_compiled_models: Optional[Dict[str, rebel.RBLNCompiledModel]] = None,
|
|
185
|
-
rbln_submodules: List["RBLNBaseModel"] =
|
|
185
|
+
rbln_submodules: Optional[List["RBLNBaseModel"]] = None,
|
|
186
186
|
**kwargs,
|
|
187
187
|
) -> "RBLNBaseModel":
|
|
188
188
|
if rbln_compiled_models is None:
|
|
@@ -218,8 +218,9 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
|
|
|
218
218
|
)
|
|
219
219
|
|
|
220
220
|
if len(cls._rbln_submodules) > 0:
|
|
221
|
-
rbln_submodules
|
|
222
|
-
|
|
221
|
+
if rbln_submodules is None:
|
|
222
|
+
rbln_submodules = cls._load_submodules(model_save_dir=model_id, rbln_config=rbln_config, **kwargs)
|
|
223
|
+
elif rbln_submodules is None:
|
|
223
224
|
rbln_submodules = []
|
|
224
225
|
|
|
225
226
|
rbln_config.freeze()
|
|
@@ -280,9 +281,12 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
|
|
|
280
281
|
config: "PretrainedConfig",
|
|
281
282
|
model_save_dir: Union[Path, str],
|
|
282
283
|
subfolder: Union[Path, str],
|
|
283
|
-
rbln_submodules: List["RBLNBaseModel"] =
|
|
284
|
+
rbln_submodules: Optional[List["RBLNBaseModel"]] = None,
|
|
284
285
|
**kwargs,
|
|
285
286
|
):
|
|
287
|
+
if rbln_submodules is None:
|
|
288
|
+
rbln_submodules = []
|
|
289
|
+
|
|
286
290
|
if isinstance(model_save_dir, str):
|
|
287
291
|
model_save_dir = Path(model_save_dir)
|
|
288
292
|
|
|
@@ -118,30 +118,3 @@ class RBLNModelForImageClassificationConfig(RBLNImageModelConfig):
|
|
|
118
118
|
|
|
119
119
|
class RBLNModelForDepthEstimationConfig(RBLNImageModelConfig):
|
|
120
120
|
pass
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
class RBLNModelForAudioClassificationConfig(RBLNModelConfig):
|
|
124
|
-
def __init__(
|
|
125
|
-
self,
|
|
126
|
-
batch_size: Optional[int] = None,
|
|
127
|
-
max_length: Optional[int] = None,
|
|
128
|
-
num_mel_bins: Optional[int] = None,
|
|
129
|
-
**kwargs: Any,
|
|
130
|
-
):
|
|
131
|
-
"""
|
|
132
|
-
Args:
|
|
133
|
-
batch_size (Optional[int]): The batch size for inference. Defaults to 1.
|
|
134
|
-
max_length (Optional[int]): Maximum length of the audio input in time dimension.
|
|
135
|
-
num_mel_bins (Optional[int]): Number of Mel frequency bins for audio processing.
|
|
136
|
-
kwargs: Additional arguments passed to the parent RBLNModelConfig.
|
|
137
|
-
|
|
138
|
-
Raises:
|
|
139
|
-
ValueError: If batch_size is not a positive integer.
|
|
140
|
-
"""
|
|
141
|
-
super().__init__(**kwargs)
|
|
142
|
-
self.batch_size = batch_size or 1
|
|
143
|
-
if not isinstance(self.batch_size, int) or self.batch_size < 0:
|
|
144
|
-
raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
|
|
145
|
-
|
|
146
|
-
self.max_length = max_length
|
|
147
|
-
self.num_mel_bins = num_mel_bins
|
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
import math
|
|
2
|
-
from
|
|
2
|
+
from collections import Counter, defaultdict
|
|
3
|
+
from typing import TYPE_CHECKING, Dict, Optional, Tuple
|
|
3
4
|
|
|
4
|
-
|
|
5
|
-
RBLNDecoderOnlyModelForCausalLMConfig,
|
|
6
|
-
)
|
|
5
|
+
import rebel
|
|
7
6
|
|
|
8
7
|
from ..utils.logging import get_logger
|
|
8
|
+
from ..utils.runtime_utils import get_available_dram
|
|
9
|
+
from .models.decoderonly.configuration_decoderonly import RBLNDecoderOnlyModelForCausalLMConfig
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
logger = get_logger()
|
|
12
13
|
|
|
13
14
|
if TYPE_CHECKING:
|
|
14
|
-
from
|
|
15
|
-
from transformers import PretrainedConfig
|
|
15
|
+
from transformers import PretrainedConfig, PreTrainedModel
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
DEFAULT_FLASH_ATTN_PARTITION_LENGTH = 16_384
|
|
@@ -115,128 +115,261 @@ def validate_sliding_window(rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
|
|
|
115
115
|
raise ValueError("`use_attention_mask` must be set to False when `cache_impl` is set to 'sliding_window'.")
|
|
116
116
|
|
|
117
117
|
|
|
118
|
+
def align(x: int, nbytes: int) -> int:
|
|
119
|
+
return int(math.ceil(x / nbytes) * nbytes)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def align_2MB(x: int) -> int:
|
|
123
|
+
return align(x, 2**21)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def get_alloc_memory_by_key(compiled_models: Dict[str, "rebel.RBLNCompiledModel"]) -> Dict[str, int]:
|
|
127
|
+
alloc_memory_by_key = defaultdict(int)
|
|
128
|
+
# Get the actual memory allocation of each node by key
|
|
129
|
+
for compiled_model in compiled_models.values():
|
|
130
|
+
alloc_per_node_by_key = compiled_model.get_alloc_per_node_by_key()
|
|
131
|
+
for key, memory_per_node in alloc_per_node_by_key.items():
|
|
132
|
+
alloc_memory_by_key[key] += sum(memory_per_node)
|
|
133
|
+
|
|
134
|
+
return alloc_memory_by_key
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def format_byte_size(nbytes: int) -> str:
|
|
138
|
+
if nbytes < 1024:
|
|
139
|
+
return f"{nbytes} B"
|
|
140
|
+
elif nbytes < 1024**2:
|
|
141
|
+
return f"{nbytes / 1024:.2f} KB"
|
|
142
|
+
elif nbytes < 1024**3:
|
|
143
|
+
return f"{nbytes / 1024**2:.2f} MB"
|
|
144
|
+
else:
|
|
145
|
+
return f"{nbytes / 1024**3:.2f} GB"
|
|
146
|
+
|
|
147
|
+
|
|
118
148
|
class RBLNDecoderOnlyFlashAttentionMixin:
|
|
119
149
|
@classmethod
|
|
120
|
-
def
|
|
150
|
+
def get_maximum_num_blocks_by_model(
|
|
121
151
|
cls,
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
nbits_per_param: Optional[int] = None,
|
|
126
|
-
n_model_params: Optional[int] = None,
|
|
127
|
-
kernel_size: Optional[int] = None,
|
|
128
|
-
buffer: Optional[int] = None,
|
|
129
|
-
num_runtimes: int = 2,
|
|
152
|
+
model: "PreTrainedModel",
|
|
153
|
+
model_config: "PretrainedConfig",
|
|
154
|
+
rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
|
|
130
155
|
) -> int:
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
# a - c * align_2MB(b * x) > 0
|
|
146
|
-
# where
|
|
147
|
-
# a = available_dram - kernel_size - buffer
|
|
148
|
-
# b = block_size * align_64(head_dim) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
|
|
149
|
-
# c = num_layers * 2 * tensor_parallel_size
|
|
150
|
-
|
|
151
|
-
# We can rewrite the inequality as follows:
|
|
152
|
-
# k > align_2MB(b*x)
|
|
153
|
-
# where
|
|
154
|
-
# k = a / c
|
|
155
|
-
|
|
156
|
-
# After that, we can derive the following equation:
|
|
157
|
-
# x = floor(2**21 / b * floor((k - 1) / 2**21))
|
|
158
|
-
|
|
159
|
-
def align(x: int, nbytes: int) -> int:
|
|
160
|
-
return int(math.ceil(x / nbytes) * nbytes)
|
|
161
|
-
|
|
162
|
-
def align_2MB(x: int) -> int:
|
|
163
|
-
return align(x, 2**21)
|
|
164
|
-
|
|
165
|
-
num_attention_heads = getattr(config, "n_head", None) or getattr(config, "num_attention_heads")
|
|
166
|
-
num_layers = getattr(config, "n_layer", None) or getattr(config, "num_hidden_layers")
|
|
167
|
-
head_dim = getattr(config, "head_dim", None) or config.hidden_size // num_attention_heads
|
|
168
|
-
vocab_size = config.vocab_size
|
|
169
|
-
hidden_size = getattr(config, "n_embd", None) or getattr(config, "hidden_size")
|
|
170
|
-
num_key_value_heads = getattr(config, "num_key_value_heads", None) or num_attention_heads
|
|
171
|
-
|
|
172
|
-
# TODO(jongho): Update if target npu is REBEL.
|
|
173
|
-
ATOM_DRAM_NBYTES = 16 * 2**30
|
|
174
|
-
ATOM_SYS_DRAM_NBYTES = 288 * 2**20
|
|
175
|
-
available_dram = tensor_parallel_size * (ATOM_DRAM_NBYTES - ATOM_SYS_DRAM_NBYTES)
|
|
176
|
-
|
|
177
|
-
if kernel_size is None:
|
|
178
|
-
if n_model_params is None:
|
|
179
|
-
raise ValueError("`n_model_params` should be specified to estimate the kernel memory.")
|
|
180
|
-
# Get estimated kernel size (approximated)
|
|
181
|
-
lm_heads_params = align(vocab_size, 64) * hidden_size
|
|
182
|
-
lm_heads_nbytes = (
|
|
183
|
-
align_2MB(lm_heads_params * nbits_per_param // 8 / tensor_parallel_size) * tensor_parallel_size
|
|
156
|
+
tensor_parallel_size = rbln_config.tensor_parallel_size or 1
|
|
157
|
+
available_dram = get_available_dram(rbln_config.npu) * tensor_parallel_size
|
|
158
|
+
|
|
159
|
+
kernel_memory = cls._get_kernel_memory(model, model_config=model_config, rbln_config=rbln_config)
|
|
160
|
+
buffer = cls._get_buffer(rbln_config)
|
|
161
|
+
|
|
162
|
+
remaining_dram = available_dram - kernel_memory - buffer
|
|
163
|
+
if remaining_dram <= 0:
|
|
164
|
+
raise ValueError(
|
|
165
|
+
"Insufficient available DRAM after accounting for kernel memory and buffer. "
|
|
166
|
+
"Cannot allocate any KV cache blocks."
|
|
167
|
+
f" (Available DRAM: {format_byte_size(available_dram)}, "
|
|
168
|
+
f"Kernel Memory: {format_byte_size(kernel_memory)}, "
|
|
169
|
+
f"Buffer: {format_byte_size(buffer)})"
|
|
184
170
|
)
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
171
|
+
estimated_num_blocks = cls._estimate_num_blocks(
|
|
172
|
+
remaining_dram, model_config=model_config, rbln_config=rbln_config
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
return estimated_num_blocks
|
|
176
|
+
|
|
177
|
+
@classmethod
|
|
178
|
+
def _get_kernel_memory(
|
|
179
|
+
cls,
|
|
180
|
+
model: "PreTrainedModel",
|
|
181
|
+
model_config: "PretrainedConfig",
|
|
182
|
+
rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
|
|
183
|
+
) -> int:
|
|
184
|
+
if model.get_output_embeddings() is None:
|
|
185
|
+
lm_head_nbytes = 0
|
|
186
|
+
else:
|
|
187
|
+
lm_head_nbytes = cls._get_lm_head_memory(model_config, rbln_config)
|
|
188
|
+
|
|
189
|
+
layer_nbytes = cls._get_layer_memory(model, model_config, rbln_config)
|
|
190
|
+
return lm_head_nbytes + layer_nbytes
|
|
191
|
+
|
|
192
|
+
@classmethod
|
|
193
|
+
def _get_lm_head_memory(
|
|
194
|
+
cls, model_config: "PretrainedConfig", rbln_config: RBLNDecoderOnlyModelForCausalLMConfig
|
|
195
|
+
) -> int:
|
|
196
|
+
tensor_parallel_size = rbln_config.tensor_parallel_size or 1
|
|
197
|
+
vocab_size = model_config.vocab_size
|
|
198
|
+
hidden_size = getattr(model_config, "n_embd", None) or model_config.hidden_size
|
|
199
|
+
lm_head_params = align(vocab_size, 64) * hidden_size
|
|
200
|
+
|
|
201
|
+
nbytes_per_param = 2 # Assuming lm_head is always not quantized
|
|
202
|
+
lm_head_memory_in_bytes = (
|
|
203
|
+
align_2MB(lm_head_params * nbytes_per_param / tensor_parallel_size) * tensor_parallel_size
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
return lm_head_memory_in_bytes
|
|
207
|
+
|
|
208
|
+
@classmethod
|
|
209
|
+
def _get_layer_memory(
|
|
210
|
+
cls,
|
|
211
|
+
model: "PreTrainedModel",
|
|
212
|
+
model_config: "PretrainedConfig",
|
|
213
|
+
rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
|
|
214
|
+
) -> int:
|
|
215
|
+
# This is an *APPROXIMATE* calculation based on the number of parameters
|
|
216
|
+
tensor_parallel_size = rbln_config.tensor_parallel_size or 1
|
|
217
|
+
num_hidden_layers = getattr(model_config, "n_layer", None) or model_config.num_hidden_layers
|
|
218
|
+
|
|
219
|
+
n_model_params = sum(p.numel() for p in model.parameters())
|
|
220
|
+
embed_token_params = sum(p.numel() for p in model.get_input_embeddings().parameters())
|
|
221
|
+
|
|
222
|
+
# Check : `embed_token` is same as `lm_head`
|
|
223
|
+
if model.get_output_embeddings() is not None:
|
|
224
|
+
params = n_model_params - 2 * embed_token_params
|
|
225
|
+
else:
|
|
226
|
+
params = n_model_params - embed_token_params
|
|
227
|
+
|
|
228
|
+
# Assuming all layers have the same number of parameters
|
|
229
|
+
# and all linear layers are quantized if quantization is enabled (This is not always true)
|
|
230
|
+
# TODO(jongho): More accurate calculation
|
|
231
|
+
nbits_per_param = rbln_config.nbits_per_param
|
|
232
|
+
layer_nbytes = (
|
|
233
|
+
(align_2MB(params // num_hidden_layers * nbits_per_param // 8 / tensor_parallel_size))
|
|
234
|
+
* num_hidden_layers
|
|
235
|
+
* tensor_parallel_size
|
|
236
|
+
)
|
|
237
|
+
|
|
238
|
+
return layer_nbytes
|
|
239
|
+
|
|
240
|
+
@classmethod
|
|
241
|
+
def _get_buffer(cls, rbln_config) -> int:
|
|
242
|
+
# TODO(jongho): Accurate buffer estimation
|
|
243
|
+
buffer_per_runtime_per_core = 2**28 # 256MB per runtime
|
|
244
|
+
num_runtimes = 1 if not rbln_config.can_generate else 1 + len(rbln_config.decoder_batch_sizes)
|
|
245
|
+
tensor_parallel_size = rbln_config.tensor_parallel_size or 1
|
|
246
|
+
|
|
247
|
+
buffer_per_core = buffer_per_runtime_per_core * num_runtimes
|
|
248
|
+
buffer = buffer_per_core * tensor_parallel_size
|
|
249
|
+
return buffer
|
|
250
|
+
|
|
251
|
+
@classmethod
|
|
252
|
+
def get_maximum_num_blocks_by_compiled_model(
|
|
253
|
+
cls,
|
|
254
|
+
compiled_models: Dict[str, "rebel.RBLNCompiledModel"],
|
|
255
|
+
model_config: "PretrainedConfig",
|
|
256
|
+
rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
|
|
257
|
+
) -> int:
|
|
258
|
+
tensor_parallel_size = rbln_config.tensor_parallel_size or 1
|
|
259
|
+
available_dram = get_available_dram(rbln_config.npu) * tensor_parallel_size
|
|
260
|
+
|
|
261
|
+
alloc_memory_by_key = get_alloc_memory_by_key(compiled_models)
|
|
262
|
+
alloc_memory_by_key.pop("PortRecur", None) # Old compiler's kv-cache Key
|
|
263
|
+
alloc_memory_by_key.pop("DramTensor", None) # kv-cache
|
|
264
|
+
used_memory = sum(alloc_memory_by_key.values())
|
|
265
|
+
|
|
266
|
+
remaining_dram = available_dram - used_memory
|
|
267
|
+
|
|
268
|
+
if remaining_dram <= 0:
|
|
269
|
+
logger.warning(
|
|
270
|
+
"Insufficient available DRAM after accounting for kernel memory and buffer. "
|
|
271
|
+
"Model cannot allocate any KV cache blocks."
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
estimated_num_blocks = cls._estimate_num_blocks(
|
|
275
|
+
remaining_dram, model_config=model_config, rbln_config=rbln_config
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
return estimated_num_blocks
|
|
279
|
+
|
|
280
|
+
@classmethod
|
|
281
|
+
def _estimate_num_blocks(
|
|
282
|
+
cls, available_dram: int, model_config: "PretrainedConfig", rbln_config: RBLNDecoderOnlyModelForCausalLMConfig
|
|
283
|
+
) -> int:
|
|
284
|
+
"""
|
|
285
|
+
Estimate the maximum number of KV cache blocks that can be allocated.
|
|
286
|
+
|
|
287
|
+
if all of the layers are full attention, the dram_per_block can be calculated simply as follows:
|
|
288
|
+
num_blocks = available_dram // dram_per_block
|
|
289
|
+
|
|
290
|
+
However, if the model contains a mix of full attention and sliding window attention layers,
|
|
291
|
+
we need to consider the memory occupied by the sliding window attention layers first,
|
|
292
|
+
since their memory usage is constant regardless of the number of blocks.
|
|
293
|
+
num_blocks = (available_dram - swa_kv_nbytes) // dram_per_block
|
|
294
|
+
|
|
295
|
+
"""
|
|
296
|
+
|
|
297
|
+
def get_dram_per_block(seq_len: int, num_key_value_heads: int, tensor_parallel_size: int) -> int:
|
|
298
|
+
nbytes_per_param = 2 # Assuming kv-cache is always not quantized
|
|
299
|
+
dram_per_block = (
|
|
300
|
+
seq_len
|
|
301
|
+
* align(head_dim, 64)
|
|
302
|
+
* math.ceil(num_key_value_heads / tensor_parallel_size)
|
|
303
|
+
* nbytes_per_param
|
|
189
304
|
* tensor_parallel_size
|
|
305
|
+
* 2
|
|
306
|
+
) # *2 for key and value
|
|
307
|
+
|
|
308
|
+
return dram_per_block
|
|
309
|
+
|
|
310
|
+
num_attention_heads = getattr(model_config, "n_head", None) or model_config.num_attention_heads
|
|
311
|
+
head_dim = getattr(model_config, "head_dim", None) or model_config.hidden_size // num_attention_heads
|
|
312
|
+
num_hidden_layers = getattr(model_config, "n_layer", None) or model_config.num_hidden_layers
|
|
313
|
+
num_key_value_heads = getattr(model_config, "num_key_value_heads", None) or num_attention_heads
|
|
314
|
+
tensor_parallel_size = rbln_config.tensor_parallel_size or 1
|
|
315
|
+
|
|
316
|
+
# Consider layer types if available
|
|
317
|
+
# If layer types are not found, assume all layers are full attention
|
|
318
|
+
layer_types = getattr(model_config, "layer_types", None)
|
|
319
|
+
if layer_types:
|
|
320
|
+
layer_types_dict = Counter(layer_types)
|
|
321
|
+
num_full_attention = layer_types_dict.pop("full_attention", 0)
|
|
322
|
+
num_sliding_window_attention = layer_types_dict.pop("sliding_attention", 0)
|
|
323
|
+
if len(layer_types_dict) > 0:
|
|
324
|
+
raise ValueError(f"Unknown layer types found in the config: {layer_types_dict.keys()}")
|
|
325
|
+
|
|
326
|
+
else:
|
|
327
|
+
num_full_attention = num_hidden_layers
|
|
328
|
+
num_sliding_window_attention = 0
|
|
329
|
+
|
|
330
|
+
# Reduce available DRAM by sliding window attention kv-cache
|
|
331
|
+
# Since memory occupation of swa layer is constant regardless of num_blocks
|
|
332
|
+
swa_kv_nbytes = 0
|
|
333
|
+
if num_sliding_window_attention > 0:
|
|
334
|
+
sliding_window = getattr(model_config, "sliding_window", None)
|
|
335
|
+
if sliding_window is None:
|
|
336
|
+
logger.warning(
|
|
337
|
+
"`sliding_window` is not found in the config while `sliding_attention` layers are present. "
|
|
338
|
+
"Assuming maximum sliding window size for estimation."
|
|
339
|
+
)
|
|
340
|
+
sliding_window = rbln_config.kvcache_block_size
|
|
341
|
+
|
|
342
|
+
swa_kv_nbytes = num_sliding_window_attention * get_dram_per_block(
|
|
343
|
+
seq_len=sliding_window,
|
|
344
|
+
num_key_value_heads=num_key_value_heads,
|
|
345
|
+
tensor_parallel_size=tensor_parallel_size,
|
|
190
346
|
)
|
|
191
|
-
kernel_size = layer_nbytes + lm_heads_nbytes
|
|
192
|
-
elif n_model_params is not None:
|
|
193
|
-
raise ValueError("Both `n_model_params` and `kernel_size` cannot be specified.")
|
|
194
347
|
|
|
195
|
-
|
|
348
|
+
available_dram -= swa_kv_nbytes
|
|
196
349
|
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
available_dram -= buffer
|
|
350
|
+
dram_per_block = num_full_attention * get_dram_per_block(
|
|
351
|
+
seq_len=rbln_config.kvcache_block_size,
|
|
352
|
+
num_key_value_heads=num_key_value_heads,
|
|
353
|
+
tensor_parallel_size=tensor_parallel_size,
|
|
354
|
+
)
|
|
203
355
|
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
k = available_dram / c
|
|
207
|
-
max_n_blocks = math.floor(2**21 / b * math.floor((k - 1) / 2**21))
|
|
356
|
+
if dram_per_block == 0:
|
|
357
|
+
raise ValueError("DRAM per block is calculated as zero, cannot estimate maximum number of blocks.")
|
|
208
358
|
|
|
359
|
+
max_n_blocks = available_dram // dram_per_block
|
|
209
360
|
return max_n_blocks
|
|
210
361
|
|
|
211
362
|
@classmethod
|
|
212
363
|
def maybe_suggest_kvcache_num_blocks(
|
|
213
364
|
cls,
|
|
214
|
-
compiled_models: Dict[str, "RBLNCompiledModel"],
|
|
365
|
+
compiled_models: Dict[str, "rebel.RBLNCompiledModel"],
|
|
215
366
|
model_config: "PretrainedConfig",
|
|
216
367
|
rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
|
|
217
368
|
) -> None:
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
}
|
|
223
|
-
for batch_size in rbln_config.decoder_batch_sizes:
|
|
224
|
-
for key, memory_per_node in (
|
|
225
|
-
compiled_models[f"decoder_batch_{batch_size}"].get_alloc_per_node_by_key().items()
|
|
226
|
-
):
|
|
227
|
-
alloc_memory_by_key[key] += sum(memory_per_node)
|
|
228
|
-
alloc_memory_by_key.pop("PortRecur", None) # Old compiler's kv-cache Key
|
|
229
|
-
alloc_memory_by_key.pop("DramTensor", None) # kv-cache
|
|
230
|
-
kernel_size = alloc_memory_by_key.pop("Kernel") # model weight
|
|
231
|
-
|
|
232
|
-
# Get the maximum number of blocks that can be allocated
|
|
233
|
-
buffer = sum(alloc_memory_by_key.values())
|
|
234
|
-
max_num_blocks = cls.get_maximum_num_blocks(
|
|
235
|
-
config=model_config,
|
|
236
|
-
tensor_parallel_size=rbln_config.tensor_parallel_size,
|
|
237
|
-
kvcache_block_size=rbln_config.kvcache_block_size,
|
|
238
|
-
kernel_size=kernel_size,
|
|
239
|
-
buffer=buffer,
|
|
369
|
+
max_num_blocks = cls.get_maximum_num_blocks_by_compiled_model(
|
|
370
|
+
compiled_models=compiled_models,
|
|
371
|
+
model_config=model_config,
|
|
372
|
+
rbln_config=rbln_config,
|
|
240
373
|
)
|
|
241
374
|
|
|
242
375
|
# Since our estimation logic is not always accurate,
|
|
@@ -26,7 +26,6 @@ from typing import TYPE_CHECKING, Optional, Union
|
|
|
26
26
|
from torch import nn
|
|
27
27
|
from transformers import (
|
|
28
28
|
AutoModel,
|
|
29
|
-
AutoModelForAudioClassification,
|
|
30
29
|
AutoModelForDepthEstimation,
|
|
31
30
|
AutoModelForImageClassification,
|
|
32
31
|
AutoModelForMaskedLM,
|
|
@@ -42,7 +41,6 @@ from ..modeling import RBLNModel
|
|
|
42
41
|
from ..utils.logging import get_logger
|
|
43
42
|
from .configuration_generic import (
|
|
44
43
|
RBLNImageModelConfig,
|
|
45
|
-
RBLNModelForAudioClassificationConfig,
|
|
46
44
|
RBLNTransformerEncoderConfig,
|
|
47
45
|
)
|
|
48
46
|
|
|
@@ -59,7 +57,7 @@ class RBLNTransformerEncoder(RBLNModel):
|
|
|
59
57
|
rbln_dtype = "int64"
|
|
60
58
|
|
|
61
59
|
@classmethod
|
|
62
|
-
def
|
|
60
|
+
def _wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNTransformerEncoderConfig) -> nn.Module:
|
|
63
61
|
class TransformerEncoderWrapper(nn.Module):
|
|
64
62
|
# Parameters to disable for RBLN compilation
|
|
65
63
|
DISABLED_PARAMS = {"return_dict", "use_cache"}
|
|
@@ -268,7 +266,7 @@ class RBLNModelForDepthEstimation(RBLNImageModel):
|
|
|
268
266
|
auto_model_class = AutoModelForDepthEstimation
|
|
269
267
|
|
|
270
268
|
@classmethod
|
|
271
|
-
def
|
|
269
|
+
def _wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNImageModelConfig):
|
|
272
270
|
class ImageModelWrapper(nn.Module):
|
|
273
271
|
def __init__(self, model: "PreTrainedModel", rbln_config: RBLNImageModelConfig):
|
|
274
272
|
super().__init__()
|
|
@@ -280,60 +278,3 @@ class RBLNModelForDepthEstimation(RBLNImageModel):
|
|
|
280
278
|
return output.predicted_depth
|
|
281
279
|
|
|
282
280
|
return ImageModelWrapper(model, rbln_config).eval()
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
class RBLNModelForAudioClassification(RBLNModel):
|
|
286
|
-
"""
|
|
287
|
-
This is a generic model class that will be instantiated as one of the model classes of the library (with a audio classification head) when created with the from_pretrained() class method
|
|
288
|
-
This model inherits from [`RBLNModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
|
|
289
|
-
|
|
290
|
-
A class to convert and run pre-trained transformers based AudioClassification models on RBLN devices.
|
|
291
|
-
It implements the methods to convert a pre-trained transformers AudioClassification model into a RBLN transformer model by:
|
|
292
|
-
|
|
293
|
-
- transferring the checkpoint weights of the original into an optimized RBLN graph,
|
|
294
|
-
- compiling the resulting graph using the RBLN compiler.
|
|
295
|
-
|
|
296
|
-
Currently, this model class only supports the 'AST' model from the transformers library. Future updates may include support for additional model types.
|
|
297
|
-
"""
|
|
298
|
-
|
|
299
|
-
auto_model_class = AutoModelForAudioClassification
|
|
300
|
-
|
|
301
|
-
@classmethod
|
|
302
|
-
def _update_rbln_config(
|
|
303
|
-
cls,
|
|
304
|
-
preprocessors: "AutoFeatureExtractor" = None,
|
|
305
|
-
model: Optional["PreTrainedModel"] = None,
|
|
306
|
-
model_config: "PretrainedConfig" = None,
|
|
307
|
-
rbln_config: Optional[RBLNModelForAudioClassificationConfig] = None,
|
|
308
|
-
) -> RBLNModelForAudioClassificationConfig:
|
|
309
|
-
if rbln_config.num_mel_bins is None:
|
|
310
|
-
rbln_config.num_mel_bins = getattr(model_config, "num_mel_bins", None)
|
|
311
|
-
if rbln_config.num_mel_bins is None:
|
|
312
|
-
for feature_extractor in preprocessors:
|
|
313
|
-
if hasattr(feature_extractor, "num_mel_bins"):
|
|
314
|
-
rbln_config.num_mel_bins = feature_extractor.num_mel_bins
|
|
315
|
-
break
|
|
316
|
-
|
|
317
|
-
if rbln_config.num_mel_bins is None:
|
|
318
|
-
raise ValueError("`num_mel_bins` should be specified!")
|
|
319
|
-
|
|
320
|
-
if rbln_config.max_length is None:
|
|
321
|
-
rbln_config.max_length = getattr(model_config, "max_length", None)
|
|
322
|
-
for feature_extractor in preprocessors:
|
|
323
|
-
if hasattr(feature_extractor, "max_length"):
|
|
324
|
-
rbln_config.max_length = feature_extractor.max_length
|
|
325
|
-
break
|
|
326
|
-
|
|
327
|
-
if rbln_config.max_length is None:
|
|
328
|
-
raise ValueError("`max_length` should be specified!")
|
|
329
|
-
|
|
330
|
-
input_info = [
|
|
331
|
-
(
|
|
332
|
-
"input_values",
|
|
333
|
-
[rbln_config.batch_size, rbln_config.max_length, rbln_config.num_mel_bins],
|
|
334
|
-
"float32",
|
|
335
|
-
),
|
|
336
|
-
]
|
|
337
|
-
|
|
338
|
-
rbln_config.set_compile_cfgs([RBLNCompileConfig(input_info=input_info)])
|
|
339
|
-
return rbln_config
|
|
@@ -12,10 +12,36 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
from
|
|
15
|
+
from typing import Any, Optional
|
|
16
16
|
|
|
17
|
+
from ....configuration_utils import RBLNModelConfig
|
|
18
|
+
from ....utils.deprecation import deprecate_kwarg
|
|
17
19
|
|
|
18
|
-
|
|
20
|
+
|
|
21
|
+
class RBLNASTForAudioClassificationConfig(RBLNModelConfig):
|
|
19
22
|
"""
|
|
20
23
|
Configuration class for RBLNASTForAudioClassification.
|
|
21
24
|
"""
|
|
25
|
+
|
|
26
|
+
@deprecate_kwarg(old_name="num_mel_bins", version="0.10.0")
|
|
27
|
+
def __init__(
|
|
28
|
+
self,
|
|
29
|
+
batch_size: Optional[int] = None,
|
|
30
|
+
max_length: Optional[int] = None,
|
|
31
|
+
**kwargs: Any,
|
|
32
|
+
):
|
|
33
|
+
"""
|
|
34
|
+
Args:
|
|
35
|
+
batch_size (Optional[int]): The batch size for inference. Defaults to 1.
|
|
36
|
+
max_length (Optional[int]): Maximum length of the audio input in time dimension.
|
|
37
|
+
kwargs: Additional arguments passed to the parent RBLNModelConfig.
|
|
38
|
+
|
|
39
|
+
Raises:
|
|
40
|
+
ValueError: If batch_size is not a positive integer.
|
|
41
|
+
"""
|
|
42
|
+
super().__init__(**kwargs)
|
|
43
|
+
self.batch_size = batch_size or 1
|
|
44
|
+
if not isinstance(self.batch_size, int) or self.batch_size < 0:
|
|
45
|
+
raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
|
|
46
|
+
|
|
47
|
+
self.max_length = max_length
|