optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.4a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. optimum/rbln/__init__.py +12 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +16 -6
  4. optimum/rbln/diffusers/__init__.py +12 -0
  5. optimum/rbln/diffusers/configurations/__init__.py +3 -0
  6. optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
  7. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
  8. optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
  9. optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
  10. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
  11. optimum/rbln/diffusers/modeling_diffusers.py +1 -1
  12. optimum/rbln/diffusers/models/__init__.py +17 -3
  13. optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
  14. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
  15. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
  16. optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
  17. optimum/rbln/diffusers/models/controlnet.py +17 -2
  18. optimum/rbln/diffusers/models/transformers/prior_transformer.py +16 -2
  19. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +16 -1
  20. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +14 -1
  21. optimum/rbln/diffusers/models/unets/__init__.py +1 -0
  22. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +18 -2
  23. optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
  24. optimum/rbln/diffusers/pipelines/__init__.py +4 -0
  25. optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -2
  26. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
  27. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +13 -4
  28. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +13 -4
  29. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +13 -4
  30. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -4
  31. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +1 -1
  32. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
  33. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -2
  34. optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
  35. optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
  36. optimum/rbln/modeling.py +20 -45
  37. optimum/rbln/modeling_base.py +12 -8
  38. optimum/rbln/transformers/configuration_generic.py +0 -27
  39. optimum/rbln/transformers/modeling_attention_utils.py +242 -109
  40. optimum/rbln/transformers/modeling_generic.py +2 -61
  41. optimum/rbln/transformers/modeling_outputs.py +1 -0
  42. optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
  43. optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
  44. optimum/rbln/transformers/models/auto/auto_factory.py +1 -0
  45. optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
  46. optimum/rbln/transformers/models/bert/modeling_bert.py +86 -1
  47. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +42 -15
  48. optimum/rbln/transformers/models/clip/modeling_clip.py +40 -2
  49. optimum/rbln/transformers/models/colpali/colpali_architecture.py +2 -2
  50. optimum/rbln/transformers/models/colpali/modeling_colpali.py +6 -45
  51. optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +0 -2
  52. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +10 -1
  53. optimum/rbln/transformers/models/decoderonly/configuration_lora.py +1 -1
  54. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +92 -43
  55. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +207 -64
  56. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +17 -9
  57. optimum/rbln/transformers/models/decoderonly/lora_architecture.py +1 -1
  58. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +140 -46
  59. optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +17 -0
  60. optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
  61. optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
  62. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +7 -1
  63. optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +42 -70
  64. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +46 -31
  65. optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +1 -1
  66. optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +24 -9
  67. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -5
  68. optimum/rbln/transformers/models/llava/modeling_llava.py +37 -25
  69. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +3 -5
  70. optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -22
  71. optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
  72. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
  73. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +13 -1
  74. optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +2 -2
  75. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -28
  76. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +8 -9
  77. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -7
  78. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +1 -1
  79. optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +0 -20
  80. optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
  81. optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
  82. optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
  83. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -4
  84. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +36 -12
  85. optimum/rbln/transformers/models/siglip/modeling_siglip.py +17 -1
  86. optimum/rbln/transformers/models/swin/modeling_swin.py +17 -4
  87. optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
  88. optimum/rbln/transformers/models/t5/t5_architecture.py +1 -1
  89. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +25 -10
  90. optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
  91. optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
  92. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +60 -8
  93. optimum/rbln/transformers/models/whisper/generation_whisper.py +48 -14
  94. optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
  95. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +53 -0
  96. optimum/rbln/transformers/utils/rbln_quantization.py +9 -0
  97. optimum/rbln/utils/deprecation.py +213 -0
  98. optimum/rbln/utils/hub.py +14 -3
  99. optimum/rbln/utils/import_utils.py +7 -1
  100. optimum/rbln/utils/runtime_utils.py +32 -0
  101. optimum/rbln/utils/submodule.py +3 -1
  102. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/METADATA +2 -2
  103. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/RECORD +106 -99
  104. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/WHEEL +1 -1
  105. optimum/rbln/utils/depreacate_utils.py +0 -16
  106. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/entry_points.txt +0 -0
  107. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/modeling_base.py
@@ -15,7 +15,6 @@
 import importlib
 import os
 import shutil
-from abc import ABC
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
@@ -39,7 +38,7 @@ if TYPE_CHECKING:
 logger = get_logger(__name__)


-class PreTrainedModel(ABC):  # noqa: F811
+class PreTrainedModel:  # noqa: F811
     pass

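The class touched here appears to be a runtime placeholder: the real `PreTrainedModel` is only visible under `TYPE_CHECKING`, and the stub (with `# noqa: F811` silencing the redefinition warning) lets `RBLNBaseModel` below still name it as a base class. Dropping the `ABC` base keeps it a plain no-op now that `from abc import ABC` is removed above. A minimal sketch of the same pattern, with hypothetical names and not taken from this package:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Visible to type checkers only; nothing is imported at runtime here.
        from transformers import PreTrainedModel


    class PreTrainedModel:  # noqa: F811 -- deliberate runtime redefinition
        # No-op stand-in so classes below can still list it as a base.
        pass


    class MyRBLNModel(PreTrainedModel):  # hypothetical subclass
        pass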
@@ -63,7 +62,7 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
         subfolder: str = "",
         rbln_compiled_models: Optional[rebel.RBLNCompiledModel] = None,
-        rbln_submodules: List["RBLNBaseModel"] = [],
+        rbln_submodules: Optional[List["RBLNBaseModel"]] = None,
         **kwargs,
     ):
         self.model = models
@@ -71,7 +70,6 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
         self.rbln_config = rbln_config
         if not rbln_config.is_frozen():
             raise RuntimeError("`rbln_config` must be frozen. Please call `rbln_config.freeze()` first.")
-
         self.compiled_models = rbln_compiled_models

         # Registers the RBLN classes into the transformers AutoModel classes to avoid warnings when creating
@@ -107,6 +105,8 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
         self.model_save_dir = model_save_dir
         self.subfolder = subfolder

+        if rbln_submodules is None:
+            rbln_submodules = []
         self.rbln_submodules = rbln_submodules
         self.__post_init__(**kwargs)

@@ -182,7 +182,7 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
         # passed from compile function
         rbln_config: Optional[RBLNModelConfig] = None,
         rbln_compiled_models: Optional[Dict[str, rebel.RBLNCompiledModel]] = None,
-        rbln_submodules: List["RBLNBaseModel"] = [],
+        rbln_submodules: Optional[List["RBLNBaseModel"]] = None,
         **kwargs,
     ) -> "RBLNBaseModel":
         if rbln_compiled_models is None:
@@ -218,8 +218,9 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
         )

         if len(cls._rbln_submodules) > 0:
-            rbln_submodules = cls._load_submodules(model_save_dir=model_id, rbln_config=rbln_config, **kwargs)
-        else:
+            if rbln_submodules is None:
+                rbln_submodules = cls._load_submodules(model_save_dir=model_id, rbln_config=rbln_config, **kwargs)
+        elif rbln_submodules is None:
             rbln_submodules = []

         rbln_config.freeze()
@@ -280,9 +281,12 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
         config: "PretrainedConfig",
         model_save_dir: Union[Path, str],
         subfolder: Union[Path, str],
-        rbln_submodules: List["RBLNBaseModel"] = [],
+        rbln_submodules: Optional[List["RBLNBaseModel"]] = None,
         **kwargs,
     ):
+        if rbln_submodules is None:
+            rbln_submodules = []
+
         if isinstance(model_save_dir, str):
             model_save_dir = Path(model_save_dir)

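A recurring fix in the hunks above replaces the mutable default `rbln_submodules: List["RBLNBaseModel"] = []` with `Optional[...] = None` plus an explicit `if rbln_submodules is None:` guard. This avoids Python's shared-mutable-default pitfall: the default list is created once at function definition time and shared across every call. A minimal standalone sketch (not optimum-rbln code):

    class Loader:
        def __init__(self, submodules=[]):  # one list object shared by every call
            self.submodules = submodules


    a, b = Loader(), Loader()
    a.submodules.append("text_encoder")
    print(b.submodules)  # ['text_encoder'] -- leaked into an unrelated instance


    class SafeLoader:
        def __init__(self, submodules=None):  # the Optional[...] = None idiom
            self.submodules = submodules if submodules is not None else []


    c, d = SafeLoader(), SafeLoader()
    c.submodules.append("text_encoder")
    print(d.submodules)  # [] -- each instance gets a fresh list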
optimum/rbln/transformers/configuration_generic.py
@@ -118,30 +118,3 @@ class RBLNModelForImageClassificationConfig(RBLNImageModelConfig):

 class RBLNModelForDepthEstimationConfig(RBLNImageModelConfig):
     pass
-
-
-class RBLNModelForAudioClassificationConfig(RBLNModelConfig):
-    def __init__(
-        self,
-        batch_size: Optional[int] = None,
-        max_length: Optional[int] = None,
-        num_mel_bins: Optional[int] = None,
-        **kwargs: Any,
-    ):
-        """
-        Args:
-            batch_size (Optional[int]): The batch size for inference. Defaults to 1.
-            max_length (Optional[int]): Maximum length of the audio input in time dimension.
-            num_mel_bins (Optional[int]): Number of Mel frequency bins for audio processing.
-            kwargs: Additional arguments passed to the parent RBLNModelConfig.
-
-        Raises:
-            ValueError: If batch_size is not a positive integer.
-        """
-        super().__init__(**kwargs)
-        self.batch_size = batch_size or 1
-        if not isinstance(self.batch_size, int) or self.batch_size < 0:
-            raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
-
-        self.max_length = max_length
-        self.num_mel_bins = num_mel_bins
optimum/rbln/transformers/modeling_attention_utils.py
@@ -1,18 +1,18 @@
 import math
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+from collections import Counter, defaultdict
+from typing import TYPE_CHECKING, Dict, Optional, Tuple

-from optimum.rbln.transformers.models.decoderonly.configuration_decoderonly import (
-    RBLNDecoderOnlyModelForCausalLMConfig,
-)
+import rebel

 from ..utils.logging import get_logger
+from ..utils.runtime_utils import get_available_dram
+from .models.decoderonly.configuration_decoderonly import RBLNDecoderOnlyModelForCausalLMConfig


 logger = get_logger()

 if TYPE_CHECKING:
-    from rebel import RBLNCompiledModel
-    from transformers import PretrainedConfig
+    from transformers import PretrainedConfig, PreTrainedModel


 DEFAULT_FLASH_ATTN_PARTITION_LENGTH = 16_384
@@ -115,128 +115,261 @@ def validate_sliding_window(rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
         raise ValueError("`use_attention_mask` must be set to False when `cache_impl` is set to 'sliding_window'.")


+def align(x: int, nbytes: int) -> int:
+    return int(math.ceil(x / nbytes) * nbytes)
+
+
+def align_2MB(x: int) -> int:
+    return align(x, 2**21)
+
+
+def get_alloc_memory_by_key(compiled_models: Dict[str, "rebel.RBLNCompiledModel"]) -> Dict[str, int]:
+    alloc_memory_by_key = defaultdict(int)
+    # Get the actual memory allocation of each node by key
+    for compiled_model in compiled_models.values():
+        alloc_per_node_by_key = compiled_model.get_alloc_per_node_by_key()
+        for key, memory_per_node in alloc_per_node_by_key.items():
+            alloc_memory_by_key[key] += sum(memory_per_node)
+
+    return alloc_memory_by_key
+
+
+def format_byte_size(nbytes: int) -> str:
+    if nbytes < 1024:
+        return f"{nbytes} B"
+    elif nbytes < 1024**2:
+        return f"{nbytes / 1024:.2f} KB"
+    elif nbytes < 1024**3:
+        return f"{nbytes / 1024**2:.2f} MB"
+    else:
+        return f"{nbytes / 1024**3:.2f} GB"
+
+
 class RBLNDecoderOnlyFlashAttentionMixin:
     @classmethod
-    def get_maximum_num_blocks(
+    def get_maximum_num_blocks_by_model(
         cls,
-        config: "PretrainedConfig",
-        tensor_parallel_size: int,
-        kvcache_block_size: int,
-        nbits_per_param: Optional[int] = None,
-        n_model_params: Optional[int] = None,
-        kernel_size: Optional[int] = None,
-        buffer: Optional[int] = None,
-        num_runtimes: int = 2,
+        model: "PreTrainedModel",
+        model_config: "PretrainedConfig",
+        rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
     ) -> int:
-        # We are finding max_n_blocks(x) that satisfies the following equation:
-
-        # available_dram - kernel_size - buffer
-        #   - num_layers * 2 * tensor_parallel_size
-        #   * align_2MB(
-        #       x
-        #       * block_size
-        #       * align_64(head_dim)
-        #       * math.ceil(num_key_value_heads / tensor_parallel_size)
-        #       * 2
-        #   ) > 0
-
-        # This inequality can be rewritten as follows:
-
-        # a - c * align_2MB(b * x) > 0
-        # where
-        #   a = available_dram - kernel_size - buffer
-        #   b = block_size * align_64(head_dim) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
-        #   c = num_layers * 2 * tensor_parallel_size
-
-        # We can rewrite the inequality as follows:
-        # k > align_2MB(b*x)
-        # where
-        #   k = a / c
-
-        # After that, we can derive the following equation:
-        # x = floor(2**21 / b * floor((k - 1) / 2**21))
-
-        def align(x: int, nbytes: int) -> int:
-            return int(math.ceil(x / nbytes) * nbytes)
-
-        def align_2MB(x: int) -> int:
-            return align(x, 2**21)
-
-        num_attention_heads = getattr(config, "n_head", None) or getattr(config, "num_attention_heads")
-        num_layers = getattr(config, "n_layer", None) or getattr(config, "num_hidden_layers")
-        head_dim = getattr(config, "head_dim", None) or config.hidden_size // num_attention_heads
-        vocab_size = config.vocab_size
-        hidden_size = getattr(config, "n_embd", None) or getattr(config, "hidden_size")
-        num_key_value_heads = getattr(config, "num_key_value_heads", None) or num_attention_heads
-
-        # TODO(jongho): Update if target npu is REBEL.
-        ATOM_DRAM_NBYTES = 16 * 2**30
-        ATOM_SYS_DRAM_NBYTES = 288 * 2**20
-        available_dram = tensor_parallel_size * (ATOM_DRAM_NBYTES - ATOM_SYS_DRAM_NBYTES)
-
-        if kernel_size is None:
-            if n_model_params is None:
-                raise ValueError("`n_model_params` should be specified to estimate the kernel memory.")
-            # Get estimated kernel size (approximated)
-            lm_heads_params = align(vocab_size, 64) * hidden_size
-            lm_heads_nbytes = (
-                align_2MB(lm_heads_params * nbits_per_param // 8 / tensor_parallel_size) * tensor_parallel_size
+        tensor_parallel_size = rbln_config.tensor_parallel_size or 1
+        available_dram = get_available_dram(rbln_config.npu) * tensor_parallel_size
+
+        kernel_memory = cls._get_kernel_memory(model, model_config=model_config, rbln_config=rbln_config)
+        buffer = cls._get_buffer(rbln_config)
+
+        remaining_dram = available_dram - kernel_memory - buffer
+        if remaining_dram <= 0:
+            raise ValueError(
+                "Insufficient available DRAM after accounting for kernel memory and buffer. "
+                "Cannot allocate any KV cache blocks."
+                f" (Available DRAM: {format_byte_size(available_dram)}, "
+                f"Kernel Memory: {format_byte_size(kernel_memory)}, "
+                f"Buffer: {format_byte_size(buffer)})"
             )
-        params = n_model_params - lm_heads_params
-        layer_nbytes = (
-            align_2MB(params * nbits_per_param // 8 / num_layers / tensor_parallel_size)
-            * num_layers
+        estimated_num_blocks = cls._estimate_num_blocks(
+            remaining_dram, model_config=model_config, rbln_config=rbln_config
+        )
+
+        return estimated_num_blocks
+
+    @classmethod
+    def _get_kernel_memory(
+        cls,
+        model: "PreTrainedModel",
+        model_config: "PretrainedConfig",
+        rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
+    ) -> int:
+        if model.get_output_embeddings() is None:
+            lm_head_nbytes = 0
+        else:
+            lm_head_nbytes = cls._get_lm_head_memory(model_config, rbln_config)
+
+        layer_nbytes = cls._get_layer_memory(model, model_config, rbln_config)
+        return lm_head_nbytes + layer_nbytes
+
+    @classmethod
+    def _get_lm_head_memory(
+        cls, model_config: "PretrainedConfig", rbln_config: RBLNDecoderOnlyModelForCausalLMConfig
+    ) -> int:
+        tensor_parallel_size = rbln_config.tensor_parallel_size or 1
+        vocab_size = model_config.vocab_size
+        hidden_size = getattr(model_config, "n_embd", None) or model_config.hidden_size
+        lm_head_params = align(vocab_size, 64) * hidden_size
+
+        nbytes_per_param = 2  # Assuming lm_head is always not quantized
+        lm_head_memory_in_bytes = (
+            align_2MB(lm_head_params * nbytes_per_param / tensor_parallel_size) * tensor_parallel_size
+        )
+
+        return lm_head_memory_in_bytes
+
+    @classmethod
+    def _get_layer_memory(
+        cls,
+        model: "PreTrainedModel",
+        model_config: "PretrainedConfig",
+        rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
+    ) -> int:
+        # This is an *APPROXIMATE* calculation based on the number of parameters
+        tensor_parallel_size = rbln_config.tensor_parallel_size or 1
+        num_hidden_layers = getattr(model_config, "n_layer", None) or model_config.num_hidden_layers
+
+        n_model_params = sum(p.numel() for p in model.parameters())
+        embed_token_params = sum(p.numel() for p in model.get_input_embeddings().parameters())
+
+        # Check : `embed_token` is same as `lm_head`
+        if model.get_output_embeddings() is not None:
+            params = n_model_params - 2 * embed_token_params
+        else:
+            params = n_model_params - embed_token_params
+
+        # Assuming all layers have the same number of parameters
+        # and all linear layers are quantized if quantization is enabled (This is not always true)
+        # TODO(jongho): More accurate calculation
+        nbits_per_param = rbln_config.nbits_per_param
+        layer_nbytes = (
+            (align_2MB(params // num_hidden_layers * nbits_per_param // 8 / tensor_parallel_size))
+            * num_hidden_layers
+            * tensor_parallel_size
+        )
+
+        return layer_nbytes
+
+    @classmethod
+    def _get_buffer(cls, rbln_config) -> int:
+        # TODO(jongho): Accurate buffer estimation
+        buffer_per_runtime_per_core = 2**28  # 256MB per runtime
+        num_runtimes = 1 if not rbln_config.can_generate else 1 + len(rbln_config.decoder_batch_sizes)
+        tensor_parallel_size = rbln_config.tensor_parallel_size or 1
+
+        buffer_per_core = buffer_per_runtime_per_core * num_runtimes
+        buffer = buffer_per_core * tensor_parallel_size
+        return buffer
+
+    @classmethod
+    def get_maximum_num_blocks_by_compiled_model(
+        cls,
+        compiled_models: Dict[str, "rebel.RBLNCompiledModel"],
+        model_config: "PretrainedConfig",
+        rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
+    ) -> int:
+        tensor_parallel_size = rbln_config.tensor_parallel_size or 1
+        available_dram = get_available_dram(rbln_config.npu) * tensor_parallel_size
+
+        alloc_memory_by_key = get_alloc_memory_by_key(compiled_models)
+        alloc_memory_by_key.pop("PortRecur", None)  # Old compiler's kv-cache Key
+        alloc_memory_by_key.pop("DramTensor", None)  # kv-cache
+        used_memory = sum(alloc_memory_by_key.values())
+
+        remaining_dram = available_dram - used_memory
+
+        if remaining_dram <= 0:
+            logger.warning(
+                "Insufficient available DRAM after accounting for kernel memory and buffer. "
+                "Model cannot allocate any KV cache blocks."
+            )
+
+        estimated_num_blocks = cls._estimate_num_blocks(
+            remaining_dram, model_config=model_config, rbln_config=rbln_config
+        )
+
+        return estimated_num_blocks
+
+    @classmethod
+    def _estimate_num_blocks(
+        cls, available_dram: int, model_config: "PretrainedConfig", rbln_config: RBLNDecoderOnlyModelForCausalLMConfig
+    ) -> int:
+        """
+        Estimate the maximum number of KV cache blocks that can be allocated.
+
+        if all of the layers are full attention, the dram_per_block can be calculated simply as follows:
+            num_blocks = available_dram // dram_per_block
+
+        However, if the model contains a mix of full attention and sliding window attention layers,
+        we need to consider the memory occupied by the sliding window attention layers first,
+        since their memory usage is constant regardless of the number of blocks.
+            num_blocks = (available_dram - swa_kv_nbytes) // dram_per_block
+
+        """
+
+        def get_dram_per_block(seq_len: int, num_key_value_heads: int, tensor_parallel_size: int) -> int:
+            nbytes_per_param = 2  # Assuming kv-cache is always not quantized
+            dram_per_block = (
+                seq_len
+                * align(head_dim, 64)
+                * math.ceil(num_key_value_heads / tensor_parallel_size)
+                * nbytes_per_param
                 * tensor_parallel_size
+                * 2
+            )  # *2 for key and value
+
+            return dram_per_block
+
+        num_attention_heads = getattr(model_config, "n_head", None) or model_config.num_attention_heads
+        head_dim = getattr(model_config, "head_dim", None) or model_config.hidden_size // num_attention_heads
+        num_hidden_layers = getattr(model_config, "n_layer", None) or model_config.num_hidden_layers
+        num_key_value_heads = getattr(model_config, "num_key_value_heads", None) or num_attention_heads
+        tensor_parallel_size = rbln_config.tensor_parallel_size or 1
+
+        # Consider layer types if available
+        # If layer types are not found, assume all layers are full attention
+        layer_types = getattr(model_config, "layer_types", None)
+        if layer_types:
+            layer_types_dict = Counter(layer_types)
+            num_full_attention = layer_types_dict.pop("full_attention", 0)
+            num_sliding_window_attention = layer_types_dict.pop("sliding_attention", 0)
+            if len(layer_types_dict) > 0:
+                raise ValueError(f"Unknown layer types found in the config: {layer_types_dict.keys()}")
+
+        else:
+            num_full_attention = num_hidden_layers
+            num_sliding_window_attention = 0
+
+        # Reduce available DRAM by sliding window attention kv-cache
+        # Since memory occupation of swa layer is constant regardless of num_blocks
+        swa_kv_nbytes = 0
+        if num_sliding_window_attention > 0:
+            sliding_window = getattr(model_config, "sliding_window", None)
+            if sliding_window is None:
+                logger.warning(
+                    "`sliding_window` is not found in the config while `sliding_attention` layers are present. "
+                    "Assuming maximum sliding window size for estimation."
+                )
+                sliding_window = rbln_config.kvcache_block_size
+
+            swa_kv_nbytes = num_sliding_window_attention * get_dram_per_block(
+                seq_len=sliding_window,
+                num_key_value_heads=num_key_value_heads,
+                tensor_parallel_size=tensor_parallel_size,
             )
-            kernel_size = layer_nbytes + lm_heads_nbytes
-        elif n_model_params is not None:
-            raise ValueError("Both `n_model_params` and `kernel_size` cannot be specified.")

-        available_dram -= kernel_size
+        available_dram -= swa_kv_nbytes

-        if buffer is None:
-            # TODO: Accurate buffer estimation
-            buffer_per_runtime_per_core = 2**28  # 256MB per runtime
-            buffer_per_core = buffer_per_runtime_per_core * num_runtimes  # 1 for prefill, 1 for decoder
-            buffer = buffer_per_core * tensor_parallel_size
-        available_dram -= buffer
+        dram_per_block = num_full_attention * get_dram_per_block(
+            seq_len=rbln_config.kvcache_block_size,
+            num_key_value_heads=num_key_value_heads,
+            tensor_parallel_size=tensor_parallel_size,
+        )

-        b = kvcache_block_size * align(head_dim, 64) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
-        c = num_layers * 2 * tensor_parallel_size
-        k = available_dram / c
-        max_n_blocks = math.floor(2**21 / b * math.floor((k - 1) / 2**21))
+        if dram_per_block == 0:
+            raise ValueError("DRAM per block is calculated as zero, cannot estimate maximum number of blocks.")

+        max_n_blocks = available_dram // dram_per_block
         return max_n_blocks

     @classmethod
     def maybe_suggest_kvcache_num_blocks(
         cls,
-        compiled_models: Dict[str, "RBLNCompiledModel"],
+        compiled_models: Dict[str, "rebel.RBLNCompiledModel"],
         model_config: "PretrainedConfig",
         rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
     ) -> None:
-        # Get the actual memory allocation of each node by key
-        alloc_memory_per_node_by_key: Dict[str, List[int]] = compiled_models["prefill"].get_alloc_per_node_by_key()
-        alloc_memory_by_key: Dict[str, int] = {
-            key: sum(memory_per_node) for key, memory_per_node in alloc_memory_per_node_by_key.items()
-        }
-        for batch_size in rbln_config.decoder_batch_sizes:
-            for key, memory_per_node in (
-                compiled_models[f"decoder_batch_{batch_size}"].get_alloc_per_node_by_key().items()
-            ):
-                alloc_memory_by_key[key] += sum(memory_per_node)
-        alloc_memory_by_key.pop("PortRecur", None)  # Old compiler's kv-cache Key
-        alloc_memory_by_key.pop("DramTensor", None)  # kv-cache
-        kernel_size = alloc_memory_by_key.pop("Kernel")  # model weight
-
-        # Get the maximum number of blocks that can be allocated
-        buffer = sum(alloc_memory_by_key.values())
-        max_num_blocks = cls.get_maximum_num_blocks(
-            config=model_config,
-            tensor_parallel_size=rbln_config.tensor_parallel_size,
-            kvcache_block_size=rbln_config.kvcache_block_size,
-            kernel_size=kernel_size,
-            buffer=buffer,
+        max_num_blocks = cls.get_maximum_num_blocks_by_compiled_model(
+            compiled_models=compiled_models,
+            model_config=model_config,
+            rbln_config=rbln_config,
        )

         # Since our estimation logic is not always accurate,
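To make the new estimation concrete, here is a worked example of the `_estimate_num_blocks` arithmetic with hypothetical Llama-7B-like numbers (head_dim 128, 32 KV heads, 32 full-attention layers, tensor_parallel_size 4, kvcache_block_size 16_384); none of these values come from the diff itself:

    import math

    def align(x: int, nbytes: int) -> int:
        return int(math.ceil(x / nbytes) * nbytes)

    head_dim, num_key_value_heads, tensor_parallel_size = 128, 32, 4
    seq_len = 16_384            # kvcache_block_size
    nbytes_per_param = 2        # fp16 kv-cache, as assumed by the code above
    num_full_attention = 32     # no sliding-window layers in this example

    dram_per_block_per_layer = (
        seq_len
        * align(head_dim, 64)                                    # 128, already aligned
        * math.ceil(num_key_value_heads / tensor_parallel_size)  # 8 heads per chip
        * nbytes_per_param
        * tensor_parallel_size
        * 2                                                      # key and value
    )                                                            # = 256 MiB
    dram_per_block = num_full_attention * dram_per_block_per_layer  # = 8 GiB

    available_dram = 40 * 2**30  # whatever remains after kernel memory and buffer
    print(available_dram // dram_per_block)  # 5 -> max_n_blocks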
optimum/rbln/transformers/modeling_generic.py
@@ -26,7 +26,6 @@ from typing import TYPE_CHECKING, Optional, Union
 from torch import nn
 from transformers import (
     AutoModel,
-    AutoModelForAudioClassification,
     AutoModelForDepthEstimation,
     AutoModelForImageClassification,
     AutoModelForMaskedLM,
@@ -42,7 +41,6 @@ from ..modeling import RBLNModel
 from ..utils.logging import get_logger
 from .configuration_generic import (
     RBLNImageModelConfig,
-    RBLNModelForAudioClassificationConfig,
     RBLNTransformerEncoderConfig,
 )

@@ -59,7 +57,7 @@ class RBLNTransformerEncoder(RBLNModel):
     rbln_dtype = "int64"

     @classmethod
-    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNTransformerEncoderConfig) -> nn.Module:
+    def _wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNTransformerEncoderConfig) -> nn.Module:
         class TransformerEncoderWrapper(nn.Module):
             # Parameters to disable for RBLN compilation
             DISABLED_PARAMS = {"return_dict", "use_cache"}
@@ -268,7 +266,7 @@ class RBLNModelForDepthEstimation(RBLNImageModel):
     auto_model_class = AutoModelForDepthEstimation

     @classmethod
-    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNImageModelConfig):
+    def _wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNImageModelConfig):
         class ImageModelWrapper(nn.Module):
             def __init__(self, model: "PreTrainedModel", rbln_config: RBLNImageModelConfig):
                 super().__init__()
@@ -280,60 +278,3 @@ class RBLNModelForDepthEstimation(RBLNImageModel):
                 return output.predicted_depth

         return ImageModelWrapper(model, rbln_config).eval()
-
-
-class RBLNModelForAudioClassification(RBLNModel):
-    """
-    This is a generic model class that will be instantiated as one of the model classes of the library (with a audio classification head) when created with the from_pretrained() class method
-    This model inherits from [`RBLNModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
-
-    A class to convert and run pre-trained transformers based AudioClassification models on RBLN devices.
-    It implements the methods to convert a pre-trained transformers AudioClassification model into a RBLN transformer model by:
-
-    - transferring the checkpoint weights of the original into an optimized RBLN graph,
-    - compiling the resulting graph using the RBLN compiler.
-
-    Currently, this model class only supports the 'AST' model from the transformers library. Future updates may include support for additional model types.
-    """
-
-    auto_model_class = AutoModelForAudioClassification
-
-    @classmethod
-    def _update_rbln_config(
-        cls,
-        preprocessors: "AutoFeatureExtractor" = None,
-        model: Optional["PreTrainedModel"] = None,
-        model_config: "PretrainedConfig" = None,
-        rbln_config: Optional[RBLNModelForAudioClassificationConfig] = None,
-    ) -> RBLNModelForAudioClassificationConfig:
-        if rbln_config.num_mel_bins is None:
-            rbln_config.num_mel_bins = getattr(model_config, "num_mel_bins", None)
-            if rbln_config.num_mel_bins is None:
-                for feature_extractor in preprocessors:
-                    if hasattr(feature_extractor, "num_mel_bins"):
-                        rbln_config.num_mel_bins = feature_extractor.num_mel_bins
-                        break
-
-        if rbln_config.num_mel_bins is None:
-            raise ValueError("`num_mel_bins` should be specified!")
-
-        if rbln_config.max_length is None:
-            rbln_config.max_length = getattr(model_config, "max_length", None)
-            for feature_extractor in preprocessors:
-                if hasattr(feature_extractor, "max_length"):
-                    rbln_config.max_length = feature_extractor.max_length
-                    break
-
-        if rbln_config.max_length is None:
-            raise ValueError("`max_length` should be specified!")
-
-        input_info = [
-            (
-                "input_values",
-                [rbln_config.batch_size, rbln_config.max_length, rbln_config.num_mel_bins],
-                "float32",
-            ),
-        ]
-
-        rbln_config.set_compile_cfgs([RBLNCompileConfig(input_info=input_info)])
-        return rbln_config
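With the generic `RBLNModelForAudioClassification` removed, audio classification now lives in the model-specific AST class (note `modeling_audio_spectrogram_transformer.py +68` in the file list). A hedged usage sketch; the checkpoint name and the exact `rbln_*` keyword arguments are assumptions following the usual optimum-rbln `from_pretrained` pattern, not taken from this diff:

    from optimum.rbln import RBLNASTForAudioClassification

    model = RBLNASTForAudioClassification.from_pretrained(
        "MIT/ast-finetuned-audioset-10-10-0.4593",  # any AST checkpoint
        export=True,           # compile from the original transformers weights
        rbln_batch_size=1,
        rbln_max_length=1024,  # time frames; num_mel_bins is deprecated (see below)
    )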
optimum/rbln/transformers/modeling_outputs.py
@@ -24,6 +24,7 @@ class RBLNDecoderOnlyOutput(ModelOutput):
     logits: torch.FloatTensor = None
     generate_idx: torch.Tensor = None
     padded_cache_lengths: int = None
+    hidden_states: Tuple[torch.FloatTensor] = None


 @dataclass
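The new `hidden_states` field presumably follows transformers' convention of a per-layer tuple of tensors. A self-contained sketch of the dataclass semantics, using a simplified mirror of the class rather than the real runtime:

    from dataclasses import dataclass
    from typing import Tuple

    import torch
    from transformers.utils import ModelOutput


    @dataclass
    class RBLNDecoderOnlyOutput(ModelOutput):  # simplified mirror of the class above
        logits: torch.FloatTensor = None
        generate_idx: torch.Tensor = None
        padded_cache_lengths: int = None
        hidden_states: Tuple[torch.FloatTensor] = None


    out = RBLNDecoderOnlyOutput(
        logits=torch.zeros(1, 1, 32000),
        hidden_states=tuple(torch.zeros(1, 1, 4096) for _ in range(3)),
    )
    print(out.hidden_states[-1].shape)  # last entry = final layer, as in transformers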
optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py
@@ -12,10 +12,36 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from ...configuration_generic import RBLNModelForAudioClassificationConfig
+from typing import Any, Optional

+from ....configuration_utils import RBLNModelConfig
+from ....utils.deprecation import deprecate_kwarg

-class RBLNASTForAudioClassificationConfig(RBLNModelForAudioClassificationConfig):
+
+class RBLNASTForAudioClassificationConfig(RBLNModelConfig):
     """
     Configuration class for RBLNASTForAudioClassification.
     """
+
+    @deprecate_kwarg(old_name="num_mel_bins", version="0.10.0")
+    def __init__(
+        self,
+        batch_size: Optional[int] = None,
+        max_length: Optional[int] = None,
+        **kwargs: Any,
+    ):
+        """
+        Args:
+            batch_size (Optional[int]): The batch size for inference. Defaults to 1.
+            max_length (Optional[int]): Maximum length of the audio input in time dimension.
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
+
+        Raises:
+            ValueError: If batch_size is not a positive integer.
+        """
+        super().__init__(**kwargs)
+        self.batch_size = batch_size or 1
+        if not isinstance(self.batch_size, int) or self.batch_size < 0:
+            raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
+
+        self.max_length = max_length
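The decorator comes from the new `optimum/rbln/utils/deprecation.py` (+213 lines), whose body is not shown in this diff. As a rough guess at what `deprecate_kwarg(old_name=..., version=...)` plausibly does: accept the old keyword, warn with the removal version, and drop (or remap) it before calling the wrapped `__init__`. Everything below is an assumption, not the actual implementation:

    import functools
    import warnings


    def deprecate_kwarg(old_name: str, version: str, new_name: str = None):
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                if old_name in kwargs:
                    warnings.warn(
                        f"`{old_name}` is deprecated and will be removed in v{version}.",
                        FutureWarning,
                    )
                    value = kwargs.pop(old_name)
                    if new_name is not None:
                        kwargs[new_name] = value  # remap when a replacement exists
                return func(*args, **kwargs)

            return wrapper

        return decorator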