optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.5a4__py3-none-any.whl
This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- optimum/rbln/__init__.py +48 -0
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +50 -21
- optimum/rbln/diffusers/__init__.py +12 -0
- optimum/rbln/diffusers/configurations/__init__.py +3 -0
- optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
- optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
- optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
- optimum/rbln/diffusers/modeling_diffusers.py +1 -1
- optimum/rbln/diffusers/models/__init__.py +17 -3
- optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
- optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
- optimum/rbln/diffusers/models/controlnet.py +17 -2
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +16 -2
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +16 -1
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +14 -1
- optimum/rbln/diffusers/models/unets/__init__.py +1 -0
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +18 -2
- optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
- optimum/rbln/diffusers/pipelines/__init__.py +4 -0
- optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -2
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +13 -4
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +13 -4
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +13 -4
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -4
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +1 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -2
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
- optimum/rbln/modeling.py +20 -45
- optimum/rbln/modeling_base.py +18 -14
- optimum/rbln/ops/__init__.py +1 -0
- optimum/rbln/ops/attn.py +10 -0
- optimum/rbln/ops/flash_attn.py +8 -0
- optimum/rbln/ops/moe.py +180 -0
- optimum/rbln/ops/sliding_window_attn.py +9 -0
- optimum/rbln/transformers/__init__.py +36 -0
- optimum/rbln/transformers/configuration_generic.py +0 -27
- optimum/rbln/transformers/modeling_attention_utils.py +156 -127
- optimum/rbln/transformers/modeling_generic.py +2 -61
- optimum/rbln/transformers/modeling_outputs.py +26 -0
- optimum/rbln/transformers/modeling_rope_utils.py +78 -42
- optimum/rbln/transformers/models/__init__.py +28 -0
- optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
- optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
- optimum/rbln/transformers/models/auto/auto_factory.py +1 -0
- optimum/rbln/transformers/models/bart/bart_architecture.py +24 -24
- optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
- optimum/rbln/transformers/models/bert/modeling_bert.py +86 -1
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +42 -15
- optimum/rbln/transformers/models/clip/modeling_clip.py +40 -2
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +14 -20
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +12 -17
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +66 -221
- optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +38 -23
- optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +107 -371
- optimum/rbln/transformers/models/decoderonly/__init__.py +2 -0
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +128 -17
- optimum/rbln/transformers/models/decoderonly/configuration_lora.py +2 -2
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +211 -89
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +205 -64
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +17 -9
- optimum/rbln/transformers/models/decoderonly/lora_architecture.py +1 -1
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +194 -132
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +17 -0
- optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -36
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
- optimum/rbln/transformers/models/gemma2/__init__.py +16 -0
- optimum/rbln/transformers/models/gemma2/configuration_gemma2.py +45 -0
- optimum/rbln/transformers/models/gemma2/gemma2_architecture.py +83 -0
- optimum/rbln/transformers/models/gemma2/modeling_gemma2.py +101 -0
- optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +23 -19
- optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +42 -70
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +46 -31
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +8 -34
- optimum/rbln/transformers/models/gpt_oss/__init__.py +16 -0
- optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +41 -0
- optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py +122 -0
- optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +165 -0
- optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +8 -5
- optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +7 -5
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +24 -9
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -5
- optimum/rbln/transformers/models/llava/modeling_llava.py +37 -26
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +3 -5
- optimum/rbln/transformers/models/midm/midm_architecture.py +29 -22
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -22
- optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
- optimum/rbln/transformers/models/opt/opt_architecture.py +1 -44
- optimum/rbln/transformers/models/paligemma/__init__.py +16 -0
- optimum/rbln/transformers/models/paligemma/configuration_paligemma.py +129 -0
- optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +564 -0
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
- optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +24 -24
- optimum/rbln/transformers/models/phi/phi_architecture.py +13 -21
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +13 -1
- optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +2 -2
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -28
- optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +6 -1
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +11 -1
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +278 -130
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +43 -39
- optimum/rbln/transformers/models/qwen2_moe/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen2_moe/configuration_qwen2_moe.py +38 -0
- optimum/rbln/transformers/models/qwen2_moe/modeling_qwen2_moe.py +68 -0
- optimum/rbln/transformers/models/qwen2_moe/qwen2_moe_architecture.py +94 -0
- optimum/rbln/transformers/models/qwen2_vl/__init__.py +6 -1
- optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +11 -1
- optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +268 -111
- optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +27 -35
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +0 -20
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +7 -7
- optimum/rbln/transformers/models/qwen3_moe/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen3_moe/configuration_qwen3_moe.py +38 -0
- optimum/rbln/transformers/models/qwen3_moe/modeling_qwen3_moe.py +68 -0
- optimum/rbln/transformers/models/qwen3_moe/qwen3_moe_architecture.py +100 -0
- optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
- optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
- optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -4
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +36 -12
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +14 -12
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +21 -19
- optimum/rbln/transformers/models/swin/configuration_swin.py +1 -6
- optimum/rbln/transformers/models/swin/modeling_swin.py +17 -4
- optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
- optimum/rbln/transformers/models/t5/t5_architecture.py +16 -17
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +25 -10
- optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -3
- optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
- optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +60 -8
- optimum/rbln/transformers/models/whisper/generation_whisper.py +48 -14
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
- optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -3
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +53 -0
- optimum/rbln/transformers/utils/rbln_quantization.py +29 -12
- optimum/rbln/utils/deprecation.py +213 -0
- optimum/rbln/utils/hub.py +14 -3
- optimum/rbln/utils/import_utils.py +23 -2
- optimum/rbln/utils/runtime_utils.py +42 -6
- optimum/rbln/utils/submodule.py +27 -1
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/METADATA +6 -6
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/RECORD +155 -129
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/WHEEL +1 -1
- optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +0 -233
- optimum/rbln/utils/depreacate_utils.py +0 -16
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/entry_points.txt +0 -0
- {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/licenses/LICENSE +0 -0
|
@@ -12,9 +12,10 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
from dataclasses import asdict, dataclass
|
|
15
16
|
from typing import Any, Dict, List, Literal, Optional, Union, get_args
|
|
16
17
|
|
|
17
|
-
from ....configuration_utils import RBLNModelConfig
|
|
18
|
+
from ....configuration_utils import RBLNModelConfig, RBLNSerializableConfigProtocol
|
|
18
19
|
from ....utils.logging import get_logger
|
|
19
20
|
from ...utils.rbln_quantization import RBLNQuantizationConfig
|
|
20
21
|
from .configuration_lora import RBLNLoRAConfig
|
|
@@ -58,6 +59,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
|
|
|
58
59
|
sliding_window_layers: Optional[List[int]] = None,
|
|
59
60
|
phases: Optional[List[PhaseType]] = None,
|
|
60
61
|
logits_to_keep: Optional[int] = None,
|
|
62
|
+
output_hidden_states: Optional[bool] = None,
|
|
63
|
+
kvcache_metas: Optional[List["KVCacheMeta"]] = None,
|
|
61
64
|
**kwargs,
|
|
62
65
|
):
|
|
63
66
|
"""
|
|
@@ -92,8 +95,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
|
|
|
92
95
|
processing input sequences. Defaults to 128. Must be a positive integer
|
|
93
96
|
divisible by 64. Affects prefill performance and memory usage.
|
|
94
97
|
kvcache_num_blocks (Optional[int]): The total number of blocks to allocate for the
|
|
95
|
-
PagedAttention KV cache
|
|
96
|
-
section below for details.
|
|
98
|
+
PagedAttention KV cache at compile time. Defaults to 0 (automatically determined).
|
|
99
|
+
See the "KV Cache Number of Blocks (`kvcache_num_blocks`)" section below for details.
|
|
97
100
|
decoder_batch_sizes (Optional[List[int]]): A list of batch sizes for which separate decoder models will be compiled.
|
|
98
101
|
This allows the model to handle varying batch sizes efficiently during generation. If not specified,
|
|
99
102
|
defaults to a list containing only the model's main batch size. When specifying multiple batch sizes:
|
|
@@ -112,6 +115,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
|
|
|
112
115
|
["prefill", "decode"] if DecoderOnlyModelForCausalLM is used.
|
|
113
116
|
logits_to_keep (Optional[int]): The number of logits to keep for the decoder. If set to 0, the decoder will keep all logits.
|
|
114
117
|
Defaults to 0 if DecoderOnlyModel is used, 1 if DecoderOnlyModelForCausalLM is used.
|
|
118
|
+
output_hidden_states (Optional[bool]): Whether to output the hidden states of the decoder. Defaults to False.
|
|
119
|
+
kvcache_metas (Optional[List["KVCacheMeta"]]): The metadata for the KV cache tensors. Handled internally if not provided. Defaults to None.
|
|
115
120
|
kwargs: Additional arguments passed to the parent RBLNModelConfig.
|
|
116
121
|
|
|
117
122
|
Raises:
|
|
@@ -150,17 +155,15 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
|
|
|
150
155
|
|
|
151
156
|
|
|
152
157
|
KV Cache Number of Blocks:
|
|
153
|
-
`kvcache_num_blocks` controls the total number of memory blocks allocated for the PagedAttention KV cache
|
|
154
|
-
Each block holds `kvcache_block_size` tokens of Key and Value states.
|
|
155
|
-
|
|
156
|
-
- **Automatic
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
- **Manual Setting**: You can explicitly set the number of blocks. This provides finer control
|
|
163
|
-
but requires careful consideration of memory limits. Setting it too high may lead to
|
|
158
|
+
`kvcache_num_blocks` controls the total number of memory blocks allocated for the PagedAttention KV cache
|
|
159
|
+
at compile time. Each block holds `kvcache_block_size` tokens of Key and Value states.
|
|
160
|
+
|
|
161
|
+
- **Automatic Determination (Default)**: If `kvcache_num_blocks` is `0` (default), the number of blocks
|
|
162
|
+
is automatically determined during compilation to fit within the available DRAM on the NPU. This allows
|
|
163
|
+
the model to utilize the remaining memory after compilation without manual tuning, providing optimal
|
|
164
|
+
cache capacity for better performance with long sequences or larger batches.
|
|
165
|
+
- **Manual Setting**: You can explicitly set the number of blocks to a positive integer. This provides
|
|
166
|
+
finer control but requires careful consideration of memory limits. Setting it too high may lead to
|
|
164
167
|
compilation errors if it exceeds available memory. The system will issue warnings if your
|
|
165
168
|
setting exceeds the estimated maximum.
|
|
166
169
|
- **Performance Impact**: A larger number of blocks reduces the likelihood of cache eviction,
|
|
@@ -173,7 +176,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
|
|
|
173
176
|
are violated (e.g., if `kvcache_num_blocks` is less than `batch_size` when using Flash Attention).
|
|
174
177
|
|
|
175
178
|
The optimal value depends on the specific model, task, hardware, and desired trade-off
|
|
176
|
-
between performance and memory usage.
|
|
179
|
+
between performance and memory usage. Automatic determination (default) provides a robust starting point
|
|
180
|
+
that adapts to the available DRAM on the NPU at compile time.
|
|
177
181
|
"""
|
|
178
182
|
|
|
179
183
|
super().__init__(**kwargs)
|
|
@@ -220,7 +224,7 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
|
|
|
220
224
|
if self.prefill_chunk_size % 64 != 0 or self.prefill_chunk_size <= 0:
|
|
221
225
|
raise ValueError("`prefill_chunk_size` must be a positive integer divisible by 64.")
|
|
222
226
|
|
|
223
|
-
self.kvcache_num_blocks = kvcache_num_blocks
|
|
227
|
+
self.kvcache_num_blocks = kvcache_num_blocks if kvcache_num_blocks is not None else 0
|
|
224
228
|
self.cache_impl = cache_impl or "static"
|
|
225
229
|
self.sliding_window = sliding_window
|
|
226
230
|
self.sliding_window_layers = sliding_window_layers or []
|
|
@@ -232,6 +236,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
|
|
|
232
236
|
if self.logits_to_keep is not None and self.logits_to_keep > 1:
|
|
233
237
|
raise NotImplementedError("`logits_to_keep` > 1 is currently not supported for RBLN models.")
|
|
234
238
|
|
|
239
|
+
self.output_hidden_states = output_hidden_states or False
|
|
240
|
+
|
|
235
241
|
self.decoder_batch_sizes = None
|
|
236
242
|
if "decode" in self.phases:
|
|
237
243
|
self.decoder_batch_sizes = decoder_batch_sizes
|
|
@@ -253,6 +259,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
|
|
|
253
259
|
# Larger batch size should be at the beginning of the list.
|
|
254
260
|
self.decoder_batch_sizes.sort(reverse=True)
|
|
255
261
|
|
|
262
|
+
self.kvcache_metas: List["KVCacheMeta"] = kvcache_metas or []
|
|
263
|
+
|
|
256
264
|
@staticmethod
|
|
257
265
|
def validate_phases_type(phases: List[PhaseType]):
|
|
258
266
|
if not isinstance(phases, list):
|
|
@@ -274,13 +282,33 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
|
|
|
274
282
|
|
|
275
283
|
@property
|
|
276
284
|
def use_lora(self):
|
|
277
|
-
"""Check if LoRA is enabled for this configuration."""
|
|
278
285
|
return self.lora_config is not None
|
|
279
286
|
|
|
280
287
|
@property
|
|
281
288
|
def can_generate(self) -> bool:
|
|
282
289
|
return "decode" in self.phases
|
|
283
290
|
|
|
291
|
+
@property
|
|
292
|
+
def nbits_per_param(self) -> int:
|
|
293
|
+
if self.quantization:
|
|
294
|
+
return self.quantization.nbits_per_param
|
|
295
|
+
return 16
|
|
296
|
+
|
|
297
|
+
@property
|
|
298
|
+
def is_auto_num_blocks(self) -> bool:
|
|
299
|
+
"""Returns True if kvcache_num_blocks will be automatically determined during compilation to fit within the available DRAM on the NPU."""
|
|
300
|
+
return self.kvcache_num_blocks == 0
|
|
301
|
+
|
|
302
|
+
@property
|
|
303
|
+
def num_full_blocks(self) -> int:
|
|
304
|
+
return (self.max_seq_len // self.kvcache_block_size) * self.batch_size
|
|
305
|
+
|
|
306
|
+
@property
|
|
307
|
+
def num_min_blocks(self) -> int:
|
|
308
|
+
if self.attn_impl == "flash_attn":
|
|
309
|
+
return min(self.max_seq_len // self.kvcache_block_size + 1, self.num_full_blocks)
|
|
310
|
+
return self.batch_size
|
|
311
|
+
|
|
284
312
|
|
|
285
313
|
class RBLNDecoderOnlyModelForCausalLMConfig(RBLNDecoderOnlyModelConfig):
|
|
286
314
|
"""
|
|
@@ -293,3 +321,86 @@ class RBLNDecoderOnlyModelForCausalLMConfig(RBLNDecoderOnlyModelConfig):
|
|
|
293
321
|
|
|
294
322
|
_default_phases = ["prefill", "decode"]
|
|
295
323
|
_default_logits_to_keep = 1
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
@dataclass
|
|
327
|
+
class KVCacheMeta(RBLNSerializableConfigProtocol):
|
|
328
|
+
"""
|
|
329
|
+
KVCacheMeta contains metadata describing the key-value (KV) cache tensor for a specific transformer layer.
|
|
330
|
+
|
|
331
|
+
This is used during compilation and runtime on RBLN devices to manage memory and configure the
|
|
332
|
+
static or dynamic characteristics of the cache implementation for decoder-only models.
|
|
333
|
+
|
|
334
|
+
Attributes:
|
|
335
|
+
name (str): Logical name of the KV cache tensor.
|
|
336
|
+
layer_index (int): Index of the transformer layer corresponding to this cache.
|
|
337
|
+
shape (list[int]): The 4D shape of the cache tensor:
|
|
338
|
+
[num_blocks, num_heads, block_size, head_dim]. The number of blocks may be dynamic or static
|
|
339
|
+
depending on model configuration.
|
|
340
|
+
layer_type (str): String describing the attention/cache algorithm (e.g., "full_attention", "sliding_attention").
|
|
341
|
+
is_auto (bool): Whether the number of blocks is automatically determined during compilation (True) or manually specified (False).
|
|
342
|
+
In both cases, the KV cache size is fixed at compile time.
|
|
343
|
+
dtype (str): Data type of the cache buffer ("float16", "float32", etc.).
|
|
344
|
+
"""
|
|
345
|
+
|
|
346
|
+
name: str
|
|
347
|
+
layer_index: int
|
|
348
|
+
shape: list[int] # (num_blocks, num_heads, block_size(seq), head_dim)
|
|
349
|
+
layer_type: str
|
|
350
|
+
is_auto: bool
|
|
351
|
+
dtype: str
|
|
352
|
+
|
|
353
|
+
def _prepare_for_serialization(self) -> dict[str, Any]:
|
|
354
|
+
return asdict(self)
|
|
355
|
+
|
|
356
|
+
@property
|
|
357
|
+
def compile_shape(self):
|
|
358
|
+
return [1, self.shape[1], self.shape[2], self.shape[3]] if self.can_resize else self.shape
|
|
359
|
+
|
|
360
|
+
@property
|
|
361
|
+
def can_resize(self):
|
|
362
|
+
return self.is_auto and self.layer_type == "full_attention"
|
|
363
|
+
|
|
364
|
+
@property
|
|
365
|
+
def num_blocks(self) -> int:
|
|
366
|
+
return self.shape[0]
|
|
367
|
+
|
|
368
|
+
@property
|
|
369
|
+
def block_size(self) -> int:
|
|
370
|
+
return self.shape[2]
|
|
371
|
+
|
|
372
|
+
@staticmethod
|
|
373
|
+
def make(
|
|
374
|
+
name: str,
|
|
375
|
+
layer_index: int,
|
|
376
|
+
num_key_value_heads: int,
|
|
377
|
+
head_dim: int,
|
|
378
|
+
dtype: str,
|
|
379
|
+
rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
|
|
380
|
+
) -> "KVCacheMeta":
|
|
381
|
+
assert len(rbln_config.compile_cfgs) == 0, "KVCacheMeta cannot be created from rbln_config with compile_cfgs"
|
|
382
|
+
|
|
383
|
+
if rbln_config.sliding_window is not None and layer_index in rbln_config.sliding_window_layers:
|
|
384
|
+
layer_type = "sliding_attention"
|
|
385
|
+
block_size = rbln_config.sliding_window
|
|
386
|
+
num_blocks = rbln_config.batch_size
|
|
387
|
+
is_auto = False
|
|
388
|
+
|
|
389
|
+
else:
|
|
390
|
+
layer_type = "full_attention"
|
|
391
|
+
block_size = rbln_config.kvcache_block_size
|
|
392
|
+
|
|
393
|
+
if rbln_config.is_auto_num_blocks:
|
|
394
|
+
num_blocks = rbln_config.num_full_blocks
|
|
395
|
+
is_auto = True
|
|
396
|
+
else:
|
|
397
|
+
num_blocks = rbln_config.kvcache_num_blocks
|
|
398
|
+
is_auto = False
|
|
399
|
+
|
|
400
|
+
shape = [num_blocks, num_key_value_heads, block_size, head_dim]
|
|
401
|
+
if num_blocks <= 0:
|
|
402
|
+
raise ValueError("`num_blocks` must be greater than 0 when using KV cache.")
|
|
403
|
+
|
|
404
|
+
return KVCacheMeta(
|
|
405
|
+
name=name, layer_index=layer_index, shape=shape, layer_type=layer_type, is_auto=is_auto, dtype=dtype
|
|
406
|
+
)
|
|
@@ -46,7 +46,7 @@ class RBLNLoRAAdapterConfig(RBLNSerializableConfigProtocol):
|
|
|
46
46
|
model = RBLNLlamaForCausalLM.from_pretrained(
|
|
47
47
|
model_id,
|
|
48
48
|
rbln_config=RBLNLlamaForCausalLMConfig(lora_config=lora_config, tensor_parallel_size=tp_size, max_seq_len=8192),
|
|
49
|
-
|
|
49
|
+
dtype="auto",
|
|
50
50
|
)
|
|
51
51
|
|
|
52
52
|
|
|
@@ -183,7 +183,7 @@ class RBLNLoRAAdapterConfig(RBLNSerializableConfigProtocol):
|
|
|
183
183
|
f"Failed to download LoRA adapter '{path.as_posix()}' from HuggingFace Hub. "
|
|
184
184
|
f"Please check if the model ID is correct or provide a valid local path. "
|
|
185
185
|
f"Error: {e}"
|
|
186
|
-
)
|
|
186
|
+
) from e
|
|
187
187
|
|
|
188
188
|
def _load_adapter_config(self) -> Dict[str, Any]:
|
|
189
189
|
"""
|