optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.5a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. optimum/rbln/__init__.py +48 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +50 -21
  4. optimum/rbln/diffusers/__init__.py +12 -0
  5. optimum/rbln/diffusers/configurations/__init__.py +3 -0
  6. optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
  7. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
  8. optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
  9. optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
  10. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
  11. optimum/rbln/diffusers/modeling_diffusers.py +1 -1
  12. optimum/rbln/diffusers/models/__init__.py +17 -3
  13. optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
  14. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
  15. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
  16. optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
  17. optimum/rbln/diffusers/models/controlnet.py +17 -2
  18. optimum/rbln/diffusers/models/transformers/prior_transformer.py +16 -2
  19. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +16 -1
  20. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +14 -1
  21. optimum/rbln/diffusers/models/unets/__init__.py +1 -0
  22. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +18 -2
  23. optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
  24. optimum/rbln/diffusers/pipelines/__init__.py +4 -0
  25. optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -2
  26. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
  27. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +13 -4
  28. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +13 -4
  29. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +13 -4
  30. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -4
  31. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +1 -1
  32. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
  33. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -2
  34. optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
  35. optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
  36. optimum/rbln/modeling.py +20 -45
  37. optimum/rbln/modeling_base.py +18 -14
  38. optimum/rbln/ops/__init__.py +1 -0
  39. optimum/rbln/ops/attn.py +10 -0
  40. optimum/rbln/ops/flash_attn.py +8 -0
  41. optimum/rbln/ops/moe.py +180 -0
  42. optimum/rbln/ops/sliding_window_attn.py +9 -0
  43. optimum/rbln/transformers/__init__.py +36 -0
  44. optimum/rbln/transformers/configuration_generic.py +0 -27
  45. optimum/rbln/transformers/modeling_attention_utils.py +156 -127
  46. optimum/rbln/transformers/modeling_generic.py +2 -61
  47. optimum/rbln/transformers/modeling_outputs.py +26 -0
  48. optimum/rbln/transformers/modeling_rope_utils.py +78 -42
  49. optimum/rbln/transformers/models/__init__.py +28 -0
  50. optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
  51. optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
  52. optimum/rbln/transformers/models/auto/auto_factory.py +1 -0
  53. optimum/rbln/transformers/models/bart/bart_architecture.py +24 -24
  54. optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
  55. optimum/rbln/transformers/models/bert/modeling_bert.py +86 -1
  56. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +42 -15
  57. optimum/rbln/transformers/models/clip/modeling_clip.py +40 -2
  58. optimum/rbln/transformers/models/colpali/colpali_architecture.py +14 -20
  59. optimum/rbln/transformers/models/colpali/configuration_colpali.py +12 -17
  60. optimum/rbln/transformers/models/colpali/modeling_colpali.py +66 -221
  61. optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +38 -23
  62. optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +107 -371
  63. optimum/rbln/transformers/models/decoderonly/__init__.py +2 -0
  64. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +128 -17
  65. optimum/rbln/transformers/models/decoderonly/configuration_lora.py +2 -2
  66. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +211 -89
  67. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +205 -64
  68. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +17 -9
  69. optimum/rbln/transformers/models/decoderonly/lora_architecture.py +1 -1
  70. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +194 -132
  71. optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +17 -0
  72. optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
  73. optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
  74. optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -36
  75. optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
  76. optimum/rbln/transformers/models/gemma2/__init__.py +16 -0
  77. optimum/rbln/transformers/models/gemma2/configuration_gemma2.py +45 -0
  78. optimum/rbln/transformers/models/gemma2/gemma2_architecture.py +83 -0
  79. optimum/rbln/transformers/models/gemma2/modeling_gemma2.py +101 -0
  80. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +23 -19
  81. optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +42 -70
  82. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +46 -31
  83. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +8 -34
  84. optimum/rbln/transformers/models/gpt_oss/__init__.py +16 -0
  85. optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +41 -0
  86. optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py +122 -0
  87. optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +165 -0
  88. optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +8 -5
  89. optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +7 -5
  90. optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +24 -9
  91. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -5
  92. optimum/rbln/transformers/models/llava/modeling_llava.py +37 -26
  93. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +3 -5
  94. optimum/rbln/transformers/models/midm/midm_architecture.py +29 -22
  95. optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -22
  96. optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
  97. optimum/rbln/transformers/models/opt/opt_architecture.py +1 -44
  98. optimum/rbln/transformers/models/paligemma/__init__.py +16 -0
  99. optimum/rbln/transformers/models/paligemma/configuration_paligemma.py +129 -0
  100. optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +564 -0
  101. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
  102. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +24 -24
  103. optimum/rbln/transformers/models/phi/phi_architecture.py +13 -21
  104. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +13 -1
  105. optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +2 -2
  106. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -28
  107. optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +6 -1
  108. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +11 -1
  109. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +278 -130
  110. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +43 -39
  111. optimum/rbln/transformers/models/qwen2_moe/__init__.py +16 -0
  112. optimum/rbln/transformers/models/qwen2_moe/configuration_qwen2_moe.py +38 -0
  113. optimum/rbln/transformers/models/qwen2_moe/modeling_qwen2_moe.py +68 -0
  114. optimum/rbln/transformers/models/qwen2_moe/qwen2_moe_architecture.py +94 -0
  115. optimum/rbln/transformers/models/qwen2_vl/__init__.py +6 -1
  116. optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +11 -1
  117. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +268 -111
  118. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +27 -35
  119. optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +0 -20
  120. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +7 -7
  121. optimum/rbln/transformers/models/qwen3_moe/__init__.py +16 -0
  122. optimum/rbln/transformers/models/qwen3_moe/configuration_qwen3_moe.py +38 -0
  123. optimum/rbln/transformers/models/qwen3_moe/modeling_qwen3_moe.py +68 -0
  124. optimum/rbln/transformers/models/qwen3_moe/qwen3_moe_architecture.py +100 -0
  125. optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
  126. optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
  127. optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
  128. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -4
  129. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +36 -12
  130. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +14 -12
  131. optimum/rbln/transformers/models/siglip/modeling_siglip.py +21 -19
  132. optimum/rbln/transformers/models/swin/configuration_swin.py +1 -6
  133. optimum/rbln/transformers/models/swin/modeling_swin.py +17 -4
  134. optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
  135. optimum/rbln/transformers/models/t5/t5_architecture.py +16 -17
  136. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +25 -10
  137. optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -3
  138. optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
  139. optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
  140. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +60 -8
  141. optimum/rbln/transformers/models/whisper/generation_whisper.py +48 -14
  142. optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
  143. optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -3
  144. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +53 -0
  145. optimum/rbln/transformers/utils/rbln_quantization.py +29 -12
  146. optimum/rbln/utils/deprecation.py +213 -0
  147. optimum/rbln/utils/hub.py +14 -3
  148. optimum/rbln/utils/import_utils.py +23 -2
  149. optimum/rbln/utils/runtime_utils.py +42 -6
  150. optimum/rbln/utils/submodule.py +27 -1
  151. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/METADATA +6 -6
  152. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/RECORD +155 -129
  153. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/WHEEL +1 -1
  154. optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +0 -233
  155. optimum/rbln/utils/depreacate_utils.py +0 -16
  156. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/entry_points.txt +0 -0
  157. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.5a4.dist-info}/licenses/LICENSE +0 -0
@@ -12,9 +12,10 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ from dataclasses import asdict, dataclass
15
16
  from typing import Any, Dict, List, Literal, Optional, Union, get_args
16
17
 
17
- from ....configuration_utils import RBLNModelConfig
18
+ from ....configuration_utils import RBLNModelConfig, RBLNSerializableConfigProtocol
18
19
  from ....utils.logging import get_logger
19
20
  from ...utils.rbln_quantization import RBLNQuantizationConfig
20
21
  from .configuration_lora import RBLNLoRAConfig
@@ -58,6 +59,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
58
59
  sliding_window_layers: Optional[List[int]] = None,
59
60
  phases: Optional[List[PhaseType]] = None,
60
61
  logits_to_keep: Optional[int] = None,
62
+ output_hidden_states: Optional[bool] = None,
63
+ kvcache_metas: Optional[List["KVCacheMeta"]] = None,
61
64
  **kwargs,
62
65
  ):
63
66
  """
@@ -92,8 +95,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
92
95
  processing input sequences. Defaults to 128. Must be a positive integer
93
96
  divisible by 64. Affects prefill performance and memory usage.
94
97
  kvcache_num_blocks (Optional[int]): The total number of blocks to allocate for the
95
- PagedAttention KV cache. See the "KV Cache Number of Blocks (`kvcache_num_blocks`)"
96
- section below for details.
98
+ PagedAttention KV cache at compile time. Defaults to 0 (automatically determined).
99
+ See the "KV Cache Number of Blocks (`kvcache_num_blocks`)" section below for details.
97
100
  decoder_batch_sizes (Optional[List[int]]): A list of batch sizes for which separate decoder models will be compiled.
98
101
  This allows the model to handle varying batch sizes efficiently during generation. If not specified,
99
102
  defaults to a list containing only the model's main batch size. When specifying multiple batch sizes:
@@ -112,6 +115,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
112
115
  ["prefill", "decode"] if DecoderOnlyModelForCausalLM is used.
113
116
  logits_to_keep (Optional[int]): The number of logits to keep for the decoder. If set to 0, the decoder will keep all logits.
114
117
  Defaults to 0 if DecoderOnlyModel is used, 1 if DecoderOnlyModelForCausalLM is used.
118
+ output_hidden_states (Optional[bool]): Whether to output the hidden states of the decoder. Defaults to False.
119
+ kvcache_metas (Optional[List["KVCacheMeta"]]): The metadata for the KV cache tensors. Handled internally if not provided. Defaults to None.
115
120
  kwargs: Additional arguments passed to the parent RBLNModelConfig.
116
121
 
117
122
  Raises:
@@ -150,17 +155,15 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
150
155
 
151
156
 
152
157
  KV Cache Number of Blocks:
153
- `kvcache_num_blocks` controls the total number of memory blocks allocated for the PagedAttention KV cache.
154
- Each block holds `kvcache_block_size` tokens of Key and Value states.
155
-
156
- - **Automatic Estimation (Default)**: If `kvcache_num_blocks` is `None`, the system estimates
157
- the maximum number of blocks that can fit into the available RBLN device memory. This
158
- calculation considers the model size (kernel memory), required buffer memory, the number
159
- of layers and heads, `kvcache_block_size`, tensor parallelism, and available RBLN NPU DRAM.
160
- This aims to maximize cache capacity for potentially better performance with long sequences
161
- or larger batches without manual tuning.
162
- - **Manual Setting**: You can explicitly set the number of blocks. This provides finer control
163
- but requires careful consideration of memory limits. Setting it too high may lead to
158
+ `kvcache_num_blocks` controls the total number of memory blocks allocated for the PagedAttention KV cache
159
+ at compile time. Each block holds `kvcache_block_size` tokens of Key and Value states.
160
+
161
+ - **Automatic Determination (Default)**: If `kvcache_num_blocks` is `0` (default), the number of blocks
162
+ is automatically determined during compilation to fit within the available DRAM on the NPU. This allows
163
+ the model to utilize the remaining memory after compilation without manual tuning, providing optimal
164
+ cache capacity for better performance with long sequences or larger batches.
165
+ - **Manual Setting**: You can explicitly set the number of blocks to a positive integer. This provides
166
+ finer control but requires careful consideration of memory limits. Setting it too high may lead to
164
167
  compilation errors if it exceeds available memory. The system will issue warnings if your
165
168
  setting exceeds the estimated maximum.
166
169
  - **Performance Impact**: A larger number of blocks reduces the likelihood of cache eviction,
@@ -173,7 +176,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
173
176
  are violated (e.g., if `kvcache_num_blocks` is less than `batch_size` when using Flash Attention).
174
177
 
175
178
  The optimal value depends on the specific model, task, hardware, and desired trade-off
176
- between performance and memory usage. The automatic estimation provides a robust starting point.
179
+ between performance and memory usage. Automatic determination (default) provides a robust starting point
180
+ that adapts to the available DRAM on the NPU at compile time.
177
181
  """
178
182
 
179
183
  super().__init__(**kwargs)
@@ -220,7 +224,7 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
220
224
  if self.prefill_chunk_size % 64 != 0 or self.prefill_chunk_size <= 0:
221
225
  raise ValueError("`prefill_chunk_size` must be a positive integer divisible by 64.")
222
226
 
223
- self.kvcache_num_blocks = kvcache_num_blocks
227
+ self.kvcache_num_blocks = kvcache_num_blocks if kvcache_num_blocks is not None else 0
224
228
  self.cache_impl = cache_impl or "static"
225
229
  self.sliding_window = sliding_window
226
230
  self.sliding_window_layers = sliding_window_layers or []
@@ -232,6 +236,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
232
236
  if self.logits_to_keep is not None and self.logits_to_keep > 1:
233
237
  raise NotImplementedError("`logits_to_keep` > 1 is currently not supported for RBLN models.")
234
238
 
239
+ self.output_hidden_states = output_hidden_states or False
240
+
235
241
  self.decoder_batch_sizes = None
236
242
  if "decode" in self.phases:
237
243
  self.decoder_batch_sizes = decoder_batch_sizes
@@ -253,6 +259,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
253
259
  # Larger batch size should be at the beginning of the list.
254
260
  self.decoder_batch_sizes.sort(reverse=True)
255
261
 
262
+ self.kvcache_metas: List["KVCacheMeta"] = kvcache_metas or []
263
+
256
264
  @staticmethod
257
265
  def validate_phases_type(phases: List[PhaseType]):
258
266
  if not isinstance(phases, list):
@@ -274,13 +282,33 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
274
282
 
275
283
  @property
276
284
  def use_lora(self):
277
- """Check if LoRA is enabled for this configuration."""
278
285
  return self.lora_config is not None
279
286
 
280
287
  @property
281
288
  def can_generate(self) -> bool:
282
289
  return "decode" in self.phases
283
290
 
291
+ @property
292
+ def nbits_per_param(self) -> int:
293
+ if self.quantization:
294
+ return self.quantization.nbits_per_param
295
+ return 16
296
+
297
+ @property
298
+ def is_auto_num_blocks(self) -> bool:
299
+ """Returns True if kvcache_num_blocks will be automatically determined during compilation to fit within the available DRAM on the NPU."""
300
+ return self.kvcache_num_blocks == 0
301
+
302
+ @property
303
+ def num_full_blocks(self) -> int:
304
+ return (self.max_seq_len // self.kvcache_block_size) * self.batch_size
305
+
306
+ @property
307
+ def num_min_blocks(self) -> int:
308
+ if self.attn_impl == "flash_attn":
309
+ return min(self.max_seq_len // self.kvcache_block_size + 1, self.num_full_blocks)
310
+ return self.batch_size
311
+
284
312
 
285
313
  class RBLNDecoderOnlyModelForCausalLMConfig(RBLNDecoderOnlyModelConfig):
286
314
  """
@@ -293,3 +321,86 @@ class RBLNDecoderOnlyModelForCausalLMConfig(RBLNDecoderOnlyModelConfig):
293
321
 
294
322
  _default_phases = ["prefill", "decode"]
295
323
  _default_logits_to_keep = 1
324
+
325
+
326
+ @dataclass
327
+ class KVCacheMeta(RBLNSerializableConfigProtocol):
328
+ """
329
+ KVCacheMeta contains metadata describing the key-value (KV) cache tensor for a specific transformer layer.
330
+
331
+ This is used during compilation and runtime on RBLN devices to manage memory and configure the
332
+ static or dynamic characteristics of the cache implementation for decoder-only models.
333
+
334
+ Attributes:
335
+ name (str): Logical name of the KV cache tensor.
336
+ layer_index (int): Index of the transformer layer corresponding to this cache.
337
+ shape (list[int]): The 4D shape of the cache tensor:
338
+ [num_blocks, num_heads, block_size, head_dim]. The number of blocks may be dynamic or static
339
+ depending on model configuration.
340
+ layer_type (str): String describing the attention/cache algorithm (e.g., "full_attention", "sliding_attention").
341
+ is_auto (bool): Whether the number of blocks is automatically determined during compilation (True) or manually specified (False).
342
+ In both cases, the KV cache size is fixed at compile time.
343
+ dtype (str): Data type of the cache buffer ("float16", "float32", etc.).
344
+ """
345
+
346
+ name: str
347
+ layer_index: int
348
+ shape: list[int] # (num_blocks, num_heads, block_size(seq), head_dim)
349
+ layer_type: str
350
+ is_auto: bool
351
+ dtype: str
352
+
353
+ def _prepare_for_serialization(self) -> dict[str, Any]:
354
+ return asdict(self)
355
+
356
+ @property
357
+ def compile_shape(self):
358
+ return [1, self.shape[1], self.shape[2], self.shape[3]] if self.can_resize else self.shape
359
+
360
+ @property
361
+ def can_resize(self):
362
+ return self.is_auto and self.layer_type == "full_attention"
363
+
364
+ @property
365
+ def num_blocks(self) -> int:
366
+ return self.shape[0]
367
+
368
+ @property
369
+ def block_size(self) -> int:
370
+ return self.shape[2]
371
+
372
+ @staticmethod
373
+ def make(
374
+ name: str,
375
+ layer_index: int,
376
+ num_key_value_heads: int,
377
+ head_dim: int,
378
+ dtype: str,
379
+ rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
380
+ ) -> "KVCacheMeta":
381
+ assert len(rbln_config.compile_cfgs) == 0, "KVCacheMeta cannot be created from rbln_config with compile_cfgs"
382
+
383
+ if rbln_config.sliding_window is not None and layer_index in rbln_config.sliding_window_layers:
384
+ layer_type = "sliding_attention"
385
+ block_size = rbln_config.sliding_window
386
+ num_blocks = rbln_config.batch_size
387
+ is_auto = False
388
+
389
+ else:
390
+ layer_type = "full_attention"
391
+ block_size = rbln_config.kvcache_block_size
392
+
393
+ if rbln_config.is_auto_num_blocks:
394
+ num_blocks = rbln_config.num_full_blocks
395
+ is_auto = True
396
+ else:
397
+ num_blocks = rbln_config.kvcache_num_blocks
398
+ is_auto = False
399
+
400
+ shape = [num_blocks, num_key_value_heads, block_size, head_dim]
401
+ if num_blocks <= 0:
402
+ raise ValueError("`num_blocks` must be greater than 0 when using KV cache.")
403
+
404
+ return KVCacheMeta(
405
+ name=name, layer_index=layer_index, shape=shape, layer_type=layer_type, is_auto=is_auto, dtype=dtype
406
+ )
@@ -46,7 +46,7 @@ class RBLNLoRAAdapterConfig(RBLNSerializableConfigProtocol):
46
46
  model = RBLNLlamaForCausalLM.from_pretrained(
47
47
  model_id,
48
48
  rbln_config=RBLNLlamaForCausalLMConfig(lora_config=lora_config, tensor_parallel_size=tp_size, max_seq_len=8192),
49
- torch_dtype="auto",
49
+ dtype="auto",
50
50
  )
51
51
 
52
52
 
@@ -183,7 +183,7 @@ class RBLNLoRAAdapterConfig(RBLNSerializableConfigProtocol):
183
183
  f"Failed to download LoRA adapter '{path.as_posix()}' from HuggingFace Hub. "
184
184
  f"Please check if the model ID is correct or provide a valid local path. "
185
185
  f"Error: {e}"
186
- )
186
+ ) from e
187
187
 
188
188
  def _load_adapter_config(self) -> Dict[str, Any]:
189
189
  """