optimum-rbln 0.8.1rc1__py3-none-any.whl → 0.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of optimum-rbln was flagged as a potentially problematic release; details of that flag are not included in this diff.

Files changed (119)
  1. optimum/rbln/__init__.py +58 -9
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +24 -5
  4. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +2 -2
  5. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +2 -2
  6. optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +2 -2
  7. optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +2 -2
  8. optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +5 -2
  9. optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +2 -2
  10. optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +2 -2
  11. optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +2 -2
  12. optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +3 -3
  13. optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +2 -2
  14. optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +4 -4
  15. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +2 -2
  16. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +2 -2
  17. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +2 -2
  18. optimum/rbln/diffusers/modeling_diffusers.py +4 -5
  19. optimum/rbln/diffusers/models/__init__.py +3 -13
  20. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +1 -0
  21. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1 -0
  22. optimum/rbln/diffusers/models/autoencoders/vq_model.py +1 -0
  23. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +1 -1
  24. optimum/rbln/diffusers/pipelines/__init__.py +1 -5
  25. optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +12 -4
  26. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +4 -28
  27. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
  28. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -1
  29. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -6
  30. optimum/rbln/modeling.py +4 -5
  31. optimum/rbln/modeling_base.py +18 -14
  32. optimum/rbln/ops/kv_cache_update.py +5 -0
  33. optimum/rbln/ops/linear.py +7 -0
  34. optimum/rbln/transformers/__init__.py +60 -0
  35. optimum/rbln/transformers/configuration_generic.py +4 -4
  36. optimum/rbln/transformers/modeling_attention_utils.py +252 -0
  37. optimum/rbln/transformers/modeling_generic.py +1 -4
  38. optimum/rbln/transformers/models/__init__.py +45 -30
  39. optimum/rbln/transformers/models/bart/bart_architecture.py +2 -7
  40. optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +2 -2
  41. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +1 -5
  42. optimum/rbln/transformers/models/clip/configuration_clip.py +14 -3
  43. optimum/rbln/transformers/models/clip/modeling_clip.py +123 -28
  44. optimum/rbln/transformers/models/colpali/colpali_architecture.py +1 -4
  45. optimum/rbln/transformers/models/colpali/configuration_colpali.py +2 -2
  46. optimum/rbln/transformers/models/colpali/modeling_colpali.py +2 -10
  47. optimum/rbln/transformers/models/decoderonly/__init__.py +2 -2
  48. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +214 -45
  49. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +323 -454
  50. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +579 -362
  51. optimum/rbln/transformers/models/exaone/exaone_architecture.py +17 -42
  52. optimum/rbln/transformers/models/gemma/__init__.py +2 -2
  53. optimum/rbln/transformers/models/gemma/configuration_gemma.py +9 -1
  54. optimum/rbln/transformers/models/gemma/gemma_architecture.py +3 -44
  55. optimum/rbln/transformers/models/gemma/modeling_gemma.py +22 -1
  56. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +21 -9
  57. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +9 -63
  58. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +200 -292
  59. optimum/rbln/transformers/models/gpt2/__init__.py +2 -2
  60. optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +31 -3
  61. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +19 -24
  62. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +18 -1
  63. optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +2 -2
  64. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -9
  65. optimum/rbln/transformers/models/llama/__init__.py +2 -2
  66. optimum/rbln/transformers/models/llama/configuration_llama.py +9 -1
  67. optimum/rbln/transformers/models/llama/modeling_llama.py +22 -1
  68. optimum/rbln/transformers/models/llava/__init__.py +16 -0
  69. optimum/rbln/transformers/models/llava/configuration_llava.py +54 -0
  70. optimum/rbln/transformers/models/llava/modeling_llava.py +419 -0
  71. optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +20 -3
  72. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +6 -16
  73. optimum/rbln/transformers/models/midm/midm_architecture.py +14 -22
  74. optimum/rbln/transformers/models/mistral/__init__.py +2 -2
  75. optimum/rbln/transformers/models/mistral/configuration_mistral.py +9 -1
  76. optimum/rbln/transformers/models/mistral/mistral_architecture.py +1 -1
  77. optimum/rbln/transformers/models/mistral/modeling_mistral.py +26 -3
  78. optimum/rbln/transformers/models/opt/__init__.py +2 -2
  79. optimum/rbln/transformers/models/opt/configuration_opt.py +8 -1
  80. optimum/rbln/transformers/models/opt/modeling_opt.py +41 -1
  81. optimum/rbln/transformers/models/opt/opt_architecture.py +16 -25
  82. optimum/rbln/transformers/models/pegasus/__init__.py +17 -0
  83. optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +34 -0
  84. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +69 -0
  85. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +161 -0
  86. optimum/rbln/transformers/models/phi/__init__.py +2 -2
  87. optimum/rbln/transformers/models/phi/configuration_phi.py +9 -1
  88. optimum/rbln/transformers/models/phi/modeling_phi.py +10 -1
  89. optimum/rbln/transformers/models/phi/phi_architecture.py +16 -22
  90. optimum/rbln/transformers/models/pixtral/__init__.py +16 -0
  91. optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +43 -0
  92. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +315 -0
  93. optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +73 -0
  94. optimum/rbln/transformers/models/qwen2/__init__.py +2 -2
  95. optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +9 -1
  96. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +27 -1
  97. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +3 -3
  98. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -15
  99. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +1 -4
  100. optimum/rbln/transformers/models/qwen3/__init__.py +16 -0
  101. optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +71 -0
  102. optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +133 -0
  103. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +31 -0
  104. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -12
  105. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +3 -1
  106. optimum/rbln/transformers/models/siglip/__init__.py +2 -6
  107. optimum/rbln/transformers/models/siglip/modeling_siglip.py +2 -2
  108. optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +2 -2
  109. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -5
  110. optimum/rbln/transformers/models/whisper/configuration_whisper.py +3 -12
  111. optimum/rbln/transformers/models/whisper/modeling_whisper.py +8 -2
  112. optimum/rbln/transformers/models/xlm_roberta/__init__.py +2 -8
  113. optimum/rbln/utils/depreacate_utils.py +16 -0
  114. optimum/rbln/utils/hub.py +8 -47
  115. optimum/rbln/utils/runtime_utils.py +31 -5
  116. {optimum_rbln-0.8.1rc1.dist-info → optimum_rbln-0.8.2.dist-info}/METADATA +1 -1
  117. {optimum_rbln-0.8.1rc1.dist-info → optimum_rbln-0.8.2.dist-info}/RECORD +119 -102
  118. {optimum_rbln-0.8.1rc1.dist-info → optimum_rbln-0.8.2.dist-info}/WHEEL +0 -0
  119. {optimum_rbln-0.8.1rc1.dist-info → optimum_rbln-0.8.2.dist-info}/licenses/LICENSE +0 -0

optimum/rbln/transformers/modeling_attention_utils.py (new file)
@@ -0,0 +1,252 @@
+import math
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+
+from optimum.rbln.transformers.models.decoderonly.configuration_decoderonly import (
+    RBLNDecoderOnlyModelForCausalLMConfig,
+)
+
+from ..utils.logging import get_logger
+
+
+logger = get_logger()
+
+if TYPE_CHECKING:
+    from rebel import RBLNCompiledModel
+    from transformers import PretrainedConfig
+
+
+DEFAULT_FLASH_ATTN_PARTITION_LENGTH = 16_384
+DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH = 32_768
+MIN_FLASH_ATTN_MAX_SEQ_LEN = 8_192
+MIN_FLASH_ATTN_PARTITION_LENGTH = 4_096
+MAX_FLASH_ATTN_PARTITION_LENGTH = 32_768
+MAX_SLIDING_WINDOW_SIZE = 32_768
+
+
+def set_default_values(
+    attn_impl: Optional[str] = None,
+    kvcache_partition_len: Optional[int] = None,
+    kvcache_block_size: Optional[int] = None,
+    max_seq_len: Optional[int] = None,
+) -> Tuple[str, int, int]:
+    if attn_impl is None:
+        attn_impl = "eager"
+
+    if kvcache_partition_len is not None:
+        if attn_impl == "eager":
+            attn_impl = "flash_attn"
+            logger.warning(
+                "A non-null `kvcache_partition_len` was provided, but `attn_impl` was not explicitly set or "
+                "set to 'eager'. Since KV cache partitioning is only supported with flash attention, "
+                "`attn_impl` has been automatically switched to 'flash_attn'."
+            )
+
+    if kvcache_partition_len is None and attn_impl == "flash_attn":
+        kvcache_partition_len = DEFAULT_FLASH_ATTN_PARTITION_LENGTH
+
+    if kvcache_block_size is None:
+        if attn_impl == "eager":
+            kvcache_block_size = max_seq_len
+        else:
+            kvcache_block_size = kvcache_partition_len
+
+    return attn_impl, kvcache_partition_len, kvcache_block_size
+
+
+def validate_attention_method(attn_impl: str, kvcache_partition_len: int, kvcache_block_size: int, max_seq_len: int):
+    if attn_impl not in ["eager", "flash_attn"]:
+        raise ValueError(f"Unknown `attn_impl` : {attn_impl}. (Available : 'eager', 'flash_attn`)")
+
+    ## Checking Constraints...
+    # Constraint of eager attention:
+    # - `max_seq_len` <= 32k
+
+    # Constraints of flash attention:
+    # 1. `max_seq_len` should be multiple of `partition_len`.
+    # 2. 4k <= `partition_len` <= 32k.
+    # 3. `max_seq_len` should be larger then 8k.
+    if attn_impl == "eager" and max_seq_len > DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH:
+        raise ValueError(
+            f"`max_seq_len` is set to {max_seq_len}, "
+            f"which exceeds the limit of {DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH} for 'eager' attention. "
+            f"Please reduce the `max_seq_len` to {DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH} or lower,"
+            " or consider switching `attn_impl` to 'flash_attn' for larger sequence lengths."
+        )
+
+    if attn_impl == "flash_attn":
+        if max_seq_len // kvcache_partition_len < 2 or max_seq_len % kvcache_partition_len != 0:
+            raise ValueError(
+                f"`max_seq_len` ({max_seq_len}) must be a multiple of `kvcache_partition_len` ({kvcache_partition_len}) "
+                f"when using 'flash_attn'. Please adjust either value to meet this requirement."
+            )
+        elif not (MIN_FLASH_ATTN_PARTITION_LENGTH <= kvcache_partition_len <= MAX_FLASH_ATTN_PARTITION_LENGTH):
+            raise ValueError(
+                f"`kvcache_partition_len` ({kvcache_partition_len}) is out of the supported range for 'flash_attn' "
+                f"({MIN_FLASH_ATTN_PARTITION_LENGTH} <= `kvcache_partition_len` <= {MAX_FLASH_ATTN_PARTITION_LENGTH}). "
+                f"Please provide a valid value within this range."
+            )
+        elif max_seq_len < MIN_FLASH_ATTN_MAX_SEQ_LEN:
+            raise ValueError(
+                f"`max_seq_len` ({max_seq_len}) is too small for 'flash_attn'. The minimum "
+                f"supported value is {MIN_FLASH_ATTN_MAX_SEQ_LEN}. Please increase `max_seq_len` to meet "
+                "this requirement, or consider switching `attn_impl` to 'eager' for shorter lengths."
+            )
+
+    if kvcache_block_size is not None:
+        if attn_impl == "flash_attn" and kvcache_partition_len != kvcache_block_size:
+            raise ValueError(
+                f" When using 'flash attention', the `kvcache_block_size` ({kvcache_block_size}) "
+                f"must always be set equal to the `kvcache_partition_len` {kvcache_partition_len}."
+            )
+        elif attn_impl == "eager" and kvcache_block_size != max_seq_len:
+            raise ValueError(
+                f" When using 'eager attention', the `kvcache_block_size` ({kvcache_block_size}) "
+                f"must always be set equal to the `max_seq_len` {max_seq_len}."
+            )
+
+
+def validate_sliding_window(rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
+    if rbln_config.sliding_window > MAX_SLIDING_WINDOW_SIZE - rbln_config.prefill_chunk_size:
+        raise ValueError(
+            f"Sliding window size ({rbln_config.sliding_window}) must be less than 32768 - prefill_chunk_size ({32768 - rbln_config.prefill_chunk_size})"
+        )
+
+    if rbln_config.cache_impl == "sliding_window" and rbln_config.use_attention_mask:
+        raise ValueError("`use_attention_mask` must be set to False when `cache_impl` is set to 'sliding_window'.")
+
+
+class RBLNDecoderOnlyFlashAttentionMixin:
+    @classmethod
+    def get_maximum_num_blocks(
+        cls,
+        config: "PretrainedConfig",
+        tensor_parallel_size: int,
+        kvcache_block_size: int,
+        nbits_per_param: Optional[int] = None,
+        n_model_params: Optional[int] = None,
+        kernel_size: Optional[int] = None,
+        buffer: Optional[int] = None,
+        num_runtimes: int = 2,
+    ) -> int:
+        # We are finding max_n_blocks(x) that satisfies the following equation:
+
+        # available_dram - kernel_size - buffer
+        #   - num_layers * 2 * tensor_parallel_size
+        #     * align_2MB(
+        #         x
+        #         * block_size
+        #         * align_64(head_dim)
+        #         * math.ceil(num_key_value_heads / tensor_parallel_size)
+        #         * 2
+        #     ) > 0
+
+        # This inequality can be rewritten as follows:
+
+        #   a - c * align_2MB(b * x) > 0
+        #   where
+        #     a = available_dram - kernel_size - buffer
+        #     b = block_size * align_64(head_dim) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
+        #     c = num_layers * 2 * tensor_parallel_size
+
+        # We can rewrite the inequality as follows:
+        #   k > align_2MB(b * x)
+        #   where
+        #     k = a / c
+
+        # After that, we can derive the following equation:
+        #   x = floor(2**21 / b * floor((k - 1) / 2**21))
+
+        def align(x: int, nbytes: int) -> int:
+            return int(math.ceil(x / nbytes) * nbytes)
+
+        def align_2MB(x: int) -> int:
+            return align(x, 2**21)
+
+        num_attention_heads = getattr(config, "n_head", None) or getattr(config, "num_attention_heads")
+        num_layers = getattr(config, "n_layer", None) or getattr(config, "num_hidden_layers")
+        head_dim = getattr(config, "head_dim", None) or config.hidden_size // num_attention_heads
+        vocab_size = config.vocab_size
+        hidden_size = getattr(config, "n_embd", None) or getattr(config, "hidden_size")
+        num_key_value_heads = getattr(config, "num_key_value_heads", None) or num_attention_heads
+
+        # TODO(jongho): Update if target npu is REBEL.
+        ATOM_DRAM_NBYTES = 16 * 2**30
+        ATOM_SYS_DRAM_NBYTES = 288 * 2**20
+        available_dram = tensor_parallel_size * (ATOM_DRAM_NBYTES - ATOM_SYS_DRAM_NBYTES)
+
+        if kernel_size is None:
+            if n_model_params is None:
+                raise ValueError("`n_model_params` should be specified to estimate the kernel memory.")
+            # Get estimated kernel size (approximated)
+            lm_heads_params = align(vocab_size, 64) * hidden_size
+            lm_heads_nbytes = (
+                align_2MB(lm_heads_params * nbits_per_param // 8 / tensor_parallel_size) * tensor_parallel_size
+            )
+            params = n_model_params - lm_heads_params
+            layer_nbytes = (
+                align_2MB(params * nbits_per_param // 8 / num_layers / tensor_parallel_size)
+                * num_layers
+                * tensor_parallel_size
+            )
+            kernel_size = layer_nbytes + lm_heads_nbytes
+        elif n_model_params is not None:
+            raise ValueError("Both `n_model_params` and `kernel_size` cannot be specified.")
+
+        available_dram -= kernel_size
+
+        if buffer is None:
+            # TODO: Accurate buffer estimation
+            buffer_per_runtime_per_core = 2**28  # 256MB per runtime
+            buffer_per_core = buffer_per_runtime_per_core * num_runtimes  # 1 for prefill, 1 for decoder
+            buffer = buffer_per_core * tensor_parallel_size
+        available_dram -= buffer
+
+        b = kvcache_block_size * align(head_dim, 64) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
+        c = num_layers * 2 * tensor_parallel_size
+        k = available_dram / c
+        max_n_blocks = math.floor(2**21 / b * math.floor((k - 1) / 2**21))
+
+        return max_n_blocks
+
+    @classmethod
+    def maybe_suggest_kvcache_num_blocks(
+        cls,
+        compiled_models: Dict[str, "RBLNCompiledModel"],
+        model_config: "PretrainedConfig",
+        rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
+    ) -> None:
+        # Get the actual memory allocation of each node by key
+        alloc_memory_per_node_by_key: Dict[str, List[int]] = compiled_models["prefill"].get_alloc_per_node_by_key()
+        alloc_memory_by_key: Dict[str, int] = {
+            key: sum(memory_per_node) for key, memory_per_node in alloc_memory_per_node_by_key.items()
+        }
+        for batch_size in rbln_config.decoder_batch_sizes:
+            for key, memory_per_node in (
+                compiled_models[f"decoder_batch_{batch_size}"].get_alloc_per_node_by_key().items()
+            ):
+                alloc_memory_by_key[key] += sum(memory_per_node)
+        alloc_memory_by_key.pop("PortRecur", None)  # Old compiler's kv-cache Key
+        alloc_memory_by_key.pop("DramTensor", None)  # kv-cache
+        kernel_size = alloc_memory_by_key.pop("Kernel")  # model weight
+
+        # Get the maximum number of blocks that can be allocated
+        buffer = sum(alloc_memory_by_key.values())
+        max_num_blocks = cls.get_maximum_num_blocks(
+            config=model_config,
+            tensor_parallel_size=rbln_config.tensor_parallel_size,
+            kvcache_block_size=rbln_config.kvcache_block_size,
+            kernel_size=kernel_size,
+            buffer=buffer,
+        )
+
+        # Since our estimation logic is not always accurate,
+        # users can set `kvcache_num_blocks` to `max_num_blocks`.
+        # If the memory is not enough, the model will fail to compile.
+        if rbln_config.kvcache_num_blocks < max_num_blocks:
+            logger.warning(
+                f"Current `kvcache_num_blocks` setting is {rbln_config.kvcache_num_blocks}. "
+                "Our analysis indicates that additional memory is available for more blocks. "
+                f"Consider increasing `kvcache_num_blocks` to {max_num_blocks} for potentially improved performance. "
+                "Please be advised that our memory estimation algorithm has limitations, "
+                "and increasing this value may not guarantee successful model compilation."
+            )
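
The helpers in this new module pick attention defaults and enforce the flash-attention constraints (partition length between 4,096 and 32,768, `max_seq_len` a multiple of the partition length with at least two partitions). A minimal usage sketch of how they compose, assuming the module path shown in the file list above; the parameter values are illustrative only:

# Hypothetical usage sketch of the new helpers; values are illustrative, not prescriptive.
from optimum.rbln.transformers.modeling_attention_utils import (
    set_default_values,
    validate_attention_method,
)

# Leaving attn_impl unset but passing a partition length switches the implementation
# to "flash_attn" (with a warning) and reuses the partition length as the KV-cache block size.
attn_impl, partition_len, block_size = set_default_values(
    attn_impl=None,
    kvcache_partition_len=16_384,
    kvcache_block_size=None,
    max_seq_len=32_768,
)
assert (attn_impl, partition_len, block_size) == ("flash_attn", 16_384, 16_384)

# Raises ValueError if the combination violates the constraints documented above,
# e.g. a max_seq_len that is not a multiple of the partition length.
validate_attention_method(attn_impl, partition_len, block_size, max_seq_len=32_768)
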
optimum/rbln/transformers/modeling_generic.py
@@ -34,10 +34,7 @@ from transformers import (
     AutoModelForTextEncoding,
     PretrainedConfig,
 )
-from transformers.modeling_outputs import (
-    BaseModelOutput,
-    QuestionAnsweringModelOutput,
-)
+from transformers.modeling_outputs import BaseModelOutput, QuestionAnsweringModelOutput
 
 from ..configuration_utils import RBLNCompileConfig
 from ..modeling import RBLNModel

optimum/rbln/transformers/models/__init__.py
@@ -84,6 +84,8 @@ _import_structure = {
         "RBLNQwen2_5_VLForConditionalGenerationConfig",
     ],
     "decoderonly": [
+        "RBLNDecoderOnlyModelConfig",
+        "RBLNDecoderOnlyModel",
         "RBLNDecoderOnlyModelForCausalLM",
         "RBLNDecoderOnlyModelForCausalLMConfig",
     ],
@@ -92,27 +94,41 @@ _import_structure = {
         "RBLNDPTForDepthEstimationConfig",
     ],
     "exaone": ["RBLNExaoneForCausalLM", "RBLNExaoneForCausalLMConfig"],
-    "gemma": ["RBLNGemmaForCausalLM", "RBLNGemmaForCausalLMConfig"],
+    "gemma": ["RBLNGemmaForCausalLM", "RBLNGemmaForCausalLMConfig", "RBLNGemmaModel", "RBLNGemmaModelConfig"],
     "gemma3": [
         "RBLNGemma3ForCausalLM",
         "RBLNGemma3ForCausalLMConfig",
         "RBLNGemma3ForConditionalGeneration",
         "RBLNGemma3ForConditionalGenerationConfig",
     ],
-    "gpt2": ["RBLNGPT2LMHeadModel", "RBLNGPT2LMHeadModelConfig"],
+    "gpt2": ["RBLNGPT2LMHeadModel", "RBLNGPT2LMHeadModelConfig", "RBLNGPT2Model", "RBLNGPT2ModelConfig"],
     "idefics3": [
         "RBLNIdefics3VisionTransformer",
         "RBLNIdefics3ForConditionalGeneration",
         "RBLNIdefics3ForConditionalGenerationConfig",
         "RBLNIdefics3VisionTransformerConfig",
     ],
-    "llama": ["RBLNLlamaForCausalLM", "RBLNLlamaForCausalLMConfig"],
-    "opt": ["RBLNOPTForCausalLM", "RBLNOPTForCausalLMConfig"],
+    "llava": ["RBLNLlavaForConditionalGeneration", "RBLNLlavaForConditionalGenerationConfig"],
+    "llama": ["RBLNLlamaForCausalLM", "RBLNLlamaForCausalLMConfig", "RBLNLlamaModel", "RBLNLlamaModelConfig"],
+    "opt": ["RBLNOPTForCausalLM", "RBLNOPTForCausalLMConfig", "RBLNOPTModel", "RBLNOPTModelConfig"],
+    "pegasus": [
+        "RBLNPegasusForConditionalGeneration",
+        "RBLNPegasusModel",
+        "RBLNPegasusForConditionalGenerationConfig",
+        "RBLNPegasusModelConfig",
+    ],
     "llava_next": ["RBLNLlavaNextForConditionalGeneration", "RBLNLlavaNextForConditionalGenerationConfig"],
     "midm": ["RBLNMidmLMHeadModel", "RBLNMidmLMHeadModelConfig"],
-    "mistral": ["RBLNMistralForCausalLM", "RBLNMistralForCausalLMConfig"],
-    "phi": ["RBLNPhiForCausalLM", "RBLNPhiForCausalLMConfig"],
-    "qwen2": ["RBLNQwen2ForCausalLM", "RBLNQwen2ForCausalLMConfig"],
+    "pixtral": ["RBLNPixtralVisionModel", "RBLNPixtralVisionModelConfig"],
+    "mistral": [
+        "RBLNMistralForCausalLM",
+        "RBLNMistralForCausalLMConfig",
+        "RBLNMistralModel",
+        "RBLNMistralModelConfig",
+    ],
+    "phi": ["RBLNPhiForCausalLM", "RBLNPhiForCausalLMConfig", "RBLNPhiModel", "RBLNPhiModelConfig"],
+    "qwen2": ["RBLNQwen2ForCausalLM", "RBLNQwen2ForCausalLMConfig", "RBLNQwen2Model", "RBLNQwen2ModelConfig"],
+    "qwen3": ["RBLNQwen3ForCausalLM", "RBLNQwen3ForCausalLMConfig", "RBLNQwen3Model", "RBLNQwen3ModelConfig"],
     "resnet": ["RBLNResNetForImageClassification", "RBLNResNetForImageClassificationConfig"],
     "roberta": [
         "RBLNRobertaForMaskedLM",
@@ -146,10 +162,7 @@ _import_structure = {
 }
 
 if TYPE_CHECKING:
-    from .audio_spectrogram_transformer import (
-        RBLNASTForAudioClassification,
-        RBLNASTForAudioClassificationConfig,
-    )
+    from .audio_spectrogram_transformer import RBLNASTForAudioClassification, RBLNASTForAudioClassificationConfig
     from .auto import (
         RBLNAutoModel,
         RBLNAutoModelForAudioClassification,
@@ -197,50 +210,52 @@ if TYPE_CHECKING:
         RBLNCLIPVisionModelWithProjection,
         RBLNCLIPVisionModelWithProjectionConfig,
     )
-    from .colpali import (
-        RBLNColPaliForRetrieval,
-        RBLNColPaliForRetrievalConfig,
-    )
+    from .colpali import RBLNColPaliForRetrieval, RBLNColPaliForRetrievalConfig
     from .decoderonly import (
+        RBLNDecoderOnlyModel,
+        RBLNDecoderOnlyModelConfig,
         RBLNDecoderOnlyModelForCausalLM,
         RBLNDecoderOnlyModelForCausalLMConfig,
     )
-    from .distilbert import (
-        RBLNDistilBertForQuestionAnswering,
-        RBLNDistilBertForQuestionAnsweringConfig,
-    )
-    from .dpt import (
-        RBLNDPTForDepthEstimation,
-        RBLNDPTForDepthEstimationConfig,
-    )
+    from .distilbert import RBLNDistilBertForQuestionAnswering, RBLNDistilBertForQuestionAnsweringConfig
+    from .dpt import RBLNDPTForDepthEstimation, RBLNDPTForDepthEstimationConfig
     from .exaone import RBLNExaoneForCausalLM, RBLNExaoneForCausalLMConfig
-    from .gemma import RBLNGemmaForCausalLM, RBLNGemmaForCausalLMConfig
+    from .gemma import RBLNGemmaForCausalLM, RBLNGemmaForCausalLMConfig, RBLNGemmaModel, RBLNGemmaModelConfig
     from .gemma3 import (
         RBLNGemma3ForCausalLM,
         RBLNGemma3ForCausalLMConfig,
         RBLNGemma3ForConditionalGeneration,
         RBLNGemma3ForConditionalGenerationConfig,
     )
-    from .gpt2 import RBLNGPT2LMHeadModel, RBLNGPT2LMHeadModelConfig
+    from .gpt2 import RBLNGPT2LMHeadModel, RBLNGPT2LMHeadModelConfig, RBLNGPT2Model, RBLNGPT2ModelConfig
     from .idefics3 import (
         RBLNIdefics3ForConditionalGeneration,
         RBLNIdefics3ForConditionalGenerationConfig,
         RBLNIdefics3VisionTransformer,
         RBLNIdefics3VisionTransformerConfig,
     )
-    from .llama import RBLNLlamaForCausalLM, RBLNLlamaForCausalLMConfig
+    from .llama import RBLNLlamaForCausalLM, RBLNLlamaForCausalLMConfig, RBLNLlamaModel, RBLNLlamaModelConfig
+    from .llava import RBLNLlavaForConditionalGeneration, RBLNLlavaForConditionalGenerationConfig
     from .llava_next import RBLNLlavaNextForConditionalGeneration, RBLNLlavaNextForConditionalGenerationConfig
     from .midm import RBLNMidmLMHeadModel, RBLNMidmLMHeadModelConfig
-    from .mistral import RBLNMistralForCausalLM, RBLNMistralForCausalLMConfig
-    from .opt import RBLNOPTForCausalLM, RBLNOPTForCausalLMConfig
-    from .phi import RBLNPhiForCausalLM, RBLNPhiForCausalLMConfig
-    from .qwen2 import RBLNQwen2ForCausalLM, RBLNQwen2ForCausalLMConfig
+    from .mistral import RBLNMistralForCausalLM, RBLNMistralForCausalLMConfig, RBLNMistralModel, RBLNMistralModelConfig
+    from .opt import RBLNOPTForCausalLM, RBLNOPTForCausalLMConfig, RBLNOPTModel, RBLNOPTModelConfig
+    from .pegasus import (
+        RBLNPegasusForConditionalGeneration,
+        RBLNPegasusForConditionalGenerationConfig,
+        RBLNPegasusModel,
+        RBLNPegasusModelConfig,
+    )
+    from .phi import RBLNPhiForCausalLM, RBLNPhiForCausalLMConfig, RBLNPhiModel, RBLNPhiModelConfig
+    from .pixtral import RBLNPixtralVisionModel, RBLNPixtralVisionModelConfig
+    from .qwen2 import RBLNQwen2ForCausalLM, RBLNQwen2ForCausalLMConfig, RBLNQwen2Model, RBLNQwen2ModelConfig
     from .qwen2_5_vl import (
         RBLNQwen2_5_VisionTransformerPretrainedModel,
         RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
         RBLNQwen2_5_VLForConditionalGeneration,
         RBLNQwen2_5_VLForConditionalGenerationConfig,
     )
+    from .qwen3 import RBLNQwen3ForCausalLM, RBLNQwen3ForCausalLMConfig, RBLNQwen3Model, RBLNQwen3ModelConfig
     from .resnet import RBLNResNetForImageClassification, RBLNResNetForImageClassificationConfig
     from .roberta import (
         RBLNRobertaForMaskedLM,
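
The re-export changes above register backbone-only variants (RBLN*Model / RBLN*ModelConfig) next to the existing causal-LM classes and add new families (Llava, Pegasus, Pixtral, Qwen3). As a hedged sketch of loading one of the new exports, assuming the usual optimum-rbln from_pretrained(..., export=True) flow; the checkpoint id and rbln_* keyword values are illustrative assumptions, not taken from this release:

# Illustrative only: assumes the standard optimum-rbln export flow and an example checkpoint.
from optimum.rbln.transformers import RBLNQwen3ForCausalLM

model = RBLNQwen3ForCausalLM.from_pretrained(
    "Qwen/Qwen3-8B",             # hypothetical checkpoint id
    export=True,                 # compile from the original Transformers weights
    rbln_max_seq_len=32_768,     # assumed compile-time option
    rbln_tensor_parallel_size=4,  # assumed compile-time option
)
model.save_pretrained("qwen3-8b-rbln")  # store the compiled artifacts for later reload
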
optimum/rbln/transformers/models/bart/bart_architecture.py
@@ -16,9 +16,7 @@ from typing import Tuple
 
 import torch
 from torch import nn
-from transformers.modeling_attn_mask_utils import (
-    _prepare_4d_attention_mask,
-)
+from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
 from transformers.utils import logging
 
 from ..seq2seq.seq2seq_architecture import (
@@ -56,10 +54,7 @@ class BartDecoderWrapper(Seq2SeqDecoderWrapper):
 
 
 class BartForConditionalGeneration(Seq2SeqForConditionalGeneration):
-    has_rescaling = False
-
-    def __post_init__(self):
-        self.scaling = self.config.d_model**-0.5
+    pass
 
 
 class BartDecoder(Seq2SeqDecoder):

optimum/rbln/transformers/models/blip_2/configuration_blip_2.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 
@@ -62,7 +62,7 @@ class RBLNBlip2ForConditionalGenerationConfig(RBLNModelConfig):
         vision_model: Optional[RBLNModelConfig] = None,
         qformer: Optional[RBLNModelConfig] = None,
         language_model: Optional[RBLNModelConfig] = None,
-        **kwargs,
+        **kwargs: Any,
     ):
         """
         Args:

optimum/rbln/transformers/models/blip_2/modeling_blip_2.py
@@ -35,11 +35,7 @@ from ....modeling import RBLNModel
 logger = logging.get_logger(__name__)
 
 if TYPE_CHECKING:
-    from transformers import (
-        AutoFeatureExtractor,
-        AutoProcessor,
-        AutoTokenizer,
-    )
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer
 
 
 class LoopProjector:

optimum/rbln/transformers/models/clip/configuration_clip.py
@@ -12,13 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Dict, Optional
+from typing import Any, Optional
 
 from ....configuration_utils import RBLNModelConfig
 
 
 class RBLNCLIPTextModelConfig(RBLNModelConfig):
-    def __init__(self, batch_size: Optional[int] = None, **kwargs: Dict[str, Any]):
+    def __init__(self, batch_size: Optional[int] = None, **kwargs: Any):
         """
         Args:
             batch_size (Optional[int]): The batch size for text processing. Defaults to 1.
@@ -43,7 +43,15 @@ class RBLNCLIPTextModelWithProjectionConfig(RBLNCLIPTextModelConfig):
 
 
 class RBLNCLIPVisionModelConfig(RBLNModelConfig):
-    def __init__(self, batch_size: Optional[int] = None, image_size: Optional[int] = None, **kwargs: Dict[str, Any]):
+    def __init__(
+        self,
+        batch_size: Optional[int] = None,
+        image_size: Optional[int] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        **kwargs: Any,
+    ):
         """
         Args:
             batch_size (Optional[int]): The batch size for image processing. Defaults to 1.
@@ -60,6 +68,9 @@ class RBLNCLIPVisionModelConfig(RBLNModelConfig):
             raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
 
         self.image_size = image_size
+        self.interpolate_pos_encoding = interpolate_pos_encoding or False
+        self.output_hidden_states = output_hidden_states
+        self.output_attentions = output_attentions
 
     @property
     def image_width(self):
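
RBLNCLIPVisionModelConfig now records interpolate_pos_encoding (defaulting to False when omitted), output_hidden_states, and output_attentions at configuration time. A minimal sketch of constructing the config directly from the module changed above; the values are illustrative and the surrounding pipeline wiring is omitted:

# Sketch based solely on the constructor added in this hunk; values are examples only.
from optimum.rbln.transformers.models.clip.configuration_clip import RBLNCLIPVisionModelConfig

vision_config = RBLNCLIPVisionModelConfig(
    batch_size=1,
    image_size=336,                 # square input resolution
    interpolate_pos_encoding=True,  # stored as given; falls back to False when omitted
    output_hidden_states=True,      # remains None unless explicitly set
)
print(vision_config.interpolate_pos_encoding, vision_config.output_hidden_states)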