optimum-rbln 0.8.2a4__py3-none-any.whl → 0.9.3rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +96 -9
- optimum/rbln/__version__.py +16 -3
- optimum/rbln/cli.py +660 -0
- optimum/rbln/configuration_utils.py +153 -42
- optimum/rbln/diffusers/__init__.py +7 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +1 -1
- optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +4 -4
- optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +9 -4
- optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +9 -4
- optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +3 -3
- optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +35 -19
- optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +14 -11
- optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +30 -20
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +13 -9
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +17 -13
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +17 -10
- optimum/rbln/diffusers/modeling_diffusers.py +30 -14
- optimum/rbln/diffusers/models/__init__.py +3 -13
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +31 -3
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +28 -3
- optimum/rbln/diffusers/models/autoencoders/vq_model.py +31 -3
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +1 -1
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +9 -1
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +9 -1
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +6 -3
- optimum/rbln/diffusers/pipelines/__init__.py +11 -5
- optimum/rbln/diffusers/pipelines/auto_pipeline.py +307 -0
- optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +19 -16
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +14 -18
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +31 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +31 -1
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -6
- optimum/rbln/modeling.py +71 -19
- optimum/rbln/modeling_base.py +99 -21
- optimum/rbln/ops/attn.py +158 -0
- optimum/rbln/ops/flash_attn.py +166 -0
- optimum/rbln/ops/kv_cache_update.py +5 -0
- optimum/rbln/ops/linear.py +7 -0
- optimum/rbln/transformers/__init__.py +92 -0
- optimum/rbln/transformers/configuration_generic.py +9 -7
- optimum/rbln/transformers/modeling_attention_utils.py +252 -0
- optimum/rbln/transformers/modeling_generic.py +51 -9
- optimum/rbln/transformers/modeling_outputs.py +37 -0
- optimum/rbln/transformers/models/__init__.py +91 -30
- optimum/rbln/transformers/models/auto/__init__.py +2 -0
- optimum/rbln/transformers/models/auto/auto_factory.py +92 -17
- optimum/rbln/transformers/models/auto/modeling_auto.py +45 -0
- optimum/rbln/transformers/models/bart/bart_architecture.py +1 -3
- optimum/rbln/transformers/models/bart/configuration_bart.py +2 -0
- optimum/rbln/transformers/models/bert/bert_architecture.py +16 -0
- optimum/rbln/transformers/models/bert/modeling_bert.py +8 -4
- optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +42 -11
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +94 -30
- optimum/rbln/transformers/models/clip/configuration_clip.py +10 -7
- optimum/rbln/transformers/models/clip/modeling_clip.py +27 -4
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +3 -6
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +37 -21
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +113 -96
- optimum/rbln/transformers/models/colqwen2/__init__.py +2 -0
- optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +233 -0
- optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +74 -0
- optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +446 -0
- optimum/rbln/transformers/models/decoderonly/__init__.py +3 -2
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +109 -37
- optimum/rbln/transformers/models/decoderonly/configuration_lora.py +411 -0
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +318 -309
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +504 -0
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +111 -0
- optimum/rbln/transformers/models/decoderonly/lora_architecture.py +204 -0
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +453 -897
- optimum/rbln/transformers/models/depth_anything/__init__.py +16 -0
- optimum/rbln/transformers/models/depth_anything/configuration_depth_anything.py +24 -0
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +25 -0
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +42 -4
- optimum/rbln/transformers/models/gemma/__init__.py +2 -2
- optimum/rbln/transformers/models/gemma/configuration_gemma.py +9 -1
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -4
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +22 -1
- optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +49 -13
- optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +12 -2
- optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +245 -0
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +201 -349
- optimum/rbln/transformers/models/gpt2/__init__.py +2 -2
- optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +31 -3
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +10 -8
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +18 -1
- optimum/rbln/transformers/models/grounding_dino/__init__.py +10 -0
- optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +92 -0
- optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +599 -0
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +1032 -0
- optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +35 -7
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +26 -27
- optimum/rbln/transformers/models/llama/__init__.py +2 -2
- optimum/rbln/transformers/models/llama/configuration_llama.py +9 -1
- optimum/rbln/transformers/models/llama/modeling_llama.py +22 -1
- optimum/rbln/transformers/models/llava/__init__.py +16 -0
- optimum/rbln/transformers/models/llava/configuration_llava.py +72 -0
- optimum/rbln/transformers/models/llava/modeling_llava.py +478 -0
- optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +15 -17
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +235 -375
- optimum/rbln/transformers/models/midm/midm_architecture.py +4 -1
- optimum/rbln/transformers/models/midm/modeling_midm.py +42 -4
- optimum/rbln/transformers/models/mistral/__init__.py +2 -2
- optimum/rbln/transformers/models/mistral/configuration_mistral.py +9 -1
- optimum/rbln/transformers/models/mistral/mistral_architecture.py +1 -1
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +26 -3
- optimum/rbln/transformers/models/opt/__init__.py +2 -2
- optimum/rbln/transformers/models/opt/configuration_opt.py +8 -1
- optimum/rbln/transformers/models/opt/modeling_opt.py +28 -16
- optimum/rbln/transformers/models/opt/opt_architecture.py +4 -4
- optimum/rbln/transformers/models/pegasus/__init__.py +17 -0
- optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +38 -0
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +71 -0
- optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +161 -0
- optimum/rbln/transformers/models/phi/__init__.py +2 -2
- optimum/rbln/transformers/models/phi/configuration_phi.py +9 -1
- optimum/rbln/transformers/models/phi/modeling_phi.py +10 -1
- optimum/rbln/transformers/models/phi/phi_architecture.py +11 -7
- optimum/rbln/transformers/models/pixtral/__init__.py +16 -0
- optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +43 -0
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +310 -0
- optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +73 -0
- optimum/rbln/transformers/models/qwen2/__init__.py +2 -2
- optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +9 -1
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +27 -1
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +21 -6
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +15 -21
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +28 -7
- optimum/rbln/transformers/models/qwen2_vl/__init__.py +19 -0
- optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +88 -0
- optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +514 -0
- optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +165 -0
- optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +2 -2
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +86 -330
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +1 -245
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +20 -13
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +24 -3
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +2 -2
- optimum/rbln/transformers/models/siglip/__init__.py +2 -6
- optimum/rbln/transformers/models/siglip/configuration_siglip.py +1 -1
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +5 -16
- optimum/rbln/transformers/models/swin/__init__.py +16 -0
- optimum/rbln/transformers/models/swin/configuration_swin.py +42 -0
- optimum/rbln/transformers/models/swin/modeling_swin.py +341 -0
- optimum/rbln/transformers/models/t5/configuration_t5.py +2 -0
- optimum/rbln/transformers/models/t5/t5_architecture.py +8 -1
- optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +3 -3
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -14
- optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +7 -1
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -0
- optimum/rbln/transformers/models/whisper/configuration_whisper.py +12 -13
- optimum/rbln/transformers/models/whisper/generation_whisper.py +28 -6
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +28 -3
- optimum/rbln/transformers/models/xlm_roberta/__init__.py +2 -8
- optimum/rbln/transformers/utils/rbln_quantization.py +391 -75
- optimum/rbln/transformers/utils/rbln_runtime_wrapper.py +79 -0
- optimum/rbln/utils/depreacate_utils.py +16 -0
- optimum/rbln/utils/runtime_utils.py +28 -18
- optimum/rbln/utils/submodule.py +31 -9
- {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.9.3rc0.dist-info}/METADATA +8 -7
- {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.9.3rc0.dist-info}/RECORD +167 -125
- optimum_rbln-0.9.3rc0.dist-info/entry_points.txt +2 -0
- {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.9.3rc0.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.9.3rc0.dist-info}/licenses/LICENSE +0 -0

@@ -0,0 +1,252 @@
+import math
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+
+from optimum.rbln.transformers.models.decoderonly.configuration_decoderonly import (
+    RBLNDecoderOnlyModelForCausalLMConfig,
+)
+
+from ..utils.logging import get_logger
+
+
+logger = get_logger()
+
+if TYPE_CHECKING:
+    from rebel import RBLNCompiledModel
+    from transformers import PretrainedConfig
+
+
+DEFAULT_FLASH_ATTN_PARTITION_LENGTH = 16_384
+DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH = 32_768
+MIN_FLASH_ATTN_MAX_SEQ_LEN = 8_192
+MIN_FLASH_ATTN_PARTITION_LENGTH = 4_096
+MAX_FLASH_ATTN_PARTITION_LENGTH = 32_768
+MAX_SLIDING_WINDOW_SIZE = 32_768
+
+
+def set_default_values(
+    attn_impl: Optional[str] = None,
+    kvcache_partition_len: Optional[int] = None,
+    kvcache_block_size: Optional[int] = None,
+    max_seq_len: Optional[int] = None,
+) -> Tuple[str, int, int]:
+    if attn_impl is None:
+        attn_impl = "eager"
+
+    if kvcache_partition_len is not None:
+        if attn_impl == "eager":
+            attn_impl = "flash_attn"
+            logger.warning(
+                "A non-null `kvcache_partition_len` was provided, but `attn_impl` was not explicitly set or "
+                "set to 'eager'. Since KV cache partitioning is only supported with flash attention, "
+                "`attn_impl` has been automatically switched to 'flash_attn'."
+            )
+
+    if kvcache_partition_len is None and attn_impl == "flash_attn":
+        kvcache_partition_len = DEFAULT_FLASH_ATTN_PARTITION_LENGTH
+
+    if kvcache_block_size is None:
+        if attn_impl == "eager":
+            kvcache_block_size = max_seq_len
+        else:
+            kvcache_block_size = kvcache_partition_len
+
+    return attn_impl, kvcache_partition_len, kvcache_block_size
+
+
+def validate_attention_method(attn_impl: str, kvcache_partition_len: int, kvcache_block_size: int, max_seq_len: int):
+    if attn_impl not in ["eager", "flash_attn"]:
+        raise ValueError(f"Unknown `attn_impl` : {attn_impl}. (Available : 'eager', 'flash_attn`)")
+
+    ## Checking Constraints...
+    # Constraint of eager attention:
+    # - `max_seq_len` <= 32k
+
+    # Constraints of flash attention:
+    # 1. `max_seq_len` should be multiple of `partition_len`.
+    # 2. 4k <= `partition_len` <= 32k.
+    # 3. `max_seq_len` should be larger then 8k.
+    if attn_impl == "eager" and max_seq_len > DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH:
+        raise ValueError(
+            f"`max_seq_len` is set to {max_seq_len}, "
+            f"which exceeds the limit of {DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH} for 'eager' attention. "
+            f"Please reduce the `max_seq_len` to {DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH} or lower,"
+            " or consider switching `attn_impl` to 'flash_attn' for larger sequence lengths."
+        )
+
+    if attn_impl == "flash_attn":
+        if max_seq_len // kvcache_partition_len < 2 or max_seq_len % kvcache_partition_len != 0:
+            raise ValueError(
+                f"`max_seq_len` ({max_seq_len}) must be a multiple of `kvcache_partition_len` ({kvcache_partition_len}) "
+                f"when using 'flash_attn'. Please adjust either value to meet this requirement."
+            )
+        elif not (MIN_FLASH_ATTN_PARTITION_LENGTH <= kvcache_partition_len <= MAX_FLASH_ATTN_PARTITION_LENGTH):
+            raise ValueError(
+                f"`kvcache_partition_len` ({kvcache_partition_len}) is out of the supported range for 'flash_attn' "
+                f"({MIN_FLASH_ATTN_PARTITION_LENGTH} <= `kvcache_partition_len` <= {MAX_FLASH_ATTN_PARTITION_LENGTH}). "
+                f"Please provide a valid value within this range."
+            )
+        elif max_seq_len < MIN_FLASH_ATTN_MAX_SEQ_LEN:
+            raise ValueError(
+                f"`max_seq_len` ({max_seq_len}) is too small for 'flash_attn'. The minimum "
+                f"supported value is {MIN_FLASH_ATTN_MAX_SEQ_LEN}. Please increase `max_seq_len` to meet "
+                "this requirement, or consider switching `attn_impl` to 'eager' for shorter lengths."
+            )
+
+    if kvcache_block_size is not None:
+        if attn_impl == "flash_attn" and kvcache_partition_len != kvcache_block_size:
+            raise ValueError(
+                f" When using 'flash attention', the `kvcache_block_size` ({kvcache_block_size}) "
+                f"must always be set equal to the `kvcache_partition_len` {kvcache_partition_len}."
+            )
+        elif attn_impl == "eager" and kvcache_block_size != max_seq_len:
+            raise ValueError(
+                f" When using 'eager attention', the `kvcache_block_size` ({kvcache_block_size}) "
+                f"must always be set equal to the `max_seq_len` {max_seq_len}."
+            )
+
+
+def validate_sliding_window(rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
+    if rbln_config.sliding_window > MAX_SLIDING_WINDOW_SIZE - rbln_config.prefill_chunk_size:
+        raise ValueError(
+            f"Sliding window size ({rbln_config.sliding_window}) must be less than 32768 - prefill_chunk_size ({32768 - rbln_config.prefill_chunk_size})"
+        )
+
+    if rbln_config.cache_impl == "sliding_window" and rbln_config.use_attention_mask:
+        raise ValueError("`use_attention_mask` must be set to False when `cache_impl` is set to 'sliding_window'.")
+
+
+class RBLNDecoderOnlyFlashAttentionMixin:
+    @classmethod
+    def get_maximum_num_blocks(
+        cls,
+        config: "PretrainedConfig",
+        tensor_parallel_size: int,
+        kvcache_block_size: int,
+        nbits_per_param: Optional[int] = None,
+        n_model_params: Optional[int] = None,
+        kernel_size: Optional[int] = None,
+        buffer: Optional[int] = None,
+        num_runtimes: int = 2,
+    ) -> int:
+        # We are finding max_n_blocks(x) that satisfies the following equation:
+
+        # available_dram - kernel_size - buffer
+        #     - num_layers * 2 * tensor_parallel_size
+        #     * align_2MB(
+        #         x
+        #         * block_size
+        #         * align_64(head_dim)
+        #         * math.ceil(num_key_value_heads / tensor_parallel_size)
+        #         * 2
+        #     ) > 0
+
+        # This inequality can be rewritten as follows:
+
+        # a - c * align_2MB(b * x) > 0
+        # where
+        #    a = available_dram - kernel_size - buffer
+        #    b = block_size * align_64(head_dim) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
+        #    c = num_layers * 2 * tensor_parallel_size
+
+        # We can rewrite the inequality as follows:
+        # k > align_2MB(b*x)
+        # where
+        #    k = a / c
+
+        # After that, we can derive the following equation:
+        # x = floor(2**21 / b * floor((k - 1) / 2**21))
+
+        def align(x: int, nbytes: int) -> int:
+            return int(math.ceil(x / nbytes) * nbytes)
+
+        def align_2MB(x: int) -> int:
+            return align(x, 2**21)
+
+        num_attention_heads = getattr(config, "n_head", None) or getattr(config, "num_attention_heads")
+        num_layers = getattr(config, "n_layer", None) or getattr(config, "num_hidden_layers")
+        head_dim = getattr(config, "head_dim", None) or config.hidden_size // num_attention_heads
+        vocab_size = config.vocab_size
+        hidden_size = getattr(config, "n_embd", None) or getattr(config, "hidden_size")
+        num_key_value_heads = getattr(config, "num_key_value_heads", None) or num_attention_heads
+
+        # TODO(jongho): Update if target npu is REBEL.
+        ATOM_DRAM_NBYTES = 16 * 2**30
+        ATOM_SYS_DRAM_NBYTES = 288 * 2**20
+        available_dram = tensor_parallel_size * (ATOM_DRAM_NBYTES - ATOM_SYS_DRAM_NBYTES)
+
+        if kernel_size is None:
+            if n_model_params is None:
+                raise ValueError("`n_model_params` should be specified to estimate the kernel memory.")
+            # Get estimated kernel size (approximated)
+            lm_heads_params = align(vocab_size, 64) * hidden_size
+            lm_heads_nbytes = (
+                align_2MB(lm_heads_params * nbits_per_param // 8 / tensor_parallel_size) * tensor_parallel_size
+            )
+            params = n_model_params - lm_heads_params
+            layer_nbytes = (
+                align_2MB(params * nbits_per_param // 8 / num_layers / tensor_parallel_size)
+                * num_layers
+                * tensor_parallel_size
+            )
+            kernel_size = layer_nbytes + lm_heads_nbytes
+        elif n_model_params is not None:
+            raise ValueError("Both `n_model_params` and `kernel_size` cannot be specified.")
+
+        available_dram -= kernel_size
+
+        if buffer is None:
+            # TODO: Accurate buffer estimation
+            buffer_per_runtime_per_core = 2**28  # 256MB per runtime
+            buffer_per_core = buffer_per_runtime_per_core * num_runtimes  # 1 for prefill, 1 for decoder
+            buffer = buffer_per_core * tensor_parallel_size
+        available_dram -= buffer
+
+        b = kvcache_block_size * align(head_dim, 64) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
+        c = num_layers * 2 * tensor_parallel_size
+        k = available_dram / c
+        max_n_blocks = math.floor(2**21 / b * math.floor((k - 1) / 2**21))
+
+        return max_n_blocks
+
+    @classmethod
+    def maybe_suggest_kvcache_num_blocks(
+        cls,
+        compiled_models: Dict[str, "RBLNCompiledModel"],
+        model_config: "PretrainedConfig",
+        rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
+    ) -> None:
+        # Get the actual memory allocation of each node by key
+        alloc_memory_per_node_by_key: Dict[str, List[int]] = compiled_models["prefill"].get_alloc_per_node_by_key()
+        alloc_memory_by_key: Dict[str, int] = {
+            key: sum(memory_per_node) for key, memory_per_node in alloc_memory_per_node_by_key.items()
+        }
+        for batch_size in rbln_config.decoder_batch_sizes:
+            for key, memory_per_node in (
+                compiled_models[f"decoder_batch_{batch_size}"].get_alloc_per_node_by_key().items()
+            ):
+                alloc_memory_by_key[key] += sum(memory_per_node)
+        alloc_memory_by_key.pop("PortRecur", None)  # Old compiler's kv-cache Key
+        alloc_memory_by_key.pop("DramTensor", None)  # kv-cache
+        kernel_size = alloc_memory_by_key.pop("Kernel")  # model weight
+
+        # Get the maximum number of blocks that can be allocated
+        buffer = sum(alloc_memory_by_key.values())
+        max_num_blocks = cls.get_maximum_num_blocks(
+            config=model_config,
+            tensor_parallel_size=rbln_config.tensor_parallel_size,
+            kvcache_block_size=rbln_config.kvcache_block_size,
+            kernel_size=kernel_size,
+            buffer=buffer,
+        )
+
+        # Since our estimation logic is not always accurate,
+        # users can set `kvcache_num_blocks` to `max_num_blocks`.
+        # If the memory is not enough, the model will fail to compile.
+        if rbln_config.kvcache_num_blocks < max_num_blocks:
+            logger.warning(
+                f"Current `kvcache_num_blocks` setting is {rbln_config.kvcache_num_blocks}. "
+                "Our analysis indicates that additional memory is available for more blocks. "
+                f"Consider increasing `kvcache_num_blocks` to {max_num_blocks} for potentially improved performance. "
+                "Please be advised that our memory estimation algorithm has limitations, "
+                "and increasing this value may not guarantee successful model compilation."
+            )
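
The hunk above is a newly added module; it matches the optimum/rbln/transformers/modeling_attention_utils.py entry (+252 lines) in the file list. It centralizes the defaults and validation for the eager vs. flash-attention KV-cache settings, and adds a mixin that estimates how many KV-cache blocks fit in device DRAM once kernel weights and runtime buffers are subtracted. Below is a minimal usage sketch, assuming the helpers are importable directly from that module path; it is illustrative only and not part of the wheel diff.

# Illustrative sketch, not part of the wheel diff. Assumes the helpers above are
# importable from the new modeling_attention_utils module listed in this diff.
from optimum.rbln.transformers.modeling_attention_utils import (
    set_default_values,
    validate_attention_method,
)

# Explicit flash attention: the partition length defaults to 16_384 and the
# KV-cache block size follows the partition length.
attn_impl, partition_len, block_size = set_default_values(
    attn_impl="flash_attn", max_seq_len=32_768
)
print(attn_impl, partition_len, block_size)  # flash_attn 16384 16384

# Passing only `kvcache_partition_len` upgrades 'eager' to 'flash_attn' (with a
# warning), since KV-cache partitioning is a flash-attention-only feature.
attn_impl, partition_len, block_size = set_default_values(
    kvcache_partition_len=8_192, max_seq_len=32_768
)
print(attn_impl, partition_len, block_size)  # flash_attn 8192 8192

# Validation enforces the constraints spelled out in the comments above: max_seq_len
# must be at least twice the partition length and a multiple of it, the partition
# length must lie in [4_096, 32_768], and max_seq_len must be at least 8_192.
validate_attention_method(attn_impl, partition_len, block_size, max_seq_len=32_768)

# An eager configuration longer than 32_768 tokens is rejected.
try:
    validate_attention_method("eager", None, 65_536, max_seq_len=65_536)
except ValueError as err:
    print(err)

Under eager attention the block size is tied to max_seq_len, while under flash attention it is tied to the partition length; validate_attention_method rejects any combination that breaks that pairing.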

@@ -23,6 +23,7 @@ different model architectures.
 import inspect
 from typing import TYPE_CHECKING, Optional, Union
 
+from torch import nn
 from transformers import (
     AutoModel,
     AutoModelForAudioClassification,
@@ -34,10 +35,7 @@ from transformers import (
     AutoModelForTextEncoding,
     PretrainedConfig,
 )
-from transformers.modeling_outputs import (
-    BaseModelOutput,
-    QuestionAnsweringModelOutput,
-)
+from transformers.modeling_outputs import BaseModelOutput, QuestionAnsweringModelOutput
 
 from ..configuration_utils import RBLNCompileConfig
 from ..modeling import RBLNModel
@@ -60,6 +58,28 @@ class RBLNTransformerEncoder(RBLNModel):
     rbln_model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
     rbln_dtype = "int64"
 
+    @classmethod
+    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNTransformerEncoderConfig) -> nn.Module:
+        class TransformerEncoderWrapper(nn.Module):
+            # Parameters to disable for RBLN compilation
+            DISABLED_PARAMS = {"return_dict", "use_cache"}
+
+            def __init__(self, model: "PreTrainedModel", rbln_config: RBLNTransformerEncoderConfig):
+                super().__init__()
+                self.model = model
+                self.rbln_config = rbln_config
+                self._forward_signature = inspect.signature(model.forward)
+
+            def forward(self, *args, **kwargs):
+                # Disable parameters that are not compatible with RBLN compilation
+                for param_name in self.DISABLED_PARAMS:
+                    if param_name in self._forward_signature.parameters:
+                        kwargs[param_name] = False
+
+                return self.model(*args, **kwargs)
+
+        return TransformerEncoderWrapper(model, rbln_config).eval()
+
     @classmethod
     def _update_rbln_config(
         cls,
@@ -130,10 +150,18 @@ class RBLNTransformerEncoder(RBLNModel):
                 "This is an internal error. Please report it to the developers."
             )
 
-
-
-
-
+        if rbln_config.model_input_shapes is None:
+            input_info = [
+                (model_input_name, [rbln_config.batch_size, rbln_config.max_seq_len], cls.rbln_dtype)
+                for model_input_name in rbln_config.model_input_names
+            ]
+        else:
+            input_info = [
+                (model_input_name, model_input_shape, cls.rbln_dtype)
+                for model_input_name, model_input_shape in zip(
+                    rbln_config.model_input_names, rbln_config.model_input_shapes
+                )
+            ]
 
         rbln_config.set_compile_cfgs([RBLNCompileConfig(input_info=input_info)])
         return rbln_config
@@ -203,7 +231,6 @@ class RBLNModelForQuestionAnswering(RBLNTransformerEncoder):
 
     def _prepare_output(self, output, return_dict):
         # Prepare QuestionAnswering specific output format.
-
         start_logits, end_logits = output
 
         if not return_dict:
@@ -240,6 +267,20 @@ class RBLNModelForImageClassification(RBLNImageModel):
 class RBLNModelForDepthEstimation(RBLNImageModel):
     auto_model_class = AutoModelForDepthEstimation
 
+    @classmethod
+    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNImageModelConfig):
+        class ImageModelWrapper(nn.Module):
+            def __init__(self, model: "PreTrainedModel", rbln_config: RBLNImageModelConfig):
+                super().__init__()
+                self.model = model
+                self.rbln_config = rbln_config
+
+            def forward(self, *args, **kwargs):
+                output = self.model(*args, return_dict=True, **kwargs)
+                return output.predicted_depth
+
+        return ImageModelWrapper(model, rbln_config).eval()
+
 
 class RBLNModelForAudioClassification(RBLNModel):
     """
@@ -248,6 +289,7 @@ class RBLNModelForAudioClassification(RBLNModel):
 
     A class to convert and run pre-trained transformers based AudioClassification models on RBLN devices.
     It implements the methods to convert a pre-trained transformers AudioClassification model into a RBLN transformer model by:
+
     - transferring the checkpoint weights of the original into an optimized RBLN graph,
     - compiling the resulting graph using the RBLN compiler.
 
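
The hunks above appear to come from optimum/rbln/transformers/modeling_generic.py (+51 -9 in the file list). They add wrap_model_if_needed hooks that wrap the Hugging Face model in a small nn.Module before compilation: the encoder wrapper forces return_dict and use_cache to False whenever the wrapped forward accepts them, and the depth-estimation wrapper calls the model with return_dict=True and returns only predicted_depth. The following standalone sketch reproduces the encoder pattern with a hypothetical ToyEncoder standing in for a real pretrained model.

# Standalone sketch of the wrapper pattern above. `ToyEncoder` is hypothetical and
# exists only to show how the disabled kwargs change the output type.
import inspect

import torch
from torch import nn


class ToyEncoder(nn.Module):
    def forward(self, input_ids, attention_mask=None, return_dict=True, use_cache=True):
        hidden = input_ids.float().unsqueeze(-1)
        if return_dict:
            return {"last_hidden_state": hidden}
        return (hidden,)


class TransformerEncoderWrapper(nn.Module):
    # Keyword arguments that are forced off before tracing/compilation.
    DISABLED_PARAMS = {"return_dict", "use_cache"}

    def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model
        self._forward_signature = inspect.signature(model.forward)

    def forward(self, *args, **kwargs):
        # Only override kwargs the wrapped forward actually declares.
        for param_name in self.DISABLED_PARAMS:
            if param_name in self._forward_signature.parameters:
                kwargs[param_name] = False
        return self.model(*args, **kwargs)


wrapped = TransformerEncoderWrapper(ToyEncoder()).eval()
out = wrapped(torch.ones(1, 4, dtype=torch.int64))
print(type(out))  # tuple: return_dict was forced to False inside the wrapper

Keeping the wrapped model's outputs positional rather than a ModelOutput dict is generally what a traced graph needs, which is why these parameters are pinned in the wrapper instead of being left to the caller.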

@@ -0,0 +1,37 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+from transformers.modeling_outputs import ModelOutput
+
+
+@dataclass
+class RBLNDecoderOnlyOutput(ModelOutput):
+    logits: torch.FloatTensor = None
+    generate_idx: torch.Tensor = None
+    padded_cache_lengths: int = None
+
+
+@dataclass
+class RBLNGemma3ForCausalLMOutput(RBLNDecoderOnlyOutput):
+    attention_mask: Optional[torch.Tensor] = None
+
+
+@dataclass
+class RBLNSeq2SeqTSDecoderOutput(ModelOutput):
+    last_hidden_states: torch.FloatTensor = None
+    params: Tuple[torch.FloatTensor] = None
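
The 37-line hunk above lines up with the new optimum/rbln/transformers/modeling_outputs.py in the file list; it gathers the ModelOutput dataclasses shared by the decoder-only, Gemma3, and time-series runtimes into one module. A small usage sketch, assuming the dataclasses are importable from that path:

# Illustrative sketch, not part of the wheel diff.
import torch

from optimum.rbln.transformers.modeling_outputs import RBLNDecoderOnlyOutput

output = RBLNDecoderOnlyOutput(
    logits=torch.zeros(1, 1, 32_000),  # e.g. [batch, generated position, vocab]
    generate_idx=torch.tensor([[5]]),  # bookkeeping fields consumed by the RBLN generation loop
    padded_cache_lengths=0,
)

# Like any transformers ModelOutput, fields are reachable by attribute or by index
# in declaration order.
print(output.logits.shape)
print(output[0].shape)  # same tensor as output.logits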

@@ -36,6 +36,8 @@ _import_structure = {
         "RBLNAutoModelForSpeechSeq2Seq",
         "RBLNAutoModelForVision2Seq",
         "RBLNAutoModelForImageTextToText",
+        "RBLNAutoModelForTextEncoding",
+        "RBLNAutoModelForZeroShotObjectDetection",
     ],
     "bart": [
         "RBLNBartForConditionalGeneration",
@@ -73,6 +75,10 @@ _import_structure = {
         "RBLNColPaliForRetrieval",
         "RBLNColPaliForRetrievalConfig",
     ],
+    "colqwen2": [
+        "RBLNColQwen2ForRetrieval",
+        "RBLNColQwen2ForRetrievalConfig",
+    ],
     "distilbert": [
         "RBLNDistilBertForQuestionAnswering",
         "RBLNDistilBertForQuestionAnsweringConfig",
@@ -83,36 +89,60 @@ _import_structure = {
         "RBLNQwen2_5_VLForConditionalGeneration",
         "RBLNQwen2_5_VLForConditionalGenerationConfig",
     ],
+    "qwen2_vl": [
+        "RBLNQwen2VisionTransformerPretrainedModel",
+        "RBLNQwen2VisionTransformerPretrainedModelConfig",
+        "RBLNQwen2VLForConditionalGeneration",
+        "RBLNQwen2VLForConditionalGenerationConfig",
+    ],
     "decoderonly": [
+        "RBLNDecoderOnlyModelConfig",
+        "RBLNDecoderOnlyModel",
         "RBLNDecoderOnlyModelForCausalLM",
         "RBLNDecoderOnlyModelForCausalLMConfig",
+        "RBLNLoRAAdapterConfig",
+        "RBLNLoRAConfig",
     ],
+    "depth_anything": ["RBLNDepthAnythingForDepthEstimationConfig", "RBLNDepthAnythingForDepthEstimation"],
     "dpt": [
         "RBLNDPTForDepthEstimation",
         "RBLNDPTForDepthEstimationConfig",
     ],
     "exaone": ["RBLNExaoneForCausalLM", "RBLNExaoneForCausalLMConfig"],
-    "gemma": ["RBLNGemmaForCausalLM", "RBLNGemmaForCausalLMConfig"],
+    "gemma": ["RBLNGemmaForCausalLM", "RBLNGemmaForCausalLMConfig", "RBLNGemmaModel", "RBLNGemmaModelConfig"],
     "gemma3": [
         "RBLNGemma3ForCausalLM",
         "RBLNGemma3ForCausalLMConfig",
         "RBLNGemma3ForConditionalGeneration",
         "RBLNGemma3ForConditionalGenerationConfig",
     ],
-    "gpt2": ["RBLNGPT2LMHeadModel", "RBLNGPT2LMHeadModelConfig"],
+    "gpt2": ["RBLNGPT2LMHeadModel", "RBLNGPT2LMHeadModelConfig", "RBLNGPT2Model", "RBLNGPT2ModelConfig"],
     "idefics3": [
         "RBLNIdefics3VisionTransformer",
         "RBLNIdefics3ForConditionalGeneration",
         "RBLNIdefics3ForConditionalGenerationConfig",
         "RBLNIdefics3VisionTransformerConfig",
     ],
-    "
-    "
+    "llava": ["RBLNLlavaForConditionalGeneration", "RBLNLlavaForConditionalGenerationConfig"],
+    "llama": ["RBLNLlamaForCausalLM", "RBLNLlamaForCausalLMConfig", "RBLNLlamaModel", "RBLNLlamaModelConfig"],
+    "opt": ["RBLNOPTForCausalLM", "RBLNOPTForCausalLMConfig", "RBLNOPTModel", "RBLNOPTModelConfig"],
+    "pegasus": [
+        "RBLNPegasusForConditionalGeneration",
+        "RBLNPegasusModel",
+        "RBLNPegasusForConditionalGenerationConfig",
+        "RBLNPegasusModelConfig",
+    ],
     "llava_next": ["RBLNLlavaNextForConditionalGeneration", "RBLNLlavaNextForConditionalGenerationConfig"],
     "midm": ["RBLNMidmLMHeadModel", "RBLNMidmLMHeadModelConfig"],
-    "
-    "
-
+    "pixtral": ["RBLNPixtralVisionModel", "RBLNPixtralVisionModelConfig"],
+    "mistral": [
+        "RBLNMistralForCausalLM",
+        "RBLNMistralForCausalLMConfig",
+        "RBLNMistralModel",
+        "RBLNMistralModelConfig",
+    ],
+    "phi": ["RBLNPhiForCausalLM", "RBLNPhiForCausalLMConfig", "RBLNPhiModel", "RBLNPhiModelConfig"],
+    "qwen2": ["RBLNQwen2ForCausalLM", "RBLNQwen2ForCausalLMConfig", "RBLNQwen2Model", "RBLNQwen2ModelConfig"],
     "qwen3": ["RBLNQwen3ForCausalLM", "RBLNQwen3ForCausalLMConfig", "RBLNQwen3Model", "RBLNQwen3ModelConfig"],
     "resnet": ["RBLNResNetForImageClassification", "RBLNResNetForImageClassificationConfig"],
     "roberta": [
@@ -125,6 +155,10 @@ _import_structure = {
         "RBLNSiglipVisionModel",
         "RBLNSiglipVisionModelConfig",
     ],
+    "swin": [
+        "RBLNSwinBackbone",
+        "RBLNSwinBackboneConfig",
+    ],
     "time_series_transformer": [
         "RBLNTimeSeriesTransformerForPrediction",
         "RBLNTimeSeriesTransformerForPredictionConfig",
@@ -144,13 +178,18 @@ _import_structure = {
         "RBLNXLMRobertaForSequenceClassification",
         "RBLNXLMRobertaForSequenceClassificationConfig",
     ],
+    "grounding_dino": [
+        "RBLNGroundingDinoForObjectDetection",
+        "RBLNGroundingDinoForObjectDetectionConfig",
+        "RBLNGroundingDinoEncoder",
+        "RBLNGroundingDinoEncoderConfig",
+        "RBLNGroundingDinoDecoder",
+        "RBLNGroundingDinoDecoderConfig",
+    ],
 }
 
 if TYPE_CHECKING:
-    from .audio_spectrogram_transformer import (
-        RBLNASTForAudioClassification,
-        RBLNASTForAudioClassificationConfig,
-    )
+    from .audio_spectrogram_transformer import RBLNASTForAudioClassification, RBLNASTForAudioClassificationConfig
    from .auto import (
         RBLNAutoModel,
         RBLNAutoModelForAudioClassification,
@@ -164,7 +203,9 @@ if TYPE_CHECKING:
         RBLNAutoModelForSeq2SeqLM,
         RBLNAutoModelForSequenceClassification,
         RBLNAutoModelForSpeechSeq2Seq,
+        RBLNAutoModelForTextEncoding,
         RBLNAutoModelForVision2Seq,
+        RBLNAutoModelForZeroShotObjectDetection,
     )
     from .bart import (
         RBLNBartForConditionalGeneration,
@@ -198,50 +239,69 @@ if TYPE_CHECKING:
         RBLNCLIPVisionModelWithProjection,
         RBLNCLIPVisionModelWithProjectionConfig,
     )
-    from .colpali import (
-        RBLNColPaliForRetrieval,
-        RBLNColPaliForRetrievalConfig,
-    )
+    from .colpali import RBLNColPaliForRetrieval, RBLNColPaliForRetrievalConfig
+    from .colqwen2 import RBLNColQwen2ForRetrieval, RBLNColQwen2ForRetrievalConfig
     from .decoderonly import (
+        RBLNDecoderOnlyModel,
+        RBLNDecoderOnlyModelConfig,
         RBLNDecoderOnlyModelForCausalLM,
         RBLNDecoderOnlyModelForCausalLMConfig,
+        RBLNLoRAAdapterConfig,
+        RBLNLoRAConfig,
     )
-    from .distilbert import (
-        RBLNDistilBertForQuestionAnswering,
-        RBLNDistilBertForQuestionAnsweringConfig,
-    )
-    from .dpt import (
-        RBLNDPTForDepthEstimation,
-        RBLNDPTForDepthEstimationConfig,
-    )
+    from .depth_anything import RBLNDepthAnythingForDepthEstimation, RBLNDepthAnythingForDepthEstimationConfig
+    from .distilbert import RBLNDistilBertForQuestionAnswering, RBLNDistilBertForQuestionAnsweringConfig
+    from .dpt import RBLNDPTForDepthEstimation, RBLNDPTForDepthEstimationConfig
     from .exaone import RBLNExaoneForCausalLM, RBLNExaoneForCausalLMConfig
-    from .gemma import RBLNGemmaForCausalLM, RBLNGemmaForCausalLMConfig
+    from .gemma import RBLNGemmaForCausalLM, RBLNGemmaForCausalLMConfig, RBLNGemmaModel, RBLNGemmaModelConfig
     from .gemma3 import (
         RBLNGemma3ForCausalLM,
         RBLNGemma3ForCausalLMConfig,
         RBLNGemma3ForConditionalGeneration,
         RBLNGemma3ForConditionalGenerationConfig,
     )
-    from .gpt2 import RBLNGPT2LMHeadModel, RBLNGPT2LMHeadModelConfig
+    from .gpt2 import RBLNGPT2LMHeadModel, RBLNGPT2LMHeadModelConfig, RBLNGPT2Model, RBLNGPT2ModelConfig
+    from .grounding_dino import (
+        RBLNGroundingDinoDecoder,
+        RBLNGroundingDinoDecoderConfig,
+        RBLNGroundingDinoEncoder,
+        RBLNGroundingDinoEncoderConfig,
+        RBLNGroundingDinoForObjectDetection,
+        RBLNGroundingDinoForObjectDetectionConfig,
+    )
     from .idefics3 import (
         RBLNIdefics3ForConditionalGeneration,
         RBLNIdefics3ForConditionalGenerationConfig,
         RBLNIdefics3VisionTransformer,
         RBLNIdefics3VisionTransformerConfig,
     )
-    from .llama import RBLNLlamaForCausalLM, RBLNLlamaForCausalLMConfig
+    from .llama import RBLNLlamaForCausalLM, RBLNLlamaForCausalLMConfig, RBLNLlamaModel, RBLNLlamaModelConfig
+    from .llava import RBLNLlavaForConditionalGeneration, RBLNLlavaForConditionalGenerationConfig
     from .llava_next import RBLNLlavaNextForConditionalGeneration, RBLNLlavaNextForConditionalGenerationConfig
     from .midm import RBLNMidmLMHeadModel, RBLNMidmLMHeadModelConfig
-    from .mistral import RBLNMistralForCausalLM, RBLNMistralForCausalLMConfig
-    from .opt import RBLNOPTForCausalLM, RBLNOPTForCausalLMConfig
-    from .phi import RBLNPhiForCausalLM, RBLNPhiForCausalLMConfig
-    from .qwen2 import RBLNQwen2ForCausalLM, RBLNQwen2ForCausalLMConfig
+    from .mistral import RBLNMistralForCausalLM, RBLNMistralForCausalLMConfig, RBLNMistralModel, RBLNMistralModelConfig
+    from .opt import RBLNOPTForCausalLM, RBLNOPTForCausalLMConfig, RBLNOPTModel, RBLNOPTModelConfig
+    from .pegasus import (
+        RBLNPegasusForConditionalGeneration,
+        RBLNPegasusForConditionalGenerationConfig,
+        RBLNPegasusModel,
+        RBLNPegasusModelConfig,
+    )
+    from .phi import RBLNPhiForCausalLM, RBLNPhiForCausalLMConfig, RBLNPhiModel, RBLNPhiModelConfig
+    from .pixtral import RBLNPixtralVisionModel, RBLNPixtralVisionModelConfig
+    from .qwen2 import RBLNQwen2ForCausalLM, RBLNQwen2ForCausalLMConfig, RBLNQwen2Model, RBLNQwen2ModelConfig
     from .qwen2_5_vl import (
         RBLNQwen2_5_VisionTransformerPretrainedModel,
         RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
         RBLNQwen2_5_VLForConditionalGeneration,
         RBLNQwen2_5_VLForConditionalGenerationConfig,
     )
+    from .qwen2_vl import (
+        RBLNQwen2VisionTransformerPretrainedModel,
+        RBLNQwen2VisionTransformerPretrainedModelConfig,
+        RBLNQwen2VLForConditionalGeneration,
+        RBLNQwen2VLForConditionalGenerationConfig,
+    )
     from .qwen3 import RBLNQwen3ForCausalLM, RBLNQwen3ForCausalLMConfig, RBLNQwen3Model, RBLNQwen3ModelConfig
     from .resnet import RBLNResNetForImageClassification, RBLNResNetForImageClassificationConfig
     from .roberta import (
@@ -251,6 +311,7 @@ if TYPE_CHECKING:
         RBLNRobertaForSequenceClassificationConfig,
     )
     from .siglip import RBLNSiglipVisionModel, RBLNSiglipVisionModelConfig
+    from .swin import RBLNSwinBackbone, RBLNSwinBackboneConfig
     from .t5 import (
         RBLNT5EncoderModel,
         RBLNT5EncoderModelConfig,