optimum-rbln 0.8.2a0__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +116 -9
- optimum/rbln/__version__.py +16 -3
- optimum/rbln/cli.py +660 -0
- optimum/rbln/configuration_utils.py +171 -43
- optimum/rbln/diffusers/__init__.py +19 -0
- optimum/rbln/diffusers/configurations/__init__.py +3 -0
- optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +1 -1
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
- optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +4 -4
- optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +12 -4
- optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +9 -4
- optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
- optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +3 -3
- optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +35 -19
- optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +14 -11
- optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +30 -20
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +13 -9
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +17 -13
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +17 -10
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
- optimum/rbln/diffusers/modeling_diffusers.py +33 -18
- optimum/rbln/diffusers/models/__init__.py +4 -0
- optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +32 -3
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +32 -6
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
- optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
- optimum/rbln/diffusers/models/autoencoders/vq_model.py +32 -3
- optimum/rbln/diffusers/models/controlnet.py +16 -1
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +17 -3
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +26 -3
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +23 -2
- optimum/rbln/diffusers/models/unets/__init__.py +1 -0
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +23 -4
- optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
- optimum/rbln/diffusers/pipelines/__init__.py +15 -5
- optimum/rbln/diffusers/pipelines/auto_pipeline.py +307 -0
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
- optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +23 -12
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +16 -46
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +31 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +31 -1
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -6
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
- optimum/rbln/modeling.py +50 -24
- optimum/rbln/modeling_base.py +116 -35
- optimum/rbln/ops/attn.py +158 -0
- optimum/rbln/ops/flash_attn.py +166 -0
- optimum/rbln/ops/kv_cache_update.py +5 -0
- optimum/rbln/ops/linear.py +7 -0
- optimum/rbln/transformers/__init__.py +100 -0
- optimum/rbln/transformers/configuration_generic.py +7 -32
- optimum/rbln/transformers/modeling_attention_utils.py +385 -0
- optimum/rbln/transformers/modeling_generic.py +48 -65
- optimum/rbln/transformers/modeling_outputs.py +37 -0
- optimum/rbln/transformers/models/__init__.py +93 -30
- optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
- optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
- optimum/rbln/transformers/models/auto/__init__.py +2 -0
- optimum/rbln/transformers/models/auto/auto_factory.py +92 -17
- optimum/rbln/transformers/models/auto/modeling_auto.py +45 -0
- optimum/rbln/transformers/models/bart/bart_architecture.py +2 -7
- optimum/rbln/transformers/models/bart/configuration_bart.py +2 -0
- optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
- optimum/rbln/transformers/models/bert/bert_architecture.py +16 -0
- optimum/rbln/transformers/models/bert/modeling_bert.py +93 -4
- optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +42 -11
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +135 -44
- optimum/rbln/transformers/models/clip/configuration_clip.py +21 -7
- optimum/rbln/transformers/models/clip/modeling_clip.py +183 -27
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +3 -6
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +37 -21
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +82 -104
- optimum/rbln/transformers/models/colqwen2/__init__.py +2 -0
- optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +233 -0
- optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +74 -0
- optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +446 -0
- optimum/rbln/transformers/models/decoderonly/__init__.py +3 -2
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +114 -37
- optimum/rbln/transformers/models/decoderonly/configuration_lora.py +411 -0
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +323 -316
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +508 -0
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +119 -0
- optimum/rbln/transformers/models/decoderonly/lora_architecture.py +204 -0
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +486 -892
- optimum/rbln/transformers/models/depth_anything/__init__.py +16 -0
- optimum/rbln/transformers/models/depth_anything/configuration_depth_anything.py +24 -0
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +42 -0
- optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +42 -4
- optimum/rbln/transformers/models/gemma/__init__.py +2 -2
- optimum/rbln/transformers/models/gemma/configuration_gemma.py +9 -1
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -4
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +22 -1
- optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +49 -14
- optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +12 -2
- optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +245 -0
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +212 -504
- optimum/rbln/transformers/models/gpt2/__init__.py +2 -2
- optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +31 -3
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +10 -8
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +18 -1
- optimum/rbln/transformers/models/grounding_dino/__init__.py +10 -0
- optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +92 -0
- optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +599 -0
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +1048 -0
- optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +35 -7
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +29 -32
- optimum/rbln/transformers/models/llama/__init__.py +2 -2
- optimum/rbln/transformers/models/llama/configuration_llama.py +9 -1
- optimum/rbln/transformers/models/llama/modeling_llama.py +22 -1
- optimum/rbln/transformers/models/llava/__init__.py +16 -0
- optimum/rbln/transformers/models/llava/configuration_llava.py +72 -0
- optimum/rbln/transformers/models/llava/modeling_llava.py +490 -0
- optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +21 -6
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +234 -376
- optimum/rbln/transformers/models/midm/midm_architecture.py +4 -1
- optimum/rbln/transformers/models/midm/modeling_midm.py +42 -4
- optimum/rbln/transformers/models/mistral/__init__.py +2 -2
- optimum/rbln/transformers/models/mistral/configuration_mistral.py +9 -1
- optimum/rbln/transformers/models/mistral/mistral_architecture.py +1 -1
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +26 -3
- optimum/rbln/transformers/models/opt/__init__.py +2 -2
- optimum/rbln/transformers/models/opt/configuration_opt.py +8 -1
- optimum/rbln/transformers/models/opt/modeling_opt.py +29 -17
- optimum/rbln/transformers/models/opt/opt_architecture.py +4 -4
- optimum/rbln/transformers/models/pegasus/__init__.py +17 -0
- optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +38 -0
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +71 -0
- optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +161 -0
- optimum/rbln/transformers/models/phi/__init__.py +2 -2
- optimum/rbln/transformers/models/phi/configuration_phi.py +9 -1
- optimum/rbln/transformers/models/phi/modeling_phi.py +10 -1
- optimum/rbln/transformers/models/phi/phi_architecture.py +11 -7
- optimum/rbln/transformers/models/pixtral/__init__.py +16 -0
- optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +43 -0
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +322 -0
- optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +73 -0
- optimum/rbln/transformers/models/qwen2/__init__.py +2 -2
- optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +9 -1
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +27 -1
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +21 -6
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +15 -22
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +28 -7
- optimum/rbln/transformers/models/qwen2_vl/__init__.py +19 -0
- optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +88 -0
- optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +513 -0
- optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +165 -0
- optimum/rbln/transformers/models/qwen3/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +71 -0
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +133 -0
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +31 -0
- optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
- optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
- optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +21 -16
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +60 -13
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +2 -2
- optimum/rbln/transformers/models/siglip/__init__.py +2 -6
- optimum/rbln/transformers/models/siglip/configuration_siglip.py +1 -1
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +21 -16
- optimum/rbln/transformers/models/swin/__init__.py +16 -0
- optimum/rbln/transformers/models/swin/configuration_swin.py +42 -0
- optimum/rbln/transformers/models/swin/modeling_swin.py +354 -0
- optimum/rbln/transformers/models/t5/configuration_t5.py +2 -0
- optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
- optimum/rbln/transformers/models/t5/t5_architecture.py +8 -1
- optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +3 -3
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +22 -16
- optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +7 -1
- optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
- optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +61 -8
- optimum/rbln/transformers/models/whisper/configuration_whisper.py +12 -13
- optimum/rbln/transformers/models/whisper/generation_whisper.py +62 -6
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +32 -5
- optimum/rbln/transformers/models/xlm_roberta/__init__.py +2 -8
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +43 -0
- optimum/rbln/transformers/utils/rbln_quantization.py +400 -75
- optimum/rbln/transformers/utils/rbln_runtime_wrapper.py +79 -0
- optimum/rbln/utils/deprecation.py +213 -0
- optimum/rbln/utils/hub.py +22 -50
- optimum/rbln/utils/runtime_utils.py +85 -17
- optimum/rbln/utils/submodule.py +31 -9
- {optimum_rbln-0.8.2a0.dist-info → optimum_rbln-0.9.3.dist-info}/METADATA +8 -7
- optimum_rbln-0.9.3.dist-info/RECORD +264 -0
- {optimum_rbln-0.8.2a0.dist-info → optimum_rbln-0.9.3.dist-info}/WHEEL +1 -1
- optimum_rbln-0.9.3.dist-info/entry_points.txt +2 -0
- optimum_rbln-0.8.2a0.dist-info/RECORD +0 -211
- {optimum_rbln-0.8.2a0.dist-info → optimum_rbln-0.9.3.dist-info}/licenses/LICENSE +0 -0
--- a/optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py
+++ b/optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Any,
+from typing import Any, Optional, Tuple

 from ....configuration_utils import RBLNModelConfig
 from ....transformers import RBLNCLIPTextModelWithProjectionConfig, RBLNT5EncoderModelConfig
@@ -40,7 +40,7 @@ class RBLNStableDiffusion3PipelineBaseConfig(RBLNModelConfig):
         height: Optional[int] = None,
         width: Optional[int] = None,
         guidance_scale: Optional[float] = None,
-        **kwargs:
+        **kwargs: Any,
     ):
         """
         Args:
@@ -64,7 +64,7 @@ class RBLNStableDiffusion3PipelineBaseConfig(RBLNModelConfig):
             height (Optional[int]): Height of the generated images.
             width (Optional[int]): Width of the generated images.
             guidance_scale (Optional[float]): Scale for classifier-free guidance.
-
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.

         Raises:
             ValueError: If both image_size and img_height/img_width are provided.
@@ -100,27 +100,31 @@ class RBLNStableDiffusion3PipelineBaseConfig(RBLNModelConfig):

         max_seq_len = max_seq_len or 256

-        self.text_encoder = self.
-
+        self.text_encoder = self.initialize_submodule_config(
+            text_encoder,
+            cls_name="RBLNCLIPTextModelWithProjectionConfig",
+            batch_size=batch_size,
         )
-        self.text_encoder_2 = self.
-
+        self.text_encoder_2 = self.initialize_submodule_config(
+            text_encoder_2,
+            cls_name="RBLNCLIPTextModelWithProjectionConfig",
+            batch_size=batch_size,
         )
-        self.text_encoder_3 = self.
-            RBLNT5EncoderModelConfig,
+        self.text_encoder_3 = self.initialize_submodule_config(
             text_encoder_3,
+            cls_name="RBLNT5EncoderModelConfig",
             batch_size=batch_size,
             max_seq_len=max_seq_len,
             model_input_names=["input_ids"],
         )
-        self.transformer = self.
-            RBLNSD3Transformer2DModelConfig,
+        self.transformer = self.initialize_submodule_config(
             transformer,
+            cls_name="RBLNSD3Transformer2DModelConfig",
             sample_size=sample_size,
         )
-        self.vae = self.
-            RBLNAutoencoderKLConfig,
+        self.vae = self.initialize_submodule_config(
             vae,
+            cls_name="RBLNAutoencoderKLConfig",
             batch_size=batch_size,
             uses_encoder=self.__class__._vae_uses_encoder,
             sample_size=image_size,
--- a/optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py
+++ b/optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Any,
+from typing import Any, Optional, Tuple

 from ....configuration_utils import RBLNModelConfig
 from ....transformers import RBLNCLIPTextModelConfig, RBLNCLIPTextModelWithProjectionConfig
@@ -38,7 +38,7 @@ class RBLNStableDiffusionXLPipelineBaseConfig(RBLNModelConfig):
         sample_size: Optional[Tuple[int, int]] = None,
         image_size: Optional[Tuple[int, int]] = None,
         guidance_scale: Optional[float] = None,
-        **kwargs:
+        **kwargs: Any,
     ):
         """
         Args:
@@ -59,7 +59,7 @@ class RBLNStableDiffusionXLPipelineBaseConfig(RBLNModelConfig):
             image_size (Optional[Tuple[int, int]]): Alternative way to specify image dimensions.
                 Cannot be used together with img_height/img_width.
             guidance_scale (Optional[float]): Scale for classifier-free guidance.
-
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.

         Raises:
             ValueError: If both image_size and img_height/img_width are provided.
@@ -93,18 +93,25 @@ class RBLNStableDiffusionXLPipelineBaseConfig(RBLNModelConfig):
         elif (img_height is not None and img_width is None) or (img_height is None and img_width is not None):
             raise ValueError("Both img_height and img_width must be provided together if used")

-        self.text_encoder = self.
-
-
+        self.text_encoder = self.initialize_submodule_config(
+            text_encoder,
+            cls_name="RBLNCLIPTextModelConfig",
+            batch_size=batch_size,
+        )
+        self.text_encoder_2 = self.initialize_submodule_config(
+            text_encoder_2,
+            cls_name="RBLNCLIPTextModelWithProjectionConfig",
+            batch_size=batch_size,
         )
-
-
+
+        self.unet = self.initialize_submodule_config(
             unet,
+            cls_name="RBLNUNet2DConditionModelConfig",
             sample_size=sample_size,
         )
-        self.vae = self.
-            RBLNAutoencoderKLConfig,
+        self.vae = self.initialize_submodule_config(
             vae,
+            cls_name="RBLNAutoencoderKLConfig",
             batch_size=batch_size,
             uses_encoder=self.__class__._vae_uses_encoder,
             sample_size=image_size,  # image size is equal to sample size in vae
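Both pipeline configs above now resolve every submodule through `initialize_submodule_config`, naming the target config class with a `cls_name` string instead of passing the class object. From the caller's side the pipeline config still fans shared arguments out to its submodules. A minimal construction sketch, assuming a concrete subclass `RBLNStableDiffusionXLPipelineConfig` and its import path (neither appears in this diff):

```python
# Hedged sketch: the class name and import path are assumptions; only the
# keyword arguments come from the Base config shown in the diff above.
from optimum.rbln import RBLNStableDiffusionXLPipelineConfig

config = RBLNStableDiffusionXLPipelineConfig(
    batch_size=1,        # fans out to the text encoders, UNet, and VAE
    img_height=1024,     # img_height and img_width must be given together
    img_width=1024,
    guidance_scale=7.5,  # used when sizing for classifier-free guidance
)
```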
--- /dev/null
+++ b/optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py
@@ -0,0 +1,114 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional
+
+from ....configuration_utils import RBLNModelConfig
+from ....transformers import RBLNCLIPVisionModelWithProjectionConfig
+from ..models import RBLNAutoencoderKLTemporalDecoderConfig, RBLNUNetSpatioTemporalConditionModelConfig
+
+
+class RBLNStableVideoDiffusionPipelineConfig(RBLNModelConfig):
+    submodules = ["image_encoder", "unet", "vae"]
+    _vae_uses_encoder = True
+
+    def __init__(
+        self,
+        image_encoder: Optional[RBLNCLIPVisionModelWithProjectionConfig] = None,
+        unet: Optional[RBLNUNetSpatioTemporalConditionModelConfig] = None,
+        vae: Optional[RBLNAutoencoderKLTemporalDecoderConfig] = None,
+        *,
+        batch_size: Optional[int] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_frames: Optional[int] = None,
+        decode_chunk_size: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
+        **kwargs: Any,
+    ):
+        """
+        Args:
+            image_encoder (Optional[RBLNCLIPVisionModelWithProjectionConfig]): Configuration for the image encoder component.
+                Initialized as RBLNCLIPVisionModelWithProjectionConfig if not provided.
+            unet (Optional[RBLNUNetSpatioTemporalConditionModelConfig]): Configuration for the UNet model component.
+                Initialized as RBLNUNetSpatioTemporalConditionModelConfig if not provided.
+            vae (Optional[RBLNAutoencoderKLTemporalDecoderConfig]): Configuration for the VAE model component.
+                Initialized as RBLNAutoencoderKLTemporalDecoderConfig if not provided.
+            batch_size (Optional[int]): Batch size for inference, applied to all submodules.
+            height (Optional[int]): Height of the generated images.
+            width (Optional[int]): Width of the generated images.
+            num_frames (Optional[int]): The number of frames in the generated video.
+            decode_chunk_size (Optional[int]): The number of frames to decode at once during VAE decoding.
+                Useful for managing memory usage during video generation.
+            guidance_scale (Optional[float]): Scale for classifier-free guidance.
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
+
+        Raises:
+            ValueError: If both image_size and height/width are provided.
+
+        Note:
+            When guidance_scale > 1.0, the UNet batch size is automatically doubled to
+            accommodate classifier-free guidance.
+        """
+        super().__init__(**kwargs)
+        if height is not None and width is not None:
+            image_size = (height, width)
+        else:
+            # Get default image size from original class to set UNet, VAE image size
+            height = self.get_default_values_for_original_cls("__call__", ["height"])["height"]
+            width = self.get_default_values_for_original_cls("__call__", ["width"])["width"]
+            image_size = (height, width)
+
+        self.image_encoder = self.initialize_submodule_config(
+            image_encoder, cls_name="RBLNCLIPVisionModelWithProjectionConfig", batch_size=batch_size
+        )
+        self.unet = self.initialize_submodule_config(
+            unet,
+            cls_name="RBLNUNetSpatioTemporalConditionModelConfig",
+            num_frames=num_frames,
+        )
+        self.vae = self.initialize_submodule_config(
+            vae,
+            cls_name="RBLNAutoencoderKLTemporalDecoderConfig",
+            batch_size=batch_size,
+            num_frames=num_frames,
+            decode_chunk_size=decode_chunk_size,
+            uses_encoder=self.__class__._vae_uses_encoder,
+            sample_size=image_size,  # image size is equal to sample size in vae
+        )
+
+        # Get default guidance scale from original class to set UNet batch size
+        if guidance_scale is None:
+            guidance_scale = self.get_default_values_for_original_cls("__call__", ["max_guidance_scale"])[
+                "max_guidance_scale"
+            ]
+
+        if not self.unet.batch_size_is_specified:
+            do_classifier_free_guidance = guidance_scale > 1.0
+            if do_classifier_free_guidance:
+                self.unet.batch_size = self.image_encoder.batch_size * 2
+            else:
+                self.unet.batch_size = self.image_encoder.batch_size
+
+    @property
+    def batch_size(self):
+        return self.vae.batch_size
+
+    @property
+    def sample_size(self):
+        return self.unet.sample_size
+
+    @property
+    def image_size(self):
+        return self.vae.sample_size
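When `height`/`width` or `max_guidance_scale` are omitted, this new config pulls defaults from the original pipeline's `__call__` signature, and it doubles the UNet batch size whenever classifier-free guidance is active. A minimal construction sketch; the import path is an assumption, while the keyword arguments come from the `__init__` above:

```python
# Hedged sketch: import path assumed; explicit height/width skip the
# defaults lookup on the original diffusers pipeline class.
from optimum.rbln import RBLNStableVideoDiffusionPipelineConfig

config = RBLNStableVideoDiffusionPipelineConfig(
    batch_size=1,
    height=576,
    width=1024,
    num_frames=25,
    decode_chunk_size=8,  # frames decoded per VAE call, bounds peak memory
)
# With the default max_guidance_scale > 1.0, unet.batch_size becomes
# image_encoder.batch_size * 2 unless a UNet batch size was set explicitly.
```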
--- a/optimum/rbln/diffusers/modeling_diffusers.py
+++ b/optimum/rbln/diffusers/modeling_diffusers.py
@@ -33,6 +33,10 @@ if TYPE_CHECKING:


 class RBLNDiffusionMixinConfig(RBLNModelConfig):
+    """
+    Configuration class for RBLN diffusion pipelines.
+    """
+
     pass


@@ -54,8 +58,8 @@ class RBLNDiffusionMixin:
     ```

     Class Variables:
-        _submodules
-        _optional_submodules
+        - `_submodules`: List of submodule names that should be compiled (typically ["text_encoder", "unet", "vae"])
+        - `_optional_submodules`: List of submodule names compiled without inheriting RBLNModel (typically ["safety_checker"])

     Methods:
         from_pretrained: Creates and optionally compiles a model from a pretrained checkpoint
@@ -70,8 +74,6 @@ class RBLNDiffusionMixin:
     _submodules = []
     _optional_submodules = []
     _prefix = {}
-    _rbln_config_class = None
-    _hf_class = None

     @staticmethod
     def _maybe_apply_and_fuse_lora(
@@ -114,14 +116,14 @@ class RBLNDiffusionMixin:
     @classmethod
     def get_rbln_config_class(cls) -> Type[RBLNModelConfig]:
         # Lazily loads and caches the corresponding RBLN model config class.
-        if cls._rbln_config_class is None:
+        if "_rbln_config_class" not in cls.__dict__ or cls._rbln_config_class is None:
             rbln_config_class_name = cls.__name__ + "Config"
             cls._rbln_config_class = get_rbln_config_class(rbln_config_class_name)
         return cls._rbln_config_class

     @classmethod
     def get_hf_class(cls):
-        if cls._hf_class is None:
+        if "_hf_class" not in cls.__dict__ or cls._hf_class is None:
             hf_cls_name = cls.__name__[4:]
             library = importlib.import_module("diffusers")
             cls._hf_class = getattr(library, hf_cls_name, None)
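The switch from `if cls._hf_class is None:` to the `cls.__dict__` membership test makes the lazy cache per-class: plain attribute lookup follows the MRO, so a subclass would otherwise pick up the value cached on its parent instead of resolving its own counterpart. It also explains why the class-level `_rbln_config_class = None` and `_hf_class = None` defaults could be deleted above. A self-contained sketch of the pitfall, with illustrative names rather than the library's:

```python
# Illustrative classes only; the string concatenation stands in for the real
# resolution done by get_rbln_config_class / get_hf_class in the diff above.
class Base:
    _cached = None

    @classmethod
    def get(cls):
        # Checking cls.__dict__ keeps the cache per-class; `cls._cached is None`
        # alone would find Base's cached value through inheritance and skip this.
        if "_cached" not in cls.__dict__ or cls._cached is None:
            cls._cached = cls.__name__ + "Resolved"
        return cls._cached


class Child(Base):
    pass


print(Base.get())   # BaseResolved
print(Child.get())  # ChildResolved (not the inherited BaseResolved)
```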
@@ -132,20 +134,20 @@ class RBLNDiffusionMixin:
         cls,
         model_id: str,
         *,
-        export: bool =
+        export: bool = None,
         model_save_dir: Optional[PathLike] = None,
         rbln_config: Dict[str, Any] = {},
         lora_ids: Optional[Union[str, List[str]]] = None,
         lora_weights_names: Optional[Union[str, List[str]]] = None,
         lora_scales: Optional[Union[float, List[float]]] = None,
-        **kwargs:
+        **kwargs: Any,
     ) -> "RBLNDiffusionMixin":
         """
         Load a pretrained diffusion pipeline from a model checkpoint, with optional compilation for RBLN NPUs.

         This method has two distinct operating modes:
-
-
+        - When `export=True`: Takes a PyTorch-based diffusion model, compiles it for RBLN NPUs, and loads the compiled model
+        - When `export=False`: Loads an already compiled RBLN model from `model_id` without recompilation

         It supports various diffusion pipelines including Stable Diffusion, Kandinsky, ControlNet, and other diffusers-based models.

@@ -172,7 +174,7 @@ class RBLNDiffusionMixin:
                 Names of specific LoRA weight files to load, corresponding to lora_ids. Only used when `export=True`.
             lora_scales:
                 Scaling factor(s) to apply to the LoRA adapter(s). Only used when `export=True`.
-
+            kwargs:
                 Additional arguments to pass to the underlying diffusion pipeline constructor or the
                 RBLN compilation process. These may include parameters specific to individual submodules
                 or the particular diffusion pipeline being used.
@@ -183,6 +185,20 @@ class RBLNDiffusionMixin:
         """
         rbln_config, kwargs = cls.get_rbln_config_class().initialize_from_kwargs(rbln_config, **kwargs)

+        if export is None:
+            export = any(
+                not RBLNModel._is_compiled(
+                    model_id,
+                    token=kwargs.get("token"),
+                    revision=kwargs.get("revision"),
+                    force_download=kwargs.get("force_download", False),
+                    cache_dir=kwargs.get("cache_dir"),
+                    subfolder=submodule_name,
+                    local_files_only=kwargs.get("local_files_only", False),
+                )
+                for submodule_name in cls._submodules
+            )
+
         if export:
             # keep submodules if user passed any of them.
             passed_submodules = {
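With `export` now defaulting to `None`, `from_pretrained` infers the mode itself: if any submodule listed in `cls._submodules` lacks compiled artifacts under `model_id`, it compiles (the `export=True` path); otherwise it loads the existing RBLN binaries. A hedged usage sketch; the concrete pipeline class and checkpoint paths are illustrative:

```python
# Hedged sketch: RBLNStableDiffusionPipeline is assumed to mix in
# RBLNDiffusionMixin; export is omitted, so the mode is auto-detected.
from optimum.rbln import RBLNStableDiffusionPipeline

# Plain diffusers checkpoint: no compiled submodules found -> compiles.
pipe = RBLNStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# Directory produced by a previous export: artifacts found -> loads as-is.
pipe = RBLNStableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-5-rbln")
```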
@@ -228,8 +244,8 @@
             device=rbln_config.device,
             device_map=rbln_config.device_map,
             create_runtimes=rbln_config.create_runtimes,
-            optimize_host_mem=rbln_config.optimize_host_memory,
             activate_profiler=rbln_config.activate_profiler,
+            timeout=rbln_config.timeout,
         ):
             model = super().from_pretrained(pretrained_model_name_or_path=model_id, **kwargs)

@@ -395,12 +411,11 @@ class RBLNDiffusionMixin:
         # overwrite to replace incorrect config
         model.save_config(model_save_dir)

-
-
-
-
-
-        model.compiled_models.extend(submodule.compiled_models)
+        # Keep compiled_model objs to further analysis. -> TODO: remove soon...
+        model.compiled_models = []
+        for name in cls._submodules:
+            submodule = getattr(model, name)
+            model.compiled_models.extend(submodule.compiled_models)

         return model

--- a/optimum/rbln/diffusers/models/__init__.py
+++ b/optimum/rbln/diffusers/models/__init__.py
@@ -22,9 +22,11 @@ _import_structure = {
         "RBLNAutoencoderKL",
         "RBLNAutoencoderKLCosmos",
         "RBLNVQModel",
+        "RBLNAutoencoderKLTemporalDecoder",
     ],
     "unets": [
         "RBLNUNet2DConditionModel",
+        "RBLNUNetSpatioTemporalConditionModel",
     ],
     "controlnet": ["RBLNControlNetModel"],
     "transformers": [
@@ -38,6 +40,7 @@ if TYPE_CHECKING:
     from .autoencoders import (
         RBLNAutoencoderKL,
         RBLNAutoencoderKLCosmos,
+        RBLNAutoencoderKLTemporalDecoder,
         RBLNVQModel,
     )
     from .controlnet import RBLNControlNetModel
@@ -48,6 +51,7 @@ if TYPE_CHECKING:
     )
     from .unets import (
         RBLNUNet2DConditionModel,
+        RBLNUNetSpatioTemporalConditionModel,
     )
 else:
     import sys
--- a/optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py
+++ b/optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import TYPE_CHECKING, Dict, List, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union

 import rebel
 import torch
@@ -209,17 +209,46 @@ class RBLNAutoencoderKL(RBLNModel):
                 tensor_type="pt",
                 device=device_val,
                 activate_profiler=rbln_config.activate_profiler,
+                timeout=rbln_config.timeout,
             )
             for compiled_model, device_val in zip(compiled_models, device_vals)
         ]

-    def encode(
+    def encode(
+        self, x: torch.FloatTensor, return_dict: bool = True, **kwargs: Dict[str, Any]
+    ) -> Union[torch.FloatTensor, AutoencoderKLOutput]:
+        """
+        Encode an input image into a latent representation.
+
+        Args:
+            x: The input image to encode.
+            return_dict:
+                Whether to return output as a dictionary. Defaults to True.
+            kwargs: Additional arguments to pass to the encoder.
+
+        Returns:
+            The latent representation or AutoencoderKLOutput if return_dict=True
+        """
         posterior = self.encoder.encode(x)
         if not return_dict:
             return (posterior,)
         return AutoencoderKLOutput(latent_dist=posterior)

-    def decode(
+    def decode(
+        self, z: torch.FloatTensor, return_dict: bool = True, **kwargs: Dict[str, Any]
+    ) -> Union[torch.FloatTensor, DecoderOutput]:
+        """
+        Decode a latent representation into an image.
+
+        Args:
+            z: The latent representation to decode.
+            return_dict:
+                Whether to return output as a dictionary. Defaults to True.
+            kwargs: Additional arguments to pass to the decoder.
+
+        Returns:
+            The decoded image or DecoderOutput if return_dict=True
+        """
         dec = self.decoder.decode(z)
         if not return_dict:
             return (dec,)
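The newly documented `encode`/`decode` keep the diffusers `AutoencoderKL` calling convention: with `return_dict=True` they return `AutoencoderKLOutput` and `DecoderOutput` respectively. A small round-trip sketch; `vae` is assumed to be an already loaded `RBLNAutoencoderKL` instance and the tensor shape is illustrative:

```python
import torch

# Hedged sketch: `vae` (an RBLNAutoencoderKL) is assumed to exist already.
x = torch.randn(1, 3, 512, 512)        # illustrative image batch
posterior = vae.encode(x).latent_dist  # AutoencoderKLOutput.latent_dist
z = posterior.sample()                 # sample from the diagonal Gaussian
image = vae.decode(z).sample           # DecoderOutput.sample
```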
--- a/optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py
+++ b/optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import TYPE_CHECKING, Dict, List, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Union

 import rebel
 import torch
@@ -68,7 +68,7 @@ class RBLNAutoencoderKLCosmos(RBLNModel):
         self.image_size = self.rbln_config.image_size

     @classmethod
-    def
+    def _wrap_model_if_needed(
         cls, model: torch.nn.Module, rbln_config: RBLNAutoencoderKLCosmosConfig
     ) -> torch.nn.Module:
         decoder_model = _VAECosmosDecoder(model)
@@ -98,7 +98,7 @@ class RBLNAutoencoderKLCosmos(RBLNModel):

         compiled_models = {}
         if rbln_config.uses_encoder:
-            encoder_model, decoder_model = cls.
+            encoder_model, decoder_model = cls._wrap_model_if_needed(model, rbln_config)
             enc_compiled_model = cls.compile(
                 encoder_model,
                 rbln_compile_config=rbln_config.compile_cfgs[0],
@@ -107,7 +107,7 @@ class RBLNAutoencoderKLCosmos(RBLNModel):
             )
             compiled_models["encoder"] = enc_compiled_model
         else:
-            decoder_model = cls.
+            decoder_model = cls._wrap_model_if_needed(model, rbln_config)
             dec_compiled_model = cls.compile(
                 decoder_model,
                 rbln_compile_config=rbln_config.compile_cfgs[-1],
@@ -200,17 +200,43 @@ class RBLNAutoencoderKLCosmos(RBLNModel):
                 tensor_type="pt",
                 device=device_val,
                 activate_profiler=rbln_config.activate_profiler,
+                timeout=rbln_config.timeout,
             )
             for compiled_model, device_val in zip(compiled_models, device_vals)
         ]

-    def encode(
+    def encode(
+        self, x: torch.FloatTensor, return_dict: bool = True, **kwargs: Dict[str, Any]
+    ) -> Union[torch.FloatTensor, AutoencoderKLOutput]:
+        """
+        Encode an input video into a latent representation.
+
+        Args:
+            x: The input video to encode.
+            return_dict:
+                Whether to return output as a dictionary. Defaults to True.
+            kwargs: Additional arguments to pass to the encoder.
+
+        Returns:
+            The latent representation or AutoencoderKLOutput if return_dict=True
+        """
         posterior = self.encoder.encode(x)
         if not return_dict:
             return (posterior,)
         return AutoencoderKLOutput(latent_dist=posterior)

-    def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> torch.FloatTensor:
+    def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[torch.FloatTensor, DecoderOutput]:
+        """
+        Decode a latent representation into a video.
+
+        Args:
+            z: The latent representation to decode.
+            return_dict:
+                Whether to return output as a dictionary. Defaults to True.
+
+        Returns:
+            The decoded video or DecoderOutput if return_dict=True
+        """
         decoded = self.decoder.decode(z)

         if not return_dict: