optimum_rbln-0.7.4a4-py3-none-any.whl → optimum_rbln-0.7.4a5-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- optimum/rbln/__init__.py +156 -36
- optimum/rbln/__version__.py +1 -1
- optimum/rbln/configuration_utils.py +772 -0
- optimum/rbln/diffusers/__init__.py +56 -0
- optimum/rbln/diffusers/configurations/__init__.py +30 -0
- optimum/rbln/diffusers/configurations/models/__init__.py +6 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +66 -0
- optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +54 -0
- optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +44 -0
- optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +48 -0
- optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +66 -0
- optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +67 -0
- optimum/rbln/diffusers/configurations/pipelines/__init__.py +30 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +221 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +285 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +118 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +143 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +124 -0
- optimum/rbln/diffusers/modeling_diffusers.py +63 -122
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +109 -128
- optimum/rbln/diffusers/models/autoencoders/vae.py +4 -6
- optimum/rbln/diffusers/models/autoencoders/vq_model.py +84 -85
- optimum/rbln/diffusers/models/controlnet.py +55 -70
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +40 -77
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +43 -68
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +110 -113
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +3 -4
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +2 -1
- optimum/rbln/modeling.py +58 -39
- optimum/rbln/modeling_base.py +85 -75
- optimum/rbln/transformers/__init__.py +79 -8
- optimum/rbln/transformers/configuration_alias.py +49 -0
- optimum/rbln/transformers/configuration_generic.py +142 -0
- optimum/rbln/transformers/modeling_generic.py +193 -280
- optimum/rbln/transformers/models/__init__.py +96 -34
- optimum/rbln/transformers/models/auto/auto_factory.py +3 -3
- optimum/rbln/transformers/models/bart/__init__.py +1 -0
- optimum/rbln/transformers/models/bart/configuration_bart.py +24 -0
- optimum/rbln/transformers/models/bart/modeling_bart.py +10 -84
- optimum/rbln/transformers/models/bert/__init__.py +1 -0
- optimum/rbln/transformers/models/bert/configuration_bert.py +31 -0
- optimum/rbln/transformers/models/bert/modeling_bert.py +7 -80
- optimum/rbln/transformers/models/clip/__init__.py +6 -0
- optimum/rbln/transformers/models/clip/configuration_clip.py +79 -0
- optimum/rbln/transformers/models/clip/modeling_clip.py +72 -75
- optimum/rbln/transformers/models/decoderonly/__init__.py +1 -0
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +90 -0
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +50 -43
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +114 -141
- optimum/rbln/transformers/models/dpt/__init__.py +1 -0
- optimum/rbln/transformers/models/dpt/configuration_dpt.py +19 -0
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +3 -76
- optimum/rbln/transformers/models/exaone/__init__.py +1 -0
- optimum/rbln/transformers/models/exaone/configuration_exaone.py +19 -0
- optimum/rbln/transformers/models/gemma/__init__.py +1 -0
- optimum/rbln/transformers/models/gemma/configuration_gemma.py +19 -0
- optimum/rbln/transformers/models/gpt2/__init__.py +1 -0
- optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +19 -0
- optimum/rbln/transformers/models/llama/__init__.py +1 -0
- optimum/rbln/transformers/models/llama/configuration_llama.py +19 -0
- optimum/rbln/transformers/models/llava_next/__init__.py +1 -0
- optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +46 -0
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +12 -23
- optimum/rbln/transformers/models/midm/__init__.py +1 -0
- optimum/rbln/transformers/models/midm/configuration_midm.py +19 -0
- optimum/rbln/transformers/models/mistral/__init__.py +1 -0
- optimum/rbln/transformers/models/mistral/configuration_mistral.py +19 -0
- optimum/rbln/transformers/models/phi/__init__.py +1 -0
- optimum/rbln/transformers/models/phi/configuration_phi.py +19 -0
- optimum/rbln/transformers/models/qwen2/__init__.py +1 -0
- optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +19 -0
- optimum/rbln/transformers/models/seq2seq/__init__.py +1 -0
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq2.py +66 -0
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +80 -97
- optimum/rbln/transformers/models/t5/__init__.py +1 -0
- optimum/rbln/transformers/models/t5/configuration_t5.py +24 -0
- optimum/rbln/transformers/models/t5/modeling_t5.py +22 -150
- optimum/rbln/transformers/models/time_series_transformers/__init__.py +1 -0
- optimum/rbln/transformers/models/time_series_transformers/configuration_time_series_transformer.py +34 -0
- optimum/rbln/transformers/models/time_series_transformers/modeling_time_series_transformers.py +52 -54
- optimum/rbln/transformers/models/wav2vec2/__init__.py +1 -0
- optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec.py +19 -0
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +9 -72
- optimum/rbln/transformers/models/whisper/__init__.py +1 -0
- optimum/rbln/transformers/models/whisper/configuration_whisper.py +64 -0
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +57 -72
- optimum/rbln/transformers/models/xlm_roberta/__init__.py +1 -0
- optimum/rbln/transformers/models/xlm_roberta/configuration_xlm_roberta.py +19 -0
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +3 -83
- optimum/rbln/utils/submodule.py +26 -43
- {optimum_rbln-0.7.4a4.dist-info → optimum_rbln-0.7.4a5.dist-info}/METADATA +1 -1
- optimum_rbln-0.7.4a5.dist-info/RECORD +162 -0
- optimum/rbln/modeling_config.py +0 -310
- optimum_rbln-0.7.4a4.dist-info/RECORD +0 -126
- {optimum_rbln-0.7.4a4.dist-info → optimum_rbln-0.7.4a5.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.7.4a4.dist-info → optimum_rbln-0.7.4a5.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py (new file)
@@ -0,0 +1,221 @@

# Copyright 2025 Rebellions Inc. All rights reserved.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional, Tuple

from ....configuration_utils import RBLNModelConfig
from ....transformers import RBLNCLIPTextModelConfig, RBLNCLIPTextModelWithProjectionConfig
from ....utils.logging import get_logger
from ..models import RBLNAutoencoderKLConfig, RBLNControlNetModelConfig, RBLNUNet2DConditionModelConfig


logger = get_logger(__name__)


class _RBLNStableDiffusionControlNetPipelineBaseConfig(RBLNModelConfig):
    submodules = ["text_encoder", "unet", "vae", "controlnet"]
    _vae_uses_encoder = False

    def __init__(
        self,
        text_encoder: Optional[RBLNCLIPTextModelConfig] = None,
        unet: Optional[RBLNUNet2DConditionModelConfig] = None,
        vae: Optional[RBLNAutoencoderKLConfig] = None,
        controlnet: Optional[RBLNControlNetModelConfig] = None,
        *,
        batch_size: Optional[int] = None,
        img_height: Optional[int] = None,
        img_width: Optional[int] = None,
        sample_size: Optional[Tuple[int, int]] = None,
        image_size: Optional[Tuple[int, int]] = None,
        guidance_scale: Optional[float] = None,
        **kwargs,
    ):
        """
        Args:
            text_encoder (Optional[RBLNCLIPTextModelConfig]): Configuration for the text encoder component.
                Initialized as RBLNCLIPTextModelConfig if not provided.
            unet (Optional[RBLNUNet2DConditionModelConfig]): Configuration for the UNet model component.
                Initialized as RBLNUNet2DConditionModelConfig if not provided.
            vae (Optional[RBLNAutoencoderKLConfig]): Configuration for the VAE model component.
                Initialized as RBLNAutoencoderKLConfig if not provided.
            controlnet (Optional[RBLNControlNetModelConfig]): Configuration for the ControlNet model component.
                Initialized as RBLNControlNetModelConfig if not provided.
            batch_size (Optional[int]): Batch size for inference, applied to all submodules.
            img_height (Optional[int]): Height of the generated images.
            img_width (Optional[int]): Width of the generated images.
            sample_size (Optional[Tuple[int, int]]): Spatial dimensions for the UNet model.
            image_size (Optional[Tuple[int, int]]): Alternative way to specify image dimensions.
                Cannot be used together with img_height/img_width.
            guidance_scale (Optional[float]): Scale for classifier-free guidance. Deprecated parameter.
            **kwargs: Additional arguments passed to the parent RBLNModelConfig.

        Raises:
            ValueError: If both image_size and img_height/img_width are provided.

        Note:
            When guidance_scale > 1.0, the UNet batch size is automatically doubled to
            accommodate classifier-free guidance.
        """
        super().__init__(**kwargs)
        if image_size is not None and (img_height is not None or img_width is not None):
            raise ValueError("image_size and img_height/img_width cannot both be provided")

        if img_height is not None and img_width is not None:
            image_size = (img_height, img_width)

        self.text_encoder = self.init_submodule_config(RBLNCLIPTextModelConfig, text_encoder, batch_size=batch_size)
        self.unet = self.init_submodule_config(
            RBLNUNet2DConditionModelConfig,
            unet,
            batch_size=batch_size,
            sample_size=sample_size,
        )
        self.vae = self.init_submodule_config(
            RBLNAutoencoderKLConfig,
            vae,
            batch_size=batch_size,
            uses_encoder=self.__class__._vae_uses_encoder,
            sample_size=image_size,  # image size is equal to sample size in vae
        )
        self.controlnet = self.init_submodule_config(RBLNControlNetModelConfig, controlnet, batch_size=batch_size)

        if guidance_scale is not None:
            logger.warning("Specifying `guidance_scale` is deprecated. It will be removed in a future version.")
            do_classifier_free_guidance = guidance_scale > 1.0
            if do_classifier_free_guidance:
                self.unet.batch_size = self.text_encoder.batch_size * 2
                self.controlnet.batch_size = self.text_encoder.batch_size * 2

    @property
    def batch_size(self):
        return self.vae.batch_size

    @property
    def sample_size(self):
        return self.unet.sample_size

    @property
    def image_size(self):
        return self.vae.sample_size


class RBLNStableDiffusionControlNetPipelineConfig(_RBLNStableDiffusionControlNetPipelineBaseConfig):
    _vae_uses_encoder = False


class RBLNStableDiffusionControlNetImg2ImgPipelineConfig(_RBLNStableDiffusionControlNetPipelineBaseConfig):
    _vae_uses_encoder = True


class _RBLNStableDiffusionXLControlNetPipelineBaseConfig(RBLNModelConfig):
    submodules = ["text_encoder", "text_encoder_2", "unet", "vae", "controlnet"]
    _vae_uses_encoder = False

    def __init__(
        self,
        text_encoder: Optional[RBLNCLIPTextModelConfig] = None,
        text_encoder_2: Optional[RBLNCLIPTextModelWithProjectionConfig] = None,
        unet: Optional[RBLNUNet2DConditionModelConfig] = None,
        vae: Optional[RBLNAutoencoderKLConfig] = None,
        controlnet: Optional[RBLNControlNetModelConfig] = None,
        *,
        batch_size: Optional[int] = None,
        img_height: Optional[int] = None,
        img_width: Optional[int] = None,
        sample_size: Optional[Tuple[int, int]] = None,
        image_size: Optional[Tuple[int, int]] = None,
        guidance_scale: Optional[float] = None,
        **kwargs,
    ):
        """
        Args:
            text_encoder (Optional[RBLNCLIPTextModelConfig]): Configuration for the primary text encoder.
                Initialized as RBLNCLIPTextModelConfig if not provided.
            text_encoder_2 (Optional[RBLNCLIPTextModelWithProjectionConfig]): Configuration for the secondary text encoder.
                Initialized as RBLNCLIPTextModelWithProjectionConfig if not provided.
            unet (Optional[RBLNUNet2DConditionModelConfig]): Configuration for the UNet model component.
                Initialized as RBLNUNet2DConditionModelConfig if not provided.
            vae (Optional[RBLNAutoencoderKLConfig]): Configuration for the VAE model component.
                Initialized as RBLNAutoencoderKLConfig if not provided.
            controlnet (Optional[RBLNControlNetModelConfig]): Configuration for the ControlNet model component.
                Initialized as RBLNControlNetModelConfig if not provided.
            batch_size (Optional[int]): Batch size for inference, applied to all submodules.
            img_height (Optional[int]): Height of the generated images.
            img_width (Optional[int]): Width of the generated images.
            sample_size (Optional[Tuple[int, int]]): Spatial dimensions for the UNet model.
            image_size (Optional[Tuple[int, int]]): Alternative way to specify image dimensions.
                Cannot be used together with img_height/img_width.
            guidance_scale (Optional[float]): Scale for classifier-free guidance. Deprecated parameter.
            **kwargs: Additional arguments passed to the parent RBLNModelConfig.

        Raises:
            ValueError: If both image_size and img_height/img_width are provided.

        Note:
            When guidance_scale > 1.0, the UNet batch size is automatically doubled to
            accommodate classifier-free guidance.
        """
        super().__init__(**kwargs)
        if image_size is not None and (img_height is not None or img_width is not None):
            raise ValueError("image_size and img_height/img_width cannot both be provided")

        if img_height is not None and img_width is not None:
            image_size = (img_height, img_width)

        self.text_encoder = self.init_submodule_config(RBLNCLIPTextModelConfig, text_encoder, batch_size=batch_size)
        self.text_encoder_2 = self.init_submodule_config(
            RBLNCLIPTextModelWithProjectionConfig, text_encoder_2, batch_size=batch_size
        )
        self.unet = self.init_submodule_config(
            RBLNUNet2DConditionModelConfig,
            unet,
            batch_size=batch_size,
            sample_size=sample_size,
        )
        self.vae = self.init_submodule_config(
            RBLNAutoencoderKLConfig,
            vae,
            batch_size=batch_size,
            uses_encoder=self.__class__._vae_uses_encoder,
            sample_size=image_size,  # image size is equal to sample size in vae
        )
        self.controlnet = self.init_submodule_config(RBLNControlNetModelConfig, controlnet, batch_size=batch_size)

        if guidance_scale is not None:
            logger.warning("Specifying `guidance_scale` is deprecated. It will be removed in a future version.")
            do_classifier_free_guidance = guidance_scale > 1.0
            if do_classifier_free_guidance:
                self.unet.batch_size = self.text_encoder.batch_size * 2
                self.controlnet.batch_size = self.text_encoder.batch_size * 2

    @property
    def batch_size(self):
        return self.vae.batch_size

    @property
    def sample_size(self):
        return self.unet.sample_size

    @property
    def image_size(self):
        return self.vae.sample_size


class RBLNStableDiffusionXLControlNetPipelineConfig(_RBLNStableDiffusionXLControlNetPipelineBaseConfig):
    _vae_uses_encoder = False


class RBLNStableDiffusionXLControlNetImg2ImgPipelineConfig(_RBLNStableDiffusionXLControlNetPipelineBaseConfig):
    _vae_uses_encoder = True
optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py (new file)
@@ -0,0 +1,285 @@

# Copyright 2025 Rebellions Inc. All rights reserved.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional, Tuple

from ....configuration_utils import RBLNModelConfig
from ....transformers import RBLNCLIPTextModelWithProjectionConfig, RBLNCLIPVisionModelWithProjectionConfig
from ....utils.logging import get_logger
from ..models import RBLNUNet2DConditionModelConfig, RBLNVQModelConfig
from ..models.configuration_prior_transformer import RBLNPriorTransformerConfig


logger = get_logger(__name__)


class _RBLNKandinskyV22PipelineBaseConfig(RBLNModelConfig):
    submodules = ["unet", "movq"]
    _movq_uses_encoder = False

    def __init__(
        self,
        unet: Optional[RBLNUNet2DConditionModelConfig] = None,
        movq: Optional[RBLNVQModelConfig] = None,
        *,
        sample_size: Optional[Tuple[int, int]] = None,
        batch_size: Optional[int] = None,
        guidance_scale: Optional[float] = None,
        image_size: Optional[Tuple[int, int]] = None,
        img_height: Optional[int] = None,
        img_width: Optional[int] = None,
        **kwargs,
    ):
        """
        Args:
            unet (Optional[RBLNUNet2DConditionModelConfig]): Configuration for the UNet model component.
                Initialized as RBLNUNet2DConditionModelConfig if not provided.
            movq (Optional[RBLNVQModelConfig]): Configuration for the MoVQ (VQ-GAN) model component.
                Initialized as RBLNVQModelConfig if not provided.
            sample_size (Optional[Tuple[int, int]]): Spatial dimensions for the UNet model.
            batch_size (Optional[int]): Batch size for inference, applied to all submodules.
            guidance_scale (Optional[float]): Scale for classifier-free guidance. Deprecated parameter.
            image_size (Optional[Tuple[int, int]]): Dimensions for the generated images.
                Cannot be used together with img_height/img_width.
            img_height (Optional[int]): Height of the generated images.
            img_width (Optional[int]): Width of the generated images.
            **kwargs: Additional arguments passed to the parent RBLNModelConfig.

        Raises:
            ValueError: If both image_size and img_height/img_width are provided.

        Note:
            When guidance_scale > 1.0, the UNet batch size is automatically doubled to
            accommodate classifier-free guidance.
        """
        super().__init__(**kwargs)
        if image_size is not None and (img_height is not None or img_width is not None):
            raise ValueError("image_size and img_height/img_width cannot both be provided")

        if img_height is not None and img_width is not None:
            image_size = (img_height, img_width)

        self.unet = self.init_submodule_config(
            RBLNUNet2DConditionModelConfig, unet, batch_size=batch_size, sample_size=sample_size
        )
        self.movq = self.init_submodule_config(
            RBLNVQModelConfig,
            movq,
            batch_size=batch_size,
            sample_size=image_size,  # image size is equal to sample size in vae
            uses_encoder=self._movq_uses_encoder,
        )

        if guidance_scale is not None:
            logger.warning("Specifying `guidance_scale` is deprecated. It will be removed in a future version.")
            do_classifier_free_guidance = guidance_scale > 1.0
            if do_classifier_free_guidance:
                self.unet.batch_size = self.movq.batch_size * 2

    @property
    def batch_size(self):
        return self.movq.batch_size

    @property
    def image_size(self):
        return self.movq.sample_size


class RBLNKandinskyV22PipelineConfig(_RBLNKandinskyV22PipelineBaseConfig):
    _movq_uses_encoder = False


class RBLNKandinskyV22Img2ImgPipelineConfig(_RBLNKandinskyV22PipelineBaseConfig):
    _movq_uses_encoder = True


class RBLNKandinskyV22InpaintPipelineConfig(_RBLNKandinskyV22PipelineBaseConfig):
    _movq_uses_encoder = True


class RBLNKandinskyV22PriorPipelineConfig(RBLNModelConfig):
    submodules = ["text_encoder", "image_encoder", "prior"]

    def __init__(
        self,
        text_encoder: Optional[RBLNCLIPTextModelWithProjectionConfig] = None,
        image_encoder: Optional[RBLNCLIPVisionModelWithProjectionConfig] = None,
        prior: Optional[RBLNPriorTransformerConfig] = None,
        *,
        batch_size: Optional[int] = None,
        guidance_scale: Optional[float] = None,
        **kwargs,
    ):
        """
        Initialize a configuration for Kandinsky 2.2 prior pipeline optimized for RBLN NPU.

        This configuration sets up the prior components of the Kandinsky 2.2 architecture, which includes
        text and image encoders along with a prior transformer that maps text/image embeddings to
        latent representations used to condition the diffusion process.

        Args:
            text_encoder (Optional[RBLNCLIPTextModelWithProjectionConfig]): Configuration for the text encoder component.
                Initialized as RBLNCLIPTextModelWithProjectionConfig if not provided.
            image_encoder (Optional[RBLNCLIPVisionModelWithProjectionConfig]): Configuration for the image encoder component.
                Initialized as RBLNCLIPVisionModelWithProjectionConfig if not provided.
            prior (Optional[RBLNPriorTransformerConfig]): Configuration for the prior transformer component.
                Initialized as RBLNPriorTransformerConfig if not provided.
            batch_size (Optional[int]): Batch size for inference, applied to all submodules.
            guidance_scale (Optional[float]): Scale for classifier-free guidance. Deprecated parameter.
            **kwargs: Additional arguments passed to the parent RBLNModelConfig.

        Note:
            When guidance_scale > 1.0, the prior batch size is automatically doubled to
            accommodate classifier-free guidance.
        """
        super().__init__(**kwargs)
        self.text_encoder = self.init_submodule_config(
            RBLNCLIPTextModelWithProjectionConfig, text_encoder, batch_size=batch_size
        )
        self.image_encoder = self.init_submodule_config(
            RBLNCLIPVisionModelWithProjectionConfig, image_encoder, batch_size=batch_size
        )

        self.prior = self.init_submodule_config(RBLNPriorTransformerConfig, prior, batch_size=batch_size)

        if guidance_scale is not None:
            logger.warning("Specifying `guidance_scale` is deprecated. It will be removed in a future version.")
            do_classifier_free_guidance = guidance_scale > 1.0
            if do_classifier_free_guidance:
                self.prior.batch_size = self.text_encoder.batch_size * 2

    @property
    def batch_size(self):
        return self.text_encoder.batch_size

    @property
    def image_size(self):
        return self.image_encoder.image_size


class _RBLNKandinskyV22CombinedPipelineBaseConfig(RBLNModelConfig):
    submodules = ["prior_pipe", "decoder_pipe"]
    _decoder_pipe_cls = RBLNKandinskyV22PipelineConfig

    def __init__(
        self,
        prior_pipe: Optional[RBLNKandinskyV22PriorPipelineConfig] = None,
        decoder_pipe: Optional[RBLNKandinskyV22PipelineConfig] = None,
        *,
        sample_size: Optional[Tuple[int, int]] = None,
        image_size: Optional[Tuple[int, int]] = None,
        batch_size: Optional[int] = None,
        img_height: Optional[int] = None,
        img_width: Optional[int] = None,
        guidance_scale: Optional[float] = None,
        prior_prior: Optional[RBLNPriorTransformerConfig] = None,
        prior_image_encoder: Optional[RBLNCLIPVisionModelWithProjectionConfig] = None,
        prior_text_encoder: Optional[RBLNCLIPTextModelWithProjectionConfig] = None,
        unet: Optional[RBLNUNet2DConditionModelConfig] = None,
        movq: Optional[RBLNVQModelConfig] = None,
        **kwargs,
    ):
        """
        Initialize a configuration for combined Kandinsky 2.2 pipelines optimized for RBLN NPU.

        This configuration integrates both the prior and decoder components of Kandinsky 2.2 into
        a unified pipeline, allowing for end-to-end text-to-image generation in a single model.
        It combines the text/image encoding, prior mapping, and diffusion steps together.

        Args:
            prior_pipe (Optional[RBLNKandinskyV22PriorPipelineConfig]): Configuration for the prior pipeline.
                Initialized as RBLNKandinskyV22PriorPipelineConfig if not provided.
            decoder_pipe (Optional[RBLNKandinskyV22PipelineConfig]): Configuration for the decoder pipeline.
                Initialized as RBLNKandinskyV22PipelineConfig if not provided.
            sample_size (Optional[Tuple[int, int]]): Spatial dimensions for the UNet model.
            image_size (Optional[Tuple[int, int]]): Dimensions for the generated images.
                Cannot be used together with img_height/img_width.
            batch_size (Optional[int]): Batch size for inference, applied to all submodules.
            img_height (Optional[int]): Height of the generated images.
            img_width (Optional[int]): Width of the generated images.
            guidance_scale (Optional[float]): Scale for classifier-free guidance. Deprecated parameter.
            prior_prior (Optional[RBLNPriorTransformerConfig]): Direct configuration for the prior transformer.
                Used if prior_pipe is not provided.
            prior_image_encoder (Optional[RBLNCLIPVisionModelWithProjectionConfig]): Direct configuration for the image encoder.
                Used if prior_pipe is not provided.
            prior_text_encoder (Optional[RBLNCLIPTextModelWithProjectionConfig]): Direct configuration for the text encoder.
                Used if prior_pipe is not provided.
            unet (Optional[RBLNUNet2DConditionModelConfig]): Direct configuration for the UNet.
                Used if decoder_pipe is not provided.
            movq (Optional[RBLNVQModelConfig]): Direct configuration for the MoVQ (VQ-GAN) model.
                Used if decoder_pipe is not provided.
            **kwargs: Additional arguments passed to the parent RBLNModelConfig.
        """
        super().__init__(**kwargs)
        self.prior_pipe = self.init_submodule_config(
            RBLNKandinskyV22PriorPipelineConfig,
            prior_pipe,
            prior=prior_prior,
            image_encoder=prior_image_encoder,
            text_encoder=prior_text_encoder,
            batch_size=batch_size,
            guidance_scale=guidance_scale,
        )
        self.decoder_pipe = self.init_submodule_config(
            self._decoder_pipe_cls,
            decoder_pipe,
            unet=unet,
            movq=movq,
            batch_size=batch_size,
            sample_size=sample_size,
            image_size=image_size,
            img_height=img_height,
            img_width=img_width,
            guidance_scale=guidance_scale,
        )

    @property
    def batch_size(self):
        return self.prior_pipe.batch_size

    @property
    def image_size(self):
        return self.prior_pipe.image_size

    @property
    def prior_prior(self):
        return self.prior_pipe.prior

    @property
    def prior_image_encoder(self):
        return self.prior_pipe.image_encoder

    @property
    def prior_text_encoder(self):
        return self.prior_pipe.text_encoder

    @property
    def unet(self):
        return self.decoder_pipe.unet

    @property
    def movq(self):
        return self.decoder_pipe.movq


class RBLNKandinskyV22CombinedPipelineConfig(_RBLNKandinskyV22CombinedPipelineBaseConfig):
    _decoder_pipe_cls = RBLNKandinskyV22PipelineConfig


class RBLNKandinskyV22InpaintCombinedPipelineConfig(_RBLNKandinskyV22CombinedPipelineBaseConfig):
    _decoder_pipe_cls = RBLNKandinskyV22InpaintPipelineConfig


class RBLNKandinskyV22Img2ImgCombinedPipelineConfig(_RBLNKandinskyV22CombinedPipelineBaseConfig):
    _decoder_pipe_cls = RBLNKandinskyV22Img2ImgPipelineConfig
optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py (new file)
@@ -0,0 +1,118 @@

# Copyright 2025 Rebellions Inc. All rights reserved.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional, Tuple

from ....configuration_utils import RBLNModelConfig
from ....transformers import RBLNCLIPTextModelConfig
from ....utils.logging import get_logger
from ..models import RBLNAutoencoderKLConfig, RBLNUNet2DConditionModelConfig


logger = get_logger(__name__)


class _RBLNStableDiffusionPipelineBaseConfig(RBLNModelConfig):
    submodules = ["text_encoder", "unet", "vae"]
    _vae_uses_encoder = False

    def __init__(
        self,
        text_encoder: Optional[RBLNCLIPTextModelConfig] = None,
        unet: Optional[RBLNUNet2DConditionModelConfig] = None,
        vae: Optional[RBLNAutoencoderKLConfig] = None,
        *,
        batch_size: Optional[int] = None,
        img_height: Optional[int] = None,
        img_width: Optional[int] = None,
        sample_size: Optional[Tuple[int, int]] = None,
        image_size: Optional[Tuple[int, int]] = None,
        guidance_scale: Optional[float] = None,
        **kwargs,
    ):
        """
        Args:
            text_encoder (Optional[RBLNCLIPTextModelConfig]): Configuration for the text encoder component.
                Initialized as RBLNCLIPTextModelConfig if not provided.
            unet (Optional[RBLNUNet2DConditionModelConfig]): Configuration for the UNet model component.
                Initialized as RBLNUNet2DConditionModelConfig if not provided.
            vae (Optional[RBLNAutoencoderKLConfig]): Configuration for the VAE model component.
                Initialized as RBLNAutoencoderKLConfig if not provided.
            batch_size (Optional[int]): Batch size for inference, applied to all submodules.
            img_height (Optional[int]): Height of the generated images.
            img_width (Optional[int]): Width of the generated images.
            sample_size (Optional[Tuple[int, int]]): Spatial dimensions for the UNet model.
            image_size (Optional[Tuple[int, int]]): Alternative way to specify image dimensions.
                Cannot be used together with img_height/img_width.
            guidance_scale (Optional[float]): Scale for classifier-free guidance. Deprecated parameter.
            **kwargs: Additional arguments passed to the parent RBLNModelConfig.

        Raises:
            ValueError: If both image_size and img_height/img_width are provided.

        Note:
            When guidance_scale > 1.0, the UNet batch size is automatically doubled to
            accommodate classifier-free guidance.
        """
        super().__init__(**kwargs)
        if image_size is not None and (img_height is not None or img_width is not None):
            raise ValueError("image_size and img_height/img_width cannot both be provided")

        if img_height is not None and img_width is not None:
            image_size = (img_height, img_width)

        self.text_encoder = self.init_submodule_config(RBLNCLIPTextModelConfig, text_encoder, batch_size=batch_size)
        self.unet = self.init_submodule_config(
            RBLNUNet2DConditionModelConfig,
            unet,
            batch_size=batch_size,
            sample_size=sample_size,
        )
        self.vae = self.init_submodule_config(
            RBLNAutoencoderKLConfig,
            vae,
            batch_size=batch_size,
            uses_encoder=self.__class__._vae_uses_encoder,
            sample_size=image_size,  # image size is equal to sample size in vae
        )

        if guidance_scale is not None:
            logger.warning("Specifying `guidance_scale` is deprecated. It will be removed in a future version.")
            do_classifier_free_guidance = guidance_scale > 1.0
            if do_classifier_free_guidance:
                self.unet.batch_size = self.text_encoder.batch_size * 2

    @property
    def batch_size(self):
        return self.vae.batch_size

    @property
    def sample_size(self):
        return self.unet.sample_size

    @property
    def image_size(self):
        return self.vae.sample_size


class RBLNStableDiffusionPipelineConfig(_RBLNStableDiffusionPipelineBaseConfig):
    _vae_uses_encoder = False


class RBLNStableDiffusionImg2ImgPipelineConfig(_RBLNStableDiffusionPipelineBaseConfig):
    _vae_uses_encoder = True


class RBLNStableDiffusionInpaintPipelineConfig(_RBLNStableDiffusionPipelineBaseConfig):
    _vae_uses_encoder = True