optimum-rbln 0.7.3.post1__py3-none-any.whl → 0.7.4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (133)
  1. optimum/rbln/__init__.py +173 -35
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +816 -0
  4. optimum/rbln/diffusers/__init__.py +56 -0
  5. optimum/rbln/diffusers/configurations/__init__.py +30 -0
  6. optimum/rbln/diffusers/configurations/models/__init__.py +6 -0
  7. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +66 -0
  8. optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +62 -0
  9. optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +52 -0
  10. optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +56 -0
  11. optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +74 -0
  12. optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +67 -0
  13. optimum/rbln/diffusers/configurations/pipelines/__init__.py +30 -0
  14. optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +236 -0
  15. optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +289 -0
  16. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +118 -0
  17. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +143 -0
  18. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +124 -0
  19. optimum/rbln/diffusers/modeling_diffusers.py +111 -137
  20. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +109 -128
  21. optimum/rbln/diffusers/models/autoencoders/vae.py +4 -6
  22. optimum/rbln/diffusers/models/autoencoders/vq_model.py +84 -85
  23. optimum/rbln/diffusers/models/controlnet.py +56 -71
  24. optimum/rbln/diffusers/models/transformers/prior_transformer.py +40 -77
  25. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +44 -69
  26. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +111 -114
  27. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +3 -4
  28. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +2 -0
  29. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +2 -0
  30. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +2 -0
  31. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +2 -0
  32. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +2 -0
  33. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -0
  34. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +2 -0
  35. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py +2 -0
  36. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +2 -0
  37. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -1
  38. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +2 -0
  39. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +2 -0
  40. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +2 -0
  41. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +2 -0
  42. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +2 -0
  43. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +2 -0
  44. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +2 -0
  45. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +2 -0
  46. optimum/rbln/modeling.py +66 -40
  47. optimum/rbln/modeling_base.py +111 -86
  48. optimum/rbln/ops/__init__.py +4 -7
  49. optimum/rbln/ops/attn.py +271 -205
  50. optimum/rbln/ops/flash_attn.py +161 -67
  51. optimum/rbln/ops/kv_cache_update.py +4 -40
  52. optimum/rbln/ops/linear.py +25 -0
  53. optimum/rbln/transformers/__init__.py +97 -8
  54. optimum/rbln/transformers/configuration_alias.py +49 -0
  55. optimum/rbln/transformers/configuration_generic.py +142 -0
  56. optimum/rbln/transformers/modeling_generic.py +193 -280
  57. optimum/rbln/transformers/models/__init__.py +120 -32
  58. optimum/rbln/transformers/models/auto/auto_factory.py +6 -6
  59. optimum/rbln/transformers/models/bart/__init__.py +2 -0
  60. optimum/rbln/transformers/models/bart/configuration_bart.py +24 -0
  61. optimum/rbln/transformers/models/bart/modeling_bart.py +11 -86
  62. optimum/rbln/transformers/models/bert/__init__.py +1 -0
  63. optimum/rbln/transformers/models/bert/configuration_bert.py +31 -0
  64. optimum/rbln/transformers/models/bert/modeling_bert.py +7 -80
  65. optimum/rbln/transformers/models/clip/__init__.py +6 -0
  66. optimum/rbln/transformers/models/clip/configuration_clip.py +79 -0
  67. optimum/rbln/transformers/models/clip/modeling_clip.py +72 -75
  68. optimum/rbln/transformers/models/decoderonly/__init__.py +11 -0
  69. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +90 -0
  70. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +197 -178
  71. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +343 -249
  72. optimum/rbln/transformers/models/dpt/__init__.py +1 -0
  73. optimum/rbln/transformers/models/dpt/configuration_dpt.py +19 -0
  74. optimum/rbln/transformers/models/dpt/modeling_dpt.py +3 -76
  75. optimum/rbln/transformers/models/exaone/__init__.py +1 -0
  76. optimum/rbln/transformers/models/exaone/configuration_exaone.py +19 -0
  77. optimum/rbln/transformers/models/gemma/__init__.py +1 -0
  78. optimum/rbln/transformers/models/gemma/configuration_gemma.py +19 -0
  79. optimum/rbln/transformers/models/gpt2/__init__.py +1 -0
  80. optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +19 -0
  81. optimum/rbln/transformers/models/idefics3/__init__.py +16 -0
  82. optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +51 -0
  83. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +459 -0
  84. optimum/rbln/transformers/models/llama/__init__.py +1 -0
  85. optimum/rbln/transformers/models/llama/configuration_llama.py +19 -0
  86. optimum/rbln/transformers/models/llava_next/__init__.py +1 -0
  87. optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +46 -0
  88. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +18 -23
  89. optimum/rbln/transformers/models/midm/__init__.py +1 -0
  90. optimum/rbln/transformers/models/midm/configuration_midm.py +19 -0
  91. optimum/rbln/transformers/models/mistral/__init__.py +1 -0
  92. optimum/rbln/transformers/models/mistral/configuration_mistral.py +19 -0
  93. optimum/rbln/transformers/models/phi/__init__.py +1 -0
  94. optimum/rbln/transformers/models/phi/configuration_phi.py +19 -0
  95. optimum/rbln/transformers/models/qwen2/__init__.py +1 -0
  96. optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +19 -0
  97. optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +19 -0
  98. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +68 -0
  99. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +608 -0
  100. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +214 -0
  101. optimum/rbln/transformers/models/seq2seq/__init__.py +1 -0
  102. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq2.py +66 -0
  103. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +99 -118
  104. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +12 -21
  105. optimum/rbln/transformers/models/t5/__init__.py +2 -0
  106. optimum/rbln/transformers/models/t5/configuration_t5.py +24 -0
  107. optimum/rbln/transformers/models/t5/modeling_t5.py +23 -151
  108. optimum/rbln/transformers/models/t5/t5_architecture.py +10 -5
  109. optimum/rbln/transformers/models/time_series_transformers/__init__.py +26 -0
  110. optimum/rbln/transformers/models/time_series_transformers/configuration_time_series_transformer.py +34 -0
  111. optimum/rbln/transformers/models/time_series_transformers/modeling_time_series_transformers.py +420 -0
  112. optimum/rbln/transformers/models/time_series_transformers/time_series_transformers_architecture.py +331 -0
  113. optimum/rbln/transformers/models/wav2vec2/__init__.py +1 -0
  114. optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec.py +19 -0
  115. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +9 -72
  116. optimum/rbln/transformers/models/whisper/__init__.py +2 -0
  117. optimum/rbln/transformers/models/whisper/configuration_whisper.py +64 -0
  118. optimum/rbln/transformers/models/whisper/modeling_whisper.py +135 -100
  119. optimum/rbln/transformers/models/whisper/whisper_architecture.py +73 -40
  120. optimum/rbln/transformers/models/xlm_roberta/__init__.py +1 -0
  121. optimum/rbln/transformers/models/xlm_roberta/configuration_xlm_roberta.py +19 -0
  122. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +3 -83
  123. optimum/rbln/utils/hub.py +2 -2
  124. optimum/rbln/utils/import_utils.py +23 -6
  125. optimum/rbln/utils/model_utils.py +4 -4
  126. optimum/rbln/utils/runtime_utils.py +33 -2
  127. optimum/rbln/utils/submodule.py +36 -44
  128. {optimum_rbln-0.7.3.post1.dist-info → optimum_rbln-0.7.4.dist-info}/METADATA +6 -6
  129. optimum_rbln-0.7.4.dist-info/RECORD +169 -0
  130. optimum/rbln/modeling_config.py +0 -310
  131. optimum_rbln-0.7.3.post1.dist-info/RECORD +0 -122
  132. {optimum_rbln-0.7.3.post1.dist-info → optimum_rbln-0.7.4.dist-info}/WHEEL +0 -0
  133. {optimum_rbln-0.7.3.post1.dist-info → optimum_rbln-0.7.4.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py
@@ -0,0 +1,236 @@
+ # Copyright 2025 Rebellions Inc. All rights reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at:
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Optional, Tuple
+
+ from ....configuration_utils import RBLNModelConfig
+ from ....transformers import RBLNCLIPTextModelConfig, RBLNCLIPTextModelWithProjectionConfig
+ from ..models import RBLNAutoencoderKLConfig, RBLNControlNetModelConfig, RBLNUNet2DConditionModelConfig
+
+
+ class _RBLNStableDiffusionControlNetPipelineBaseConfig(RBLNModelConfig):
+     submodules = ["text_encoder", "unet", "vae", "controlnet"]
+     _vae_uses_encoder = False
+
+     def __init__(
+         self,
+         text_encoder: Optional[RBLNCLIPTextModelConfig] = None,
+         unet: Optional[RBLNUNet2DConditionModelConfig] = None,
+         vae: Optional[RBLNAutoencoderKLConfig] = None,
+         controlnet: Optional[RBLNControlNetModelConfig] = None,
+         *,
+         batch_size: Optional[int] = None,
+         img_height: Optional[int] = None,
+         img_width: Optional[int] = None,
+         sample_size: Optional[Tuple[int, int]] = None,
+         image_size: Optional[Tuple[int, int]] = None,
+         guidance_scale: Optional[float] = None,
+         **kwargs,
+     ):
+         """
+         Args:
+             text_encoder (Optional[RBLNCLIPTextModelConfig]): Configuration for the text encoder component.
+                 Initialized as RBLNCLIPTextModelConfig if not provided.
+             unet (Optional[RBLNUNet2DConditionModelConfig]): Configuration for the UNet model component.
+                 Initialized as RBLNUNet2DConditionModelConfig if not provided.
+             vae (Optional[RBLNAutoencoderKLConfig]): Configuration for the VAE model component.
+                 Initialized as RBLNAutoencoderKLConfig if not provided.
+             controlnet (Optional[RBLNControlNetModelConfig]): Configuration for the ControlNet model component.
+                 Initialized as RBLNControlNetModelConfig if not provided.
+             batch_size (Optional[int]): Batch size for inference, applied to all submodules.
+             img_height (Optional[int]): Height of the generated images.
+             img_width (Optional[int]): Width of the generated images.
+             sample_size (Optional[Tuple[int, int]]): Spatial dimensions for the UNet model.
+             image_size (Optional[Tuple[int, int]]): Alternative way to specify image dimensions.
+                 Cannot be used together with img_height/img_width.
+             guidance_scale (Optional[float]): Scale for classifier-free guidance.
+             **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+
+         Raises:
+             ValueError: If both image_size and img_height/img_width are provided.
+
+         Note:
+             When guidance_scale > 1.0, the UNet and ControlNet batch sizes are automatically
+             doubled to accommodate classifier-free guidance.
+         """
+         super().__init__(**kwargs)
+         if image_size is not None and (img_height is not None or img_width is not None):
+             raise ValueError("image_size and img_height/img_width cannot both be provided")
+
+         if img_height is not None and img_width is not None:
+             image_size = (img_height, img_width)
+
+         self.text_encoder = self.init_submodule_config(RBLNCLIPTextModelConfig, text_encoder, batch_size=batch_size)
+         self.unet = self.init_submodule_config(
+             RBLNUNet2DConditionModelConfig,
+             unet,
+             sample_size=sample_size,
+         )
+         self.vae = self.init_submodule_config(
+             RBLNAutoencoderKLConfig,
+             vae,
+             batch_size=batch_size,
+             uses_encoder=self.__class__._vae_uses_encoder,
+             sample_size=image_size,  # image size is equal to sample size in vae
+         )
+         self.controlnet = self.init_submodule_config(RBLNControlNetModelConfig, controlnet)
+
+         # Get default guidance scale from original class to set UNet and ControlNet batch size
+         if guidance_scale is None:
+             guidance_scale = self.get_default_values_for_original_cls("__call__", ["guidance_scale"])["guidance_scale"]
+
+         if guidance_scale is not None:
+             do_classifier_free_guidance = guidance_scale > 1.0
+             if do_classifier_free_guidance:
+                 if not self.unet.batch_size_is_specified:
+                     self.unet.batch_size = self.text_encoder.batch_size * 2
+                 if not self.controlnet.batch_size_is_specified:
+                     self.controlnet.batch_size = self.text_encoder.batch_size * 2
+             else:
+                 if not self.unet.batch_size_is_specified:
+                     self.unet.batch_size = self.text_encoder.batch_size
+                 if not self.controlnet.batch_size_is_specified:
+                     self.controlnet.batch_size = self.text_encoder.batch_size
+
+     @property
+     def batch_size(self):
+         return self.vae.batch_size
+
+     @property
+     def sample_size(self):
+         return self.unet.sample_size
+
+     @property
+     def image_size(self):
+         return self.vae.sample_size
+
+
+ class RBLNStableDiffusionControlNetPipelineConfig(_RBLNStableDiffusionControlNetPipelineBaseConfig):
+     _vae_uses_encoder = False
+
+
+ class RBLNStableDiffusionControlNetImg2ImgPipelineConfig(_RBLNStableDiffusionControlNetPipelineBaseConfig):
+     _vae_uses_encoder = True
+
+
+ class _RBLNStableDiffusionXLControlNetPipelineBaseConfig(RBLNModelConfig):
+     submodules = ["text_encoder", "text_encoder_2", "unet", "vae", "controlnet"]
+     _vae_uses_encoder = False
+
+     def __init__(
+         self,
+         text_encoder: Optional[RBLNCLIPTextModelConfig] = None,
+         text_encoder_2: Optional[RBLNCLIPTextModelWithProjectionConfig] = None,
+         unet: Optional[RBLNUNet2DConditionModelConfig] = None,
+         vae: Optional[RBLNAutoencoderKLConfig] = None,
+         controlnet: Optional[RBLNControlNetModelConfig] = None,
+         *,
+         batch_size: Optional[int] = None,
+         img_height: Optional[int] = None,
+         img_width: Optional[int] = None,
+         sample_size: Optional[Tuple[int, int]] = None,
+         image_size: Optional[Tuple[int, int]] = None,
+         guidance_scale: Optional[float] = None,
+         **kwargs,
+     ):
+         """
+         Args:
+             text_encoder (Optional[RBLNCLIPTextModelConfig]): Configuration for the primary text encoder.
+                 Initialized as RBLNCLIPTextModelConfig if not provided.
+             text_encoder_2 (Optional[RBLNCLIPTextModelWithProjectionConfig]): Configuration for the secondary text encoder.
+                 Initialized as RBLNCLIPTextModelWithProjectionConfig if not provided.
+             unet (Optional[RBLNUNet2DConditionModelConfig]): Configuration for the UNet model component.
+                 Initialized as RBLNUNet2DConditionModelConfig if not provided.
+             vae (Optional[RBLNAutoencoderKLConfig]): Configuration for the VAE model component.
+                 Initialized as RBLNAutoencoderKLConfig if not provided.
+             controlnet (Optional[RBLNControlNetModelConfig]): Configuration for the ControlNet model component.
+                 Initialized as RBLNControlNetModelConfig if not provided.
+             batch_size (Optional[int]): Batch size for inference, applied to all submodules.
+             img_height (Optional[int]): Height of the generated images.
+             img_width (Optional[int]): Width of the generated images.
+             sample_size (Optional[Tuple[int, int]]): Spatial dimensions for the UNet model.
+             image_size (Optional[Tuple[int, int]]): Alternative way to specify image dimensions.
+                 Cannot be used together with img_height/img_width.
+             guidance_scale (Optional[float]): Scale for classifier-free guidance.
+             **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+
+         Raises:
+             ValueError: If both image_size and img_height/img_width are provided.
+
+         Note:
+             When guidance_scale > 1.0, the UNet and ControlNet batch sizes are automatically
+             doubled to accommodate classifier-free guidance.
+         """
+         super().__init__(**kwargs)
+         if image_size is not None and (img_height is not None or img_width is not None):
+             raise ValueError("image_size and img_height/img_width cannot both be provided")
+
+         if img_height is not None and img_width is not None:
+             image_size = (img_height, img_width)
+
+         self.text_encoder = self.init_submodule_config(RBLNCLIPTextModelConfig, text_encoder, batch_size=batch_size)
+         self.text_encoder_2 = self.init_submodule_config(
+             RBLNCLIPTextModelWithProjectionConfig, text_encoder_2, batch_size=batch_size
+         )
+         self.unet = self.init_submodule_config(
+             RBLNUNet2DConditionModelConfig,
+             unet,
+             sample_size=sample_size,
+         )
+         self.vae = self.init_submodule_config(
+             RBLNAutoencoderKLConfig,
+             vae,
+             batch_size=batch_size,
+             uses_encoder=self.__class__._vae_uses_encoder,
+             sample_size=image_size,  # image size is equal to sample size in vae
+         )
+         self.controlnet = self.init_submodule_config(RBLNControlNetModelConfig, controlnet)
+
+         # Get default guidance scale from original class to set UNet and ControlNet batch size
+         guidance_scale = (
+             guidance_scale
+             or self.get_default_values_for_original_cls("__call__", ["guidance_scale"])["guidance_scale"]
+         )
+
+         do_classifier_free_guidance = guidance_scale > 1.0
+         if do_classifier_free_guidance:
+             if not self.unet.batch_size_is_specified:
+                 self.unet.batch_size = self.text_encoder.batch_size * 2
+             if not self.controlnet.batch_size_is_specified:
+                 self.controlnet.batch_size = self.text_encoder.batch_size * 2
+         else:
+             if not self.unet.batch_size_is_specified:
+                 self.unet.batch_size = self.text_encoder.batch_size
+             if not self.controlnet.batch_size_is_specified:
+                 self.controlnet.batch_size = self.text_encoder.batch_size
+
+     @property
+     def batch_size(self):
+         return self.vae.batch_size
+
+     @property
+     def sample_size(self):
+         return self.unet.sample_size
+
+     @property
+     def image_size(self):
+         return self.vae.sample_size
+
+
+ class RBLNStableDiffusionXLControlNetPipelineConfig(_RBLNStableDiffusionXLControlNetPipelineBaseConfig):
+     _vae_uses_encoder = False
+
+
+ class RBLNStableDiffusionXLControlNetImg2ImgPipelineConfig(_RBLNStableDiffusionXLControlNetPipelineBaseConfig):
+     _vae_uses_encoder = True
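For orientation, here is a minimal usage sketch of the ControlNet pipeline configs added above. This example is not part of the diff: it assumes optimum-rbln 0.7.4 is installed and imports the classes from the module path shown in the file list.

from optimum.rbln.diffusers.configurations.pipelines.configuration_controlnet import (
    RBLNStableDiffusionControlNetPipelineConfig,
)

# With guidance_scale > 1.0 and no explicit UNet/ControlNet batch size,
# both are doubled relative to the text encoder batch size for
# classifier-free guidance, per the constructor logic in this hunk.
config = RBLNStableDiffusionControlNetPipelineConfig(batch_size=1, guidance_scale=7.5)
assert config.unet.batch_size == 2
assert config.controlnet.batch_size == 2
assert config.batch_size == 1  # top-level batch size is reported from the VAE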
optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py
@@ -0,0 +1,289 @@
+ # Copyright 2025 Rebellions Inc. All rights reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at:
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Optional, Tuple
+
+ from ....configuration_utils import RBLNModelConfig
+ from ....transformers import RBLNCLIPTextModelWithProjectionConfig, RBLNCLIPVisionModelWithProjectionConfig
+ from ..models import RBLNUNet2DConditionModelConfig, RBLNVQModelConfig
+ from ..models.configuration_prior_transformer import RBLNPriorTransformerConfig
+
+
+ class _RBLNKandinskyV22PipelineBaseConfig(RBLNModelConfig):
+     submodules = ["unet", "movq"]
+     _movq_uses_encoder = False
+
+     def __init__(
+         self,
+         unet: Optional[RBLNUNet2DConditionModelConfig] = None,
+         movq: Optional[RBLNVQModelConfig] = None,
+         *,
+         sample_size: Optional[Tuple[int, int]] = None,
+         batch_size: Optional[int] = None,
+         guidance_scale: Optional[float] = None,
+         image_size: Optional[Tuple[int, int]] = None,
+         img_height: Optional[int] = None,
+         img_width: Optional[int] = None,
+         **kwargs,
+     ):
+         """
+         Args:
+             unet (Optional[RBLNUNet2DConditionModelConfig]): Configuration for the UNet model component.
+                 Initialized as RBLNUNet2DConditionModelConfig if not provided.
+             movq (Optional[RBLNVQModelConfig]): Configuration for the MoVQ (VQ-GAN) model component.
+                 Initialized as RBLNVQModelConfig if not provided.
+             sample_size (Optional[Tuple[int, int]]): Spatial dimensions for the UNet model.
+             batch_size (Optional[int]): Batch size for inference, applied to all submodules.
+             guidance_scale (Optional[float]): Scale for classifier-free guidance.
+             image_size (Optional[Tuple[int, int]]): Dimensions for the generated images.
+                 Cannot be used together with img_height/img_width.
+             img_height (Optional[int]): Height of the generated images.
+             img_width (Optional[int]): Width of the generated images.
+             **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+
+         Raises:
+             ValueError: If both image_size and img_height/img_width are provided.
+
+         Note:
+             When guidance_scale > 1.0, the UNet batch size is automatically doubled to
+             accommodate classifier-free guidance.
+         """
+         super().__init__(**kwargs)
+         if image_size is not None and (img_height is not None or img_width is not None):
+             raise ValueError("image_size and img_height/img_width cannot both be provided")
+
+         if img_height is not None and img_width is not None:
+             image_size = (img_height, img_width)
+
+         self.unet = self.init_submodule_config(RBLNUNet2DConditionModelConfig, unet, sample_size=sample_size)
+         self.movq = self.init_submodule_config(
+             RBLNVQModelConfig,
+             movq,
+             batch_size=batch_size,
+             sample_size=image_size,  # image size is equal to sample size in vae
+             uses_encoder=self._movq_uses_encoder,
+         )
+
+         # Get default guidance scale from original class to set UNet batch size
+         if guidance_scale is None:
+             guidance_scale = self.get_default_values_for_original_cls("__call__", ["guidance_scale"])["guidance_scale"]
+
+         if not self.unet.batch_size_is_specified:
+             do_classifier_free_guidance = guidance_scale > 1.0
+             if do_classifier_free_guidance:
+                 self.unet.batch_size = self.movq.batch_size * 2
+             else:
+                 self.unet.batch_size = self.movq.batch_size
+
+     @property
+     def batch_size(self):
+         return self.movq.batch_size
+
+     @property
+     def image_size(self):
+         return self.movq.sample_size
+
+
+ class RBLNKandinskyV22PipelineConfig(_RBLNKandinskyV22PipelineBaseConfig):
+     _movq_uses_encoder = False
+
+
+ class RBLNKandinskyV22Img2ImgPipelineConfig(_RBLNKandinskyV22PipelineBaseConfig):
+     _movq_uses_encoder = True
+
+
+ class RBLNKandinskyV22InpaintPipelineConfig(_RBLNKandinskyV22PipelineBaseConfig):
+     _movq_uses_encoder = True
+
+
+ class RBLNKandinskyV22PriorPipelineConfig(RBLNModelConfig):
+     submodules = ["text_encoder", "image_encoder", "prior"]
+
+     def __init__(
+         self,
+         text_encoder: Optional[RBLNCLIPTextModelWithProjectionConfig] = None,
+         image_encoder: Optional[RBLNCLIPVisionModelWithProjectionConfig] = None,
+         prior: Optional[RBLNPriorTransformerConfig] = None,
+         *,
+         batch_size: Optional[int] = None,
+         guidance_scale: Optional[float] = None,
+         **kwargs,
+     ):
+         """
+         Initialize a configuration for the Kandinsky 2.2 prior pipeline optimized for RBLN NPU.
+
+         This configuration sets up the prior components of the Kandinsky 2.2 architecture, which includes
+         text and image encoders along with a prior transformer that maps text/image embeddings to
+         latent representations used to condition the diffusion process.
+
+         Args:
+             text_encoder (Optional[RBLNCLIPTextModelWithProjectionConfig]): Configuration for the text encoder component.
+                 Initialized as RBLNCLIPTextModelWithProjectionConfig if not provided.
+             image_encoder (Optional[RBLNCLIPVisionModelWithProjectionConfig]): Configuration for the image encoder component.
+                 Initialized as RBLNCLIPVisionModelWithProjectionConfig if not provided.
+             prior (Optional[RBLNPriorTransformerConfig]): Configuration for the prior transformer component.
+                 Initialized as RBLNPriorTransformerConfig if not provided.
+             batch_size (Optional[int]): Batch size for inference, applied to all submodules.
+             guidance_scale (Optional[float]): Scale for classifier-free guidance.
+             **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+
+         Note:
+             When guidance_scale > 1.0, the prior batch size is automatically doubled to
+             accommodate classifier-free guidance.
+         """
+         super().__init__(**kwargs)
+         self.text_encoder = self.init_submodule_config(
+             RBLNCLIPTextModelWithProjectionConfig, text_encoder, batch_size=batch_size
+         )
+         self.image_encoder = self.init_submodule_config(
+             RBLNCLIPVisionModelWithProjectionConfig, image_encoder, batch_size=batch_size
+         )
+
+         self.prior = self.init_submodule_config(RBLNPriorTransformerConfig, prior)
+
+         # Get default guidance scale from original class to set prior batch size
+         if guidance_scale is None:
+             guidance_scale = self.get_default_values_for_original_cls("__call__", ["guidance_scale"])["guidance_scale"]
+
+         if not self.prior.batch_size_is_specified:
+             do_classifier_free_guidance = guidance_scale > 1.0
+             if do_classifier_free_guidance:
+                 self.prior.batch_size = self.text_encoder.batch_size * 2
+             else:
+                 self.prior.batch_size = self.text_encoder.batch_size
+
+     @property
+     def batch_size(self):
+         return self.text_encoder.batch_size
+
+     @property
+     def image_size(self):
+         return self.image_encoder.image_size
+
+
+ class _RBLNKandinskyV22CombinedPipelineBaseConfig(RBLNModelConfig):
+     submodules = ["prior_pipe", "decoder_pipe"]
+     _decoder_pipe_cls = RBLNKandinskyV22PipelineConfig
+
+     def __init__(
+         self,
+         prior_pipe: Optional[RBLNKandinskyV22PriorPipelineConfig] = None,
+         decoder_pipe: Optional[RBLNKandinskyV22PipelineConfig] = None,
+         *,
+         sample_size: Optional[Tuple[int, int]] = None,
+         image_size: Optional[Tuple[int, int]] = None,
+         batch_size: Optional[int] = None,
+         img_height: Optional[int] = None,
+         img_width: Optional[int] = None,
+         guidance_scale: Optional[float] = None,
+         prior_prior: Optional[RBLNPriorTransformerConfig] = None,
+         prior_image_encoder: Optional[RBLNCLIPVisionModelWithProjectionConfig] = None,
+         prior_text_encoder: Optional[RBLNCLIPTextModelWithProjectionConfig] = None,
+         unet: Optional[RBLNUNet2DConditionModelConfig] = None,
+         movq: Optional[RBLNVQModelConfig] = None,
+         **kwargs,
+     ):
+         """
+         Initialize a configuration for combined Kandinsky 2.2 pipelines optimized for RBLN NPU.
+
+         This configuration integrates both the prior and decoder components of Kandinsky 2.2 into
+         a unified pipeline, allowing for end-to-end text-to-image generation in a single model.
+         It combines the text/image encoding, prior mapping, and diffusion steps together.
+
+         Args:
+             prior_pipe (Optional[RBLNKandinskyV22PriorPipelineConfig]): Configuration for the prior pipeline.
+                 Initialized as RBLNKandinskyV22PriorPipelineConfig if not provided.
+             decoder_pipe (Optional[RBLNKandinskyV22PipelineConfig]): Configuration for the decoder pipeline.
+                 Initialized as RBLNKandinskyV22PipelineConfig if not provided.
+             sample_size (Optional[Tuple[int, int]]): Spatial dimensions for the UNet model.
+             image_size (Optional[Tuple[int, int]]): Dimensions for the generated images.
+                 Cannot be used together with img_height/img_width.
+             batch_size (Optional[int]): Batch size for inference, applied to all submodules.
+             img_height (Optional[int]): Height of the generated images.
+             img_width (Optional[int]): Width of the generated images.
+             guidance_scale (Optional[float]): Scale for classifier-free guidance.
+             prior_prior (Optional[RBLNPriorTransformerConfig]): Direct configuration for the prior transformer.
+                 Used if prior_pipe is not provided.
+             prior_image_encoder (Optional[RBLNCLIPVisionModelWithProjectionConfig]): Direct configuration for the image encoder.
+                 Used if prior_pipe is not provided.
+             prior_text_encoder (Optional[RBLNCLIPTextModelWithProjectionConfig]): Direct configuration for the text encoder.
+                 Used if prior_pipe is not provided.
+             unet (Optional[RBLNUNet2DConditionModelConfig]): Direct configuration for the UNet.
+                 Used if decoder_pipe is not provided.
+             movq (Optional[RBLNVQModelConfig]): Direct configuration for the MoVQ (VQ-GAN) model.
+                 Used if decoder_pipe is not provided.
+             **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+         """
+         super().__init__(**kwargs)
+         self.prior_pipe = self.init_submodule_config(
+             RBLNKandinskyV22PriorPipelineConfig,
+             prior_pipe,
+             prior=prior_prior,
+             image_encoder=prior_image_encoder,
+             text_encoder=prior_text_encoder,
+             batch_size=batch_size,
+             guidance_scale=guidance_scale,
+         )
+         self.decoder_pipe = self.init_submodule_config(
+             self._decoder_pipe_cls,
+             decoder_pipe,
+             unet=unet,
+             movq=movq,
+             batch_size=batch_size,
+             sample_size=sample_size,
+             image_size=image_size,
+             img_height=img_height,
+             img_width=img_width,
+             guidance_scale=guidance_scale,
+         )
+
+     @property
+     def batch_size(self):
+         return self.prior_pipe.batch_size
+
+     @property
+     def image_size(self):
+         return self.prior_pipe.image_size
+
+     @property
+     def prior_prior(self):
+         return self.prior_pipe.prior
+
+     @property
+     def prior_image_encoder(self):
+         return self.prior_pipe.image_encoder
+
+     @property
+     def prior_text_encoder(self):
+         return self.prior_pipe.text_encoder
+
+     @property
+     def unet(self):
+         return self.decoder_pipe.unet
+
+     @property
+     def movq(self):
+         return self.decoder_pipe.movq
+
+
+ class RBLNKandinskyV22CombinedPipelineConfig(_RBLNKandinskyV22CombinedPipelineBaseConfig):
+     _decoder_pipe_cls = RBLNKandinskyV22PipelineConfig
+
+
+ class RBLNKandinskyV22InpaintCombinedPipelineConfig(_RBLNKandinskyV22CombinedPipelineBaseConfig):
+     _decoder_pipe_cls = RBLNKandinskyV22InpaintPipelineConfig
+
+
+ class RBLNKandinskyV22Img2ImgCombinedPipelineConfig(_RBLNKandinskyV22CombinedPipelineBaseConfig):
+     _decoder_pipe_cls = RBLNKandinskyV22Img2ImgPipelineConfig
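A comparable sketch for the combined Kandinsky 2.2 config defined above, showing how the flattened prior_* and decoder-level keyword arguments are routed into the two nested sub-pipeline configs. Again an illustrative example, not part of the diff, assuming an optimum-rbln 0.7.4 install.

from optimum.rbln.diffusers.configurations.pipelines.configuration_kandinsky2_2 import (
    RBLNKandinskyV22CombinedPipelineConfig,
)

# batch_size, image dimensions, and guidance_scale fan out to both
# the prior pipeline and the decoder pipeline configs.
config = RBLNKandinskyV22CombinedPipelineConfig(
    batch_size=1,
    img_height=768,
    img_width=768,
    guidance_scale=4.0,
)
# Top-level properties proxy the nested sub-pipeline configs.
assert config.movq is config.decoder_pipe.movq
assert config.prior_prior is config.prior_pipe.prior
assert config.unet.batch_size == 2  # MoVQ batch size doubled under CFG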
optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py
@@ -0,0 +1,118 @@
+ # Copyright 2025 Rebellions Inc. All rights reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at:
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Optional, Tuple
+
+ from ....configuration_utils import RBLNModelConfig
+ from ....transformers import RBLNCLIPTextModelConfig
+ from ..models import RBLNAutoencoderKLConfig, RBLNUNet2DConditionModelConfig
+
+
+ class _RBLNStableDiffusionPipelineBaseConfig(RBLNModelConfig):
+     submodules = ["text_encoder", "unet", "vae"]
+     _vae_uses_encoder = False
+
+     def __init__(
+         self,
+         text_encoder: Optional[RBLNCLIPTextModelConfig] = None,
+         unet: Optional[RBLNUNet2DConditionModelConfig] = None,
+         vae: Optional[RBLNAutoencoderKLConfig] = None,
+         *,
+         batch_size: Optional[int] = None,
+         img_height: Optional[int] = None,
+         img_width: Optional[int] = None,
+         sample_size: Optional[Tuple[int, int]] = None,
+         image_size: Optional[Tuple[int, int]] = None,
+         guidance_scale: Optional[float] = None,
+         **kwargs,
+     ):
+         """
+         Args:
+             text_encoder (Optional[RBLNCLIPTextModelConfig]): Configuration for the text encoder component.
+                 Initialized as RBLNCLIPTextModelConfig if not provided.
+             unet (Optional[RBLNUNet2DConditionModelConfig]): Configuration for the UNet model component.
+                 Initialized as RBLNUNet2DConditionModelConfig if not provided.
+             vae (Optional[RBLNAutoencoderKLConfig]): Configuration for the VAE model component.
+                 Initialized as RBLNAutoencoderKLConfig if not provided.
+             batch_size (Optional[int]): Batch size for inference, applied to all submodules.
+             img_height (Optional[int]): Height of the generated images.
+             img_width (Optional[int]): Width of the generated images.
+             sample_size (Optional[Tuple[int, int]]): Spatial dimensions for the UNet model.
+             image_size (Optional[Tuple[int, int]]): Alternative way to specify image dimensions.
+                 Cannot be used together with img_height/img_width.
+             guidance_scale (Optional[float]): Scale for classifier-free guidance.
+             **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+
+         Raises:
+             ValueError: If both image_size and img_height/img_width are provided.
+
+         Note:
+             When guidance_scale > 1.0, the UNet batch size is automatically doubled to
+             accommodate classifier-free guidance.
+         """
+         super().__init__(**kwargs)
+         if image_size is not None and (img_height is not None or img_width is not None):
+             raise ValueError("image_size and img_height/img_width cannot both be provided")
+
+         if img_height is not None and img_width is not None:
+             image_size = (img_height, img_width)
+
+         self.text_encoder = self.init_submodule_config(RBLNCLIPTextModelConfig, text_encoder, batch_size=batch_size)
+         self.unet = self.init_submodule_config(
+             RBLNUNet2DConditionModelConfig,
+             unet,
+             sample_size=sample_size,
+         )
+         self.vae = self.init_submodule_config(
+             RBLNAutoencoderKLConfig,
+             vae,
+             batch_size=batch_size,
+             uses_encoder=self.__class__._vae_uses_encoder,
+             sample_size=image_size,  # image size is equal to sample size in vae
+         )
+
+         # Get default guidance scale from original class to set UNet batch size
+         if guidance_scale is None:
+             guidance_scale = self.get_default_values_for_original_cls("__call__", ["guidance_scale"])["guidance_scale"]
+
+         if not self.unet.batch_size_is_specified:
+             do_classifier_free_guidance = guidance_scale > 1.0
+             if do_classifier_free_guidance:
+                 self.unet.batch_size = self.text_encoder.batch_size * 2
+             else:
+                 self.unet.batch_size = self.text_encoder.batch_size
+
+     @property
+     def batch_size(self):
+         return self.vae.batch_size
+
+     @property
+     def sample_size(self):
+         return self.unet.sample_size
+
+     @property
+     def image_size(self):
+         return self.vae.sample_size
+
+
+ class RBLNStableDiffusionPipelineConfig(_RBLNStableDiffusionPipelineBaseConfig):
+     _vae_uses_encoder = False
+
+
+ class RBLNStableDiffusionImg2ImgPipelineConfig(_RBLNStableDiffusionPipelineBaseConfig):
+     _vae_uses_encoder = True
+
+
+ class RBLNStableDiffusionInpaintPipelineConfig(_RBLNStableDiffusionPipelineBaseConfig):
+     _vae_uses_encoder = True
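Finally, a sketch for the plain Stable Diffusion configs above, illustrating the img_height/img_width handling: the pair is folded into the VAE's sample_size, and passing image_size together with it raises a ValueError. As before, this is an illustrative example assuming optimum-rbln 0.7.4; the expected values follow the constructor logic shown in this hunk.

from optimum.rbln.diffusers.configurations.pipelines.configuration_stable_diffusion import (
    RBLNStableDiffusionPipelineConfig,
)

config = RBLNStableDiffusionPipelineConfig(
    batch_size=1,
    img_height=512,
    img_width=512,
    guidance_scale=7.5,
)
print(config.image_size)       # expected (512, 512): the VAE sample size
print(config.unet.batch_size)  # expected 2: doubled for classifier-free guidance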