optimum-rbln 0.9.3__py3-none-any.whl → 0.9.3rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +0 -12
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +2 -4
- optimum/rbln/diffusers/__init__.py +0 -12
- optimum/rbln/diffusers/configurations/__init__.py +0 -3
- optimum/rbln/diffusers/configurations/models/__init__.py +0 -2
- optimum/rbln/diffusers/configurations/pipelines/__init__.py +0 -3
- optimum/rbln/diffusers/models/__init__.py +3 -17
- optimum/rbln/diffusers/models/autoencoders/__init__.py +0 -1
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
- optimum/rbln/diffusers/models/autoencoders/vae.py +8 -27
- optimum/rbln/diffusers/models/controlnet.py +1 -16
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +2 -16
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +1 -16
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +1 -14
- optimum/rbln/diffusers/models/unets/__init__.py +0 -1
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +1 -17
- optimum/rbln/diffusers/pipelines/__init__.py +0 -4
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +0 -20
- optimum/rbln/modeling.py +45 -20
- optimum/rbln/modeling_base.py +1 -0
- optimum/rbln/transformers/configuration_generic.py +27 -0
- optimum/rbln/transformers/modeling_attention_utils.py +109 -242
- optimum/rbln/transformers/modeling_generic.py +61 -2
- optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +2 -28
- optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +5 -68
- optimum/rbln/transformers/models/bart/modeling_bart.py +2 -23
- optimum/rbln/transformers/models/bert/modeling_bert.py +1 -86
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +15 -42
- optimum/rbln/transformers/models/clip/modeling_clip.py +2 -40
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +44 -5
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +1 -6
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +2 -6
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +9 -17
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +12 -36
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +0 -17
- optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +0 -24
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +0 -17
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +5 -3
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +8 -24
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +5 -3
- optimum/rbln/transformers/models/llava/modeling_llava.py +24 -36
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +4 -2
- optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +1 -13
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +3 -2
- optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +3 -2
- optimum/rbln/transformers/models/resnet/configuration_resnet.py +0 -17
- optimum/rbln/transformers/models/resnet/modeling_resnet.py +0 -73
- optimum/rbln/transformers/models/roberta/modeling_roberta.py +0 -33
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +4 -2
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +10 -34
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +1 -17
- optimum/rbln/transformers/models/swin/modeling_swin.py +1 -14
- optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +2 -16
- optimum/rbln/transformers/models/vit/modeling_vit.py +0 -19
- optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +3 -15
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +8 -60
- optimum/rbln/transformers/models/whisper/generation_whisper.py +14 -48
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +0 -43
- optimum/rbln/transformers/utils/rbln_quantization.py +0 -9
- optimum/rbln/utils/depreacate_utils.py +16 -0
- optimum/rbln/utils/hub.py +3 -14
- optimum/rbln/utils/runtime_utils.py +0 -32
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/METADATA +2 -2
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/RECORD +72 -79
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/WHEEL +1 -1
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +0 -67
- optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +0 -59
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +0 -114
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +0 -275
- optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +0 -201
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +0 -15
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +0 -46
- optimum/rbln/utils/deprecation.py +0 -213
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/entry_points.txt +0 -0
- {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
@@ -1,275 +0,0 @@
-# Copyright 2025 Rebellions Inc. All rights reserved.
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at:
-
-# http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import TYPE_CHECKING, Dict, List, Tuple, Union
-
-import rebel
-import torch  # noqa: I001
-from diffusers import AutoencoderKLTemporalDecoder
-from diffusers.models.autoencoders.vae import DecoderOutput
-from diffusers.models.modeling_outputs import AutoencoderKLOutput
-from transformers import PretrainedConfig
-
-from ....configuration_utils import RBLNCompileConfig
-from ....modeling import RBLNModel
-from ....utils.logging import get_logger
-from ...configurations import RBLNAutoencoderKLTemporalDecoderConfig
-from ...modeling_diffusers import RBLNDiffusionMixin
-from .vae import (
-    DiagonalGaussianDistribution,
-    RBLNRuntimeVAEDecoder,
-    RBLNRuntimeVAEEncoder,
-    _VAEEncoder,
-    _VAETemporalDecoder,
-)
-
-
-if TYPE_CHECKING:
-    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PretrainedConfig, PreTrainedModel
-
-    from ...modeling_diffusers import RBLNDiffusionMixin, RBLNDiffusionMixinConfig
-
-logger = get_logger(__name__)
-
-
-class RBLNAutoencoderKLTemporalDecoder(RBLNModel):
-    auto_model_class = AutoencoderKLTemporalDecoder
-    hf_library_name = "diffusers"
-    _rbln_config_class = RBLNAutoencoderKLTemporalDecoderConfig
-
-    def __post_init__(self, **kwargs):
-        super().__post_init__(**kwargs)
-
-        if self.rbln_config.uses_encoder:
-            self.encoder = RBLNRuntimeVAEEncoder(runtime=self.model[0], main_input_name="x")
-        self.decoder = RBLNRuntimeVAEDecoder(runtime=self.model[-1], main_input_name="z")
-        self.image_size = self.rbln_config.image_size
-
-    @classmethod
-    def _wrap_model_if_needed(
-        cls, model: torch.nn.Module, rbln_config: RBLNAutoencoderKLTemporalDecoderConfig
-    ) -> torch.nn.Module:
-        decoder_model = _VAETemporalDecoder(model)
-        decoder_model.num_frames = rbln_config.decode_chunk_size
-        decoder_model.eval()
-
-        if rbln_config.uses_encoder:
-            encoder_model = _VAEEncoder(model)
-            encoder_model.eval()
-            return encoder_model, decoder_model
-        else:
-            return decoder_model
-
-    @classmethod
-    def get_compiled_model(
-        cls, model, rbln_config: RBLNAutoencoderKLTemporalDecoderConfig
-    ) -> Dict[str, rebel.RBLNCompiledModel]:
-        compiled_models = {}
-        if rbln_config.uses_encoder:
-            encoder_model, decoder_model = cls._wrap_model_if_needed(model, rbln_config)
-            enc_compiled_model = cls.compile(
-                encoder_model,
-                rbln_compile_config=rbln_config.compile_cfgs[0],
-                create_runtimes=rbln_config.create_runtimes,
-                device=rbln_config.device_map["encoder"],
-            )
-            compiled_models["encoder"] = enc_compiled_model
-        else:
-            decoder_model = cls._wrap_model_if_needed(model, rbln_config)
-        dec_compiled_model = cls.compile(
-            decoder_model,
-            rbln_compile_config=rbln_config.compile_cfgs[-1],
-            create_runtimes=rbln_config.create_runtimes,
-            device=rbln_config.device_map["decoder"],
-        )
-        compiled_models["decoder"] = dec_compiled_model
-
-        return compiled_models
-
-    @classmethod
-    def get_vae_sample_size(
-        cls,
-        pipe: "RBLNDiffusionMixin",
-        rbln_config: RBLNAutoencoderKLTemporalDecoderConfig,
-        return_vae_scale_factor: bool = False,
-    ) -> Tuple[int, int]:
-        sample_size = rbln_config.sample_size
-        if hasattr(pipe, "vae_scale_factor"):
-            vae_scale_factor = pipe.vae_scale_factor
-        else:
-            if hasattr(pipe.vae.config, "block_out_channels"):
-                vae_scale_factor = 2 ** (len(pipe.vae.config.block_out_channels) - 1)
-            else:
-                vae_scale_factor = 8  # vae image processor default value 8 (int)
-
-        if sample_size is None:
-            sample_size = pipe.unet.config.sample_size
-            if isinstance(sample_size, int):
-                sample_size = (sample_size, sample_size)
-            sample_size = (sample_size[0] * vae_scale_factor, sample_size[1] * vae_scale_factor)
-
-        if return_vae_scale_factor:
-            return sample_size, vae_scale_factor
-        else:
-            return sample_size
-
-    @classmethod
-    def update_rbln_config_using_pipe(
-        cls, pipe: "RBLNDiffusionMixin", rbln_config: "RBLNDiffusionMixinConfig", submodule_name: str
-    ) -> "RBLNDiffusionMixinConfig":
-        rbln_config.vae.sample_size, rbln_config.vae.vae_scale_factor = cls.get_vae_sample_size(
-            pipe, rbln_config.vae, return_vae_scale_factor=True
-        )
-
-        if rbln_config.vae.num_frames is None:
-            if hasattr(pipe.unet.config, "num_frames"):
-                rbln_config.vae.num_frames = pipe.unet.config.num_frames
-            else:
-                raise ValueError("num_frames should be specified in unet config.json")
-
-        if rbln_config.vae.decode_chunk_size is None:
-            rbln_config.vae.decode_chunk_size = rbln_config.vae.num_frames
-
-        def chunk_frame(num_frames, decode_chunk_size):
-            # get closest divisor to num_frames
-            divisors = [i for i in range(1, num_frames) if num_frames % i == 0]
-            closest = min(divisors, key=lambda x: abs(x - decode_chunk_size))
-            if decode_chunk_size != closest:
-                logger.warning(
-                    f"To ensure successful model compilation and prevent device OOM, {decode_chunk_size} is set to {closest}."
-                )
-            return closest
-
-        decode_chunk_size = chunk_frame(rbln_config.vae.num_frames, rbln_config.vae.decode_chunk_size)
-        rbln_config.vae.decode_chunk_size = decode_chunk_size
-        return rbln_config
-
-    @classmethod
-    def _update_rbln_config(
-        cls,
-        preprocessors: Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"],
-        model: "PreTrainedModel",
-        model_config: "PretrainedConfig",
-        rbln_config: RBLNAutoencoderKLTemporalDecoderConfig,
-    ) -> RBLNAutoencoderKLTemporalDecoderConfig:
-        if rbln_config.sample_size is None:
-            rbln_config.sample_size = model_config.sample_size
-
-        if rbln_config.vae_scale_factor is None:
-            if hasattr(model_config, "block_out_channels"):
-                rbln_config.vae_scale_factor = 2 ** (len(model_config.block_out_channels) - 1)
-            else:
-                # vae image processor default value 8 (int)
-                rbln_config.vae_scale_factor = 8
-
-        compile_cfgs = []
-        if rbln_config.uses_encoder:
-            vae_enc_input_info = [
-                (
-                    "x",
-                    [
-                        rbln_config.batch_size,
-                        model_config.in_channels,
-                        rbln_config.sample_size[0],
-                        rbln_config.sample_size[1],
-                    ],
-                    "float32",
-                )
-            ]
-            compile_cfgs.append(RBLNCompileConfig(compiled_model_name="encoder", input_info=vae_enc_input_info))
-
-        decode_batch_size = rbln_config.batch_size * rbln_config.decode_chunk_size
-        vae_dec_input_info = [
-            (
-                "z",
-                [
-                    decode_batch_size,
-                    model_config.latent_channels,
-                    rbln_config.latent_sample_size[0],
-                    rbln_config.latent_sample_size[1],
-                ],
-                "float32",
-            )
-        ]
-        compile_cfgs.append(RBLNCompileConfig(compiled_model_name="decoder", input_info=vae_dec_input_info))
-
-        rbln_config.set_compile_cfgs(compile_cfgs)
-        return rbln_config
-
-    @classmethod
-    def _create_runtimes(
-        cls,
-        compiled_models: List[rebel.RBLNCompiledModel],
-        rbln_config: RBLNAutoencoderKLTemporalDecoderConfig,
-    ) -> List[rebel.Runtime]:
-        if len(compiled_models) == 1:
-            # decoder
-            expected_models = ["decoder"]
-        else:
-            expected_models = ["encoder", "decoder"]
-
-        if any(model_name not in rbln_config.device_map for model_name in expected_models):
-            cls._raise_missing_compiled_file_error(expected_models)
-
-        device_vals = [rbln_config.device_map[model_name] for model_name in expected_models]
-        return [
-            rebel.Runtime(
-                compiled_model,
-                tensor_type="pt",
-                device=device_val,
-                activate_profiler=rbln_config.activate_profiler,
-                timeout=rbln_config.timeout,
-            )
-            for compiled_model, device_val in zip(compiled_models, device_vals)
-        ]
-
-    def encode(
-        self, x: torch.FloatTensor, return_dict: bool = True
-    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
-        """
-        Encode an input image into a latent representation.
-
-        Args:
-            x: The input image to encode.
-            return_dict:
-                Whether to return output as a dictionary. Defaults to True.
-
-        Returns:
-            The latent representation or AutoencoderKLOutput if return_dict=True
-        """
-        posterior = self.encoder.encode(x)
-
-        if not return_dict:
-            return (posterior,)
-
-        return AutoencoderKLOutput(latent_dist=posterior)
-
-    def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> torch.FloatTensor:
-        """
-        Decode a latent representation into a video.
-
-        Args:
-            z: The latent representation to decode.
-            return_dict:
-                Whether to return output as a dictionary. Defaults to True.
-
-        Returns:
-            The decoded video or DecoderOutput if return_dict=True
-        """
-        decoded = self.decoder.decode(z)
-
-        if not return_dict:
-            return (decoded,)
-
-        return DecoderOutput(sample=decoded)
optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py
@@ -1,201 +0,0 @@
-# Copyright 2025 Rebellions Inc. All rights reserved.
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at:
-
-# http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
-
-import torch
-from diffusers.models.unets.unet_spatio_temporal_condition import (
-    UNetSpatioTemporalConditionModel,
-    UNetSpatioTemporalConditionOutput,
-)
-from transformers import PretrainedConfig
-
-from ....configuration_utils import RBLNCompileConfig
-from ....modeling import RBLNModel
-from ....utils.logging import get_logger
-from ...configurations import RBLNUNetSpatioTemporalConditionModelConfig
-from ...modeling_diffusers import RBLNDiffusionMixin, RBLNDiffusionMixinConfig
-
-
-if TYPE_CHECKING:
-    from transformers import AutoFeatureExtractor, AutoProcessor, PreTrainedModel
-
-logger = get_logger(__name__)
-
-
-class _UNet_STCM(torch.nn.Module):
-    def __init__(self, unet: "UNetSpatioTemporalConditionModel"):
-        super().__init__()
-        self.unet = unet
-
-    def forward(
-        self,
-        sample: torch.Tensor,
-        timestep: Union[torch.Tensor, float, int],
-        encoder_hidden_states: torch.Tensor,
-        added_time_ids: torch.Tensor,
-    ) -> torch.Tensor:
-        unet_out = self.unet(
-            sample=sample,
-            timestep=timestep,
-            encoder_hidden_states=encoder_hidden_states,
-            added_time_ids=added_time_ids,
-            return_dict=False,
-        )
-        return unet_out
-
-
-class RBLNUNetSpatioTemporalConditionModel(RBLNModel):
-    hf_library_name = "diffusers"
-    auto_model_class = UNetSpatioTemporalConditionModel
-    _rbln_config_class = RBLNUNetSpatioTemporalConditionModelConfig
-    output_class = UNetSpatioTemporalConditionOutput
-    output_key = "sample"
-
-    def __post_init__(self, **kwargs):
-        super().__post_init__(**kwargs)
-        self.in_features = self.rbln_config.in_features
-        if self.in_features is not None:
-
-            @dataclass
-            class LINEAR1:
-                in_features: int
-
-            @dataclass
-            class ADDEMBEDDING:
-                linear_1: LINEAR1
-
-            self.add_embedding = ADDEMBEDDING(LINEAR1(self.in_features))
-
-    @classmethod
-    def _wrap_model_if_needed(
-        cls, model: torch.nn.Module, rbln_config: RBLNUNetSpatioTemporalConditionModelConfig
-    ) -> torch.nn.Module:
-        return _UNet_STCM(model).eval()
-
-    @classmethod
-    def get_unet_sample_size(
-        cls,
-        pipe: RBLNDiffusionMixin,
-        rbln_config: RBLNUNetSpatioTemporalConditionModelConfig,
-        image_size: Optional[Tuple[int, int]] = None,
-    ) -> Union[int, Tuple[int, int]]:
-        scale_factor = pipe.vae_scale_factor
-
-        if image_size is None:
-            vae_sample_size = pipe.vae.config.sample_size
-            if isinstance(vae_sample_size, int):
-                vae_sample_size = (vae_sample_size, vae_sample_size)
-
-            sample_size = (
-                vae_sample_size[0] // scale_factor,
-                vae_sample_size[1] // scale_factor,
-            )
-        else:
-            sample_size = (image_size[0] // scale_factor, image_size[1] // scale_factor)
-        return sample_size
-
-    @classmethod
-    def update_rbln_config_using_pipe(
-        cls, pipe: RBLNDiffusionMixin, rbln_config: "RBLNDiffusionMixinConfig", submodule_name: str
-    ) -> Dict[str, Any]:
-        rbln_config.unet.sample_size = cls.get_unet_sample_size(
-            pipe, rbln_config.unet, image_size=rbln_config.image_size
-        )
-        return rbln_config
-
-    @classmethod
-    def _update_rbln_config(
-        cls,
-        preprocessors: Union["AutoFeatureExtractor", "AutoProcessor"],
-        model: "PreTrainedModel",
-        model_config: "PretrainedConfig",
-        rbln_config: RBLNUNetSpatioTemporalConditionModelConfig,
-    ) -> RBLNUNetSpatioTemporalConditionModelConfig:
-        if rbln_config.num_frames is None:
-            rbln_config.num_frames = model_config.num_frames
-
-        if rbln_config.sample_size is None:
-            rbln_config.sample_size = model_config.sample_size
-
-        input_info = [
-            (
-                "sample",
-                [
-                    rbln_config.batch_size,
-                    rbln_config.num_frames,
-                    model_config.in_channels,
-                    rbln_config.sample_size[0],
-                    rbln_config.sample_size[1],
-                ],
-                "float32",
-            ),
-            ("timestep", [], "float32"),
-            ("encoder_hidden_states", [rbln_config.batch_size, 1, model_config.cross_attention_dim], "float32"),
-            ("added_time_ids", [rbln_config.batch_size, 3], "float32"),
-        ]
-
-        if hasattr(model_config, "addition_time_embed_dim"):
-            rbln_config.in_features = model_config.projection_class_embeddings_input_dim
-
-        rbln_compile_config = RBLNCompileConfig(input_info=input_info)
-        rbln_config.set_compile_cfgs([rbln_compile_config])
-
-        return rbln_config
-
-    @property
-    def compiled_batch_size(self):
-        return self.rbln_config.compile_cfgs[0].input_info[0][1][0]
-
-    def forward(
-        self,
-        sample: torch.Tensor,
-        timestep: Union[torch.Tensor, float, int],
-        encoder_hidden_states: torch.Tensor,
-        added_time_ids: torch.Tensor,
-        return_dict: bool = True,
-        **kwargs,
-    ) -> Union[UNetSpatioTemporalConditionOutput, Tuple]:
-        """
-        Forward pass for the RBLN-optimized UNetSpatioTemporalConditionModel.
-
-        Args:
-            sample (torch.Tensor): The noisy input tensor with the following shape `(batch, channel, height, width)`.
-            timestep (Union[torch.Tensor, float, int]): The number of timesteps to denoise an input.
-            encoder_hidden_states (torch.Tensor): The encoder hidden states.
-            added_time_ids (torch.Tensor): A tensor containing additional sinusoidal embeddings and added to the time embeddings.
-            return_dict (bool): Whether or not to return a [`~diffusers.models.unets.unet_spatio_temporal_condition.UNetSpatioTemporalConditionOutput`] instead of a plain tuple.
-
-        Returns:
-            (Union[`~diffusers.models.unets.unet_spatio_temporal_condition.UNetSpatioTemporalConditionOutput`], Tuple)
-        """
-        sample_batch_size = sample.size()[0]
-        compiled_batch_size = self.compiled_batch_size
-        if sample_batch_size != compiled_batch_size and (
-            sample_batch_size * 2 == compiled_batch_size or sample_batch_size == compiled_batch_size * 2
-        ):
-            raise ValueError(
-                f"Mismatch between UNet's runtime batch size ({sample_batch_size}) and compiled batch size ({compiled_batch_size}). "
-                "This may be caused by the 'guidance scale' parameter, which doubles the runtime batch size in Stable Diffusion. "
-                "Adjust the batch size during compilation or modify the 'guidance scale' to match the compiled batch size.\n\n"
-                "For details, see: https://docs.rbln.ai/software/optimum/model_api.html#stable-diffusion"
-            )
-        return super().forward(
-            sample.contiguous(),
-            timestep.float(),
-            encoder_hidden_states,
-            added_time_ids,
-            return_dict=return_dict,
-        )
optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py
@@ -1,15 +0,0 @@
-# Copyright 2025 Rebellions Inc. All rights reserved.
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at:
-
-# http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .pipeline_stable_video_diffusion import RBLNStableVideoDiffusionPipeline
optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py
@@ -1,46 +0,0 @@
-# Copyright 2025 Rebellions Inc. All rights reserved.
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at:
-
-# http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from diffusers import StableVideoDiffusionPipeline
-
-from ....utils.logging import get_logger
-from ...configurations import RBLNStableVideoDiffusionPipelineConfig
-from ...modeling_diffusers import RBLNDiffusionMixin
-
-
-logger = get_logger(__name__)
-
-
-class RBLNStableVideoDiffusionPipeline(RBLNDiffusionMixin, StableVideoDiffusionPipeline):
-    """
-    RBLN-accelerated implementation of Stable Video Diffusion pipeline for image-to-video generation.
-
-    This pipeline compiles Stable Video Diffusion models to run efficiently on RBLN NPUs, enabling high-performance
-    inference for generating videos from images with optimized memory usage and throughput.
-    """
-
-    original_class = StableVideoDiffusionPipeline
-    _rbln_config_class = RBLNStableVideoDiffusionPipelineConfig
-    _submodules = ["image_encoder", "unet", "vae"]
-
-    def handle_additional_kwargs(self, **kwargs):
-        compiled_num_frames = self.unet.rbln_config.num_frames
-        if compiled_num_frames is not None:
-            kwargs["num_frames"] = compiled_num_frames
-
-        compiled_decode_chunk_size = self.vae.rbln_config.decode_chunk_size
-        if compiled_decode_chunk_size is not None:
-            kwargs["decode_chunk_size"] = compiled_decode_chunk_size
-        return kwargs