optimum-rbln 0.7.4a8__py3-none-any.whl → 0.7.5a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +1 -0
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +30 -4
- optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +8 -0
- optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +8 -0
- optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +8 -0
- optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +8 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +32 -17
- optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +19 -15
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +8 -8
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +8 -8
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +8 -8
- optimum/rbln/diffusers/modeling_diffusers.py +9 -2
- optimum/rbln/diffusers/models/controlnet.py +1 -1
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +1 -1
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +1 -1
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +2 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +2 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +2 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +2 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +2 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +2 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py +2 -0
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +2 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +2 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +2 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +2 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +2 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +2 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +2 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +2 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +2 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +2 -0
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +31 -1
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +103 -59
- optimum/rbln/utils/import_utils.py +23 -6
- {optimum_rbln-0.7.4a8.dist-info → optimum_rbln-0.7.5a0.dist-info}/METADATA +1 -1
- {optimum_rbln-0.7.4a8.dist-info → optimum_rbln-0.7.5a0.dist-info}/RECORD +41 -41
- {optimum_rbln-0.7.4a8.dist-info → optimum_rbln-0.7.5a0.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.7.4a8.dist-info → optimum_rbln-0.7.5a0.dist-info}/licenses/LICENSE +0 -0
@@ -37,6 +37,7 @@ from diffusers.utils import deprecate, logging
|
|
37
37
|
from diffusers.utils.torch_utils import is_compiled_module, is_torch_version
|
38
38
|
|
39
39
|
from ....utils.decorator_utils import remove_compile_time_kwargs
|
40
|
+
from ...configurations import RBLNStableDiffusionXLControlNetPipelineConfig
|
40
41
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
41
42
|
from ...models import RBLNControlNetModel
|
42
43
|
from ...pipelines.controlnet.multicontrolnet import RBLNMultiControlNetModel
|
@@ -47,6 +48,7 @@ logger = logging.get_logger(__name__)
|
|
47
48
|
|
48
49
|
class RBLNStableDiffusionXLControlNetPipeline(RBLNDiffusionMixin, StableDiffusionXLControlNetPipeline):
|
49
50
|
original_class = StableDiffusionXLControlNetPipeline
|
51
|
+
_rbln_config_class = RBLNStableDiffusionXLControlNetPipelineConfig
|
50
52
|
_submodules = ["text_encoder", "text_encoder_2", "unet", "vae", "controlnet"]
|
51
53
|
|
52
54
|
# Almost copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.py
|
@@ -37,6 +37,7 @@ from diffusers.utils import deprecate, logging
|
|
37
37
|
from diffusers.utils.torch_utils import is_compiled_module
|
38
38
|
|
39
39
|
from ....utils.decorator_utils import remove_compile_time_kwargs
|
40
|
+
from ...configurations import RBLNStableDiffusionXLControlNetImg2ImgPipelineConfig
|
40
41
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
41
42
|
from ...models import RBLNControlNetModel
|
42
43
|
from ...pipelines.controlnet.multicontrolnet import RBLNMultiControlNetModel
|
@@ -47,6 +48,7 @@ logger = logging.get_logger(__name__)
|
|
47
48
|
|
48
49
|
class RBLNStableDiffusionXLControlNetImg2ImgPipeline(RBLNDiffusionMixin, StableDiffusionXLControlNetImg2ImgPipeline):
|
49
50
|
original_class = StableDiffusionXLControlNetImg2ImgPipeline
|
51
|
+
_rbln_config_class = RBLNStableDiffusionXLControlNetImg2ImgPipelineConfig
|
50
52
|
_submodules = ["text_encoder", "text_encoder_2", "unet", "vae", "controlnet"]
|
51
53
|
|
52
54
|
# Almost copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl_img2img.py
|
@@ -14,11 +14,13 @@
|
|
14
14
|
|
15
15
|
from diffusers import KandinskyV22Pipeline
|
16
16
|
|
17
|
+
from ...configurations import RBLNKandinskyV22PipelineConfig
|
17
18
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
18
19
|
|
19
20
|
|
20
21
|
class RBLNKandinskyV22Pipeline(RBLNDiffusionMixin, KandinskyV22Pipeline):
|
21
22
|
original_class = KandinskyV22Pipeline
|
23
|
+
_rbln_config_class = RBLNKandinskyV22PipelineConfig
|
22
24
|
_submodules = ["unet", "movq"]
|
23
25
|
|
24
26
|
def get_compiled_image_size(self):
|
@@ -29,6 +29,7 @@ from transformers import (
|
|
29
29
|
CLIPVisionModelWithProjection,
|
30
30
|
)
|
31
31
|
|
32
|
+
from ...configurations import RBLNKandinskyV22CombinedPipelineConfig
|
32
33
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
33
34
|
from .pipeline_kandinsky2_2 import RBLNKandinskyV22Pipeline
|
34
35
|
from .pipeline_kandinsky2_2_img2img import RBLNKandinskyV22Img2ImgPipeline
|
@@ -38,6 +39,7 @@ from .pipeline_kandinsky2_2_prior import RBLNKandinskyV22PriorPipeline
|
|
38
39
|
|
39
40
|
class RBLNKandinskyV22CombinedPipeline(RBLNDiffusionMixin, KandinskyV22CombinedPipeline):
|
40
41
|
original_class = KandinskyV22CombinedPipeline
|
42
|
+
_rbln_config_class = RBLNKandinskyV22CombinedPipelineConfig
|
41
43
|
_connected_classes = {"prior_pipe": RBLNKandinskyV22PriorPipeline, "decoder_pipe": RBLNKandinskyV22Pipeline}
|
42
44
|
_submodules = ["prior_image_encoder", "prior_text_encoder", "prior_prior", "unet", "movq"]
|
43
45
|
_prefix = {"prior_pipe": "prior_"}
|
@@ -14,11 +14,13 @@
|
|
14
14
|
|
15
15
|
from diffusers import KandinskyV22Img2ImgPipeline
|
16
16
|
|
17
|
+
from ...configurations import RBLNKandinskyV22Img2ImgPipelineConfig
|
17
18
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
18
19
|
|
19
20
|
|
20
21
|
class RBLNKandinskyV22Img2ImgPipeline(RBLNDiffusionMixin, KandinskyV22Img2ImgPipeline):
|
21
22
|
original_class = KandinskyV22Img2ImgPipeline
|
23
|
+
_rbln_config_class = RBLNKandinskyV22Img2ImgPipelineConfig
|
22
24
|
_submodules = ["unet", "movq"]
|
23
25
|
|
24
26
|
def get_compiled_image_size(self):
|
@@ -14,11 +14,13 @@
|
|
14
14
|
|
15
15
|
from diffusers import KandinskyV22InpaintPipeline
|
16
16
|
|
17
|
+
from ...configurations import RBLNKandinskyV22InpaintPipelineConfig
|
17
18
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
18
19
|
|
19
20
|
|
20
21
|
class RBLNKandinskyV22InpaintPipeline(RBLNDiffusionMixin, KandinskyV22InpaintPipeline):
|
21
22
|
original_class = KandinskyV22InpaintPipeline
|
23
|
+
_rbln_config_class = RBLNKandinskyV22InpaintPipelineConfig
|
22
24
|
_submodules = ["unet", "movq"]
|
23
25
|
|
24
26
|
def get_compiled_image_size(self):
|
@@ -14,9 +14,11 @@
|
|
14
14
|
|
15
15
|
from diffusers import KandinskyV22PriorPipeline
|
16
16
|
|
17
|
+
from ...configurations import RBLNKandinskyV22PriorPipelineConfig
|
17
18
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
18
19
|
|
19
20
|
|
20
21
|
class RBLNKandinskyV22PriorPipeline(RBLNDiffusionMixin, KandinskyV22PriorPipeline):
|
21
22
|
original_class = KandinskyV22PriorPipeline
|
23
|
+
_rbln_config_class = RBLNKandinskyV22PriorPipelineConfig
|
22
24
|
_submodules = ["text_encoder", "image_encoder", "prior"]
|
@@ -15,9 +15,11 @@
|
|
15
15
|
|
16
16
|
from diffusers import StableDiffusionPipeline
|
17
17
|
|
18
|
+
from ...configurations import RBLNStableDiffusionPipelineConfig
|
18
19
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
19
20
|
|
20
21
|
|
21
22
|
class RBLNStableDiffusionPipeline(RBLNDiffusionMixin, StableDiffusionPipeline):
|
22
23
|
original_class = StableDiffusionPipeline
|
24
|
+
_rbln_config_class = RBLNStableDiffusionPipelineConfig
|
23
25
|
_submodules = ["vae", "text_encoder", "unet"]
|
@@ -14,9 +14,11 @@
|
|
14
14
|
|
15
15
|
from diffusers import StableDiffusionImg2ImgPipeline
|
16
16
|
|
17
|
+
from ...configurations import RBLNStableDiffusionImg2ImgPipelineConfig
|
17
18
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
18
19
|
|
19
20
|
|
20
21
|
class RBLNStableDiffusionImg2ImgPipeline(RBLNDiffusionMixin, StableDiffusionImg2ImgPipeline):
|
21
22
|
original_class = StableDiffusionImg2ImgPipeline
|
23
|
+
_rbln_config_class = RBLNStableDiffusionImg2ImgPipelineConfig
|
22
24
|
_submodules = ["text_encoder", "unet", "vae"]
|
@@ -14,9 +14,11 @@
|
|
14
14
|
|
15
15
|
from diffusers import StableDiffusionInpaintPipeline
|
16
16
|
|
17
|
+
from ...configurations import RBLNStableDiffusionInpaintPipelineConfig
|
17
18
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
18
19
|
|
19
20
|
|
20
21
|
class RBLNStableDiffusionInpaintPipeline(RBLNDiffusionMixin, StableDiffusionInpaintPipeline):
|
21
22
|
original_class = StableDiffusionInpaintPipeline
|
23
|
+
_rbln_config_class = RBLNStableDiffusionInpaintPipelineConfig
|
22
24
|
_submodules = ["text_encoder", "unet", "vae"]
|
@@ -14,9 +14,11 @@
|
|
14
14
|
|
15
15
|
from diffusers import StableDiffusion3Pipeline
|
16
16
|
|
17
|
+
from ...configurations import RBLNStableDiffusion3PipelineConfig
|
17
18
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
18
19
|
|
19
20
|
|
20
21
|
class RBLNStableDiffusion3Pipeline(RBLNDiffusionMixin, StableDiffusion3Pipeline):
|
21
22
|
original_class = StableDiffusion3Pipeline
|
23
|
+
_rbln_config_class = RBLNStableDiffusion3PipelineConfig
|
22
24
|
_submodules = ["transformer", "text_encoder_3", "text_encoder", "text_encoder_2", "vae"]
|
@@ -14,9 +14,11 @@
|
|
14
14
|
|
15
15
|
from diffusers import StableDiffusion3Img2ImgPipeline
|
16
16
|
|
17
|
+
from ...configurations import RBLNStableDiffusion3Img2ImgPipelineConfig
|
17
18
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
18
19
|
|
19
20
|
|
20
21
|
class RBLNStableDiffusion3Img2ImgPipeline(RBLNDiffusionMixin, StableDiffusion3Img2ImgPipeline):
|
21
22
|
original_class = StableDiffusion3Img2ImgPipeline
|
23
|
+
_rbln_config_class = RBLNStableDiffusion3Img2ImgPipelineConfig
|
22
24
|
_submodules = ["transformer", "text_encoder_3", "text_encoder", "text_encoder_2", "vae"]
|
@@ -14,9 +14,11 @@
|
|
14
14
|
|
15
15
|
from diffusers import StableDiffusion3InpaintPipeline
|
16
16
|
|
17
|
+
from ...configurations import RBLNStableDiffusion3InpaintPipelineConfig
|
17
18
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
18
19
|
|
19
20
|
|
20
21
|
class RBLNStableDiffusion3InpaintPipeline(RBLNDiffusionMixin, StableDiffusion3InpaintPipeline):
|
21
22
|
original_class = StableDiffusion3InpaintPipeline
|
23
|
+
_rbln_config_class = RBLNStableDiffusion3InpaintPipelineConfig
|
22
24
|
_submodules = ["transformer", "text_encoder_3", "text_encoder", "text_encoder_2", "vae"]
|
@@ -14,9 +14,11 @@
|
|
14
14
|
|
15
15
|
from diffusers import StableDiffusionXLPipeline
|
16
16
|
|
17
|
+
from ...configurations import RBLNStableDiffusionXLPipelineConfig
|
17
18
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
18
19
|
|
19
20
|
|
20
21
|
class RBLNStableDiffusionXLPipeline(RBLNDiffusionMixin, StableDiffusionXLPipeline):
|
21
22
|
original_class = StableDiffusionXLPipeline
|
23
|
+
_rbln_config_class = RBLNStableDiffusionXLPipelineConfig
|
22
24
|
_submodules = ["text_encoder", "text_encoder_2", "unet", "vae"]
|
optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
CHANGED
@@ -14,9 +14,11 @@
|
|
14
14
|
|
15
15
|
from diffusers import StableDiffusionXLImg2ImgPipeline
|
16
16
|
|
17
|
+
from ...configurations import RBLNStableDiffusionXLImg2ImgPipelineConfig
|
17
18
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
18
19
|
|
19
20
|
|
20
21
|
class RBLNStableDiffusionXLImg2ImgPipeline(RBLNDiffusionMixin, StableDiffusionXLImg2ImgPipeline):
|
21
22
|
original_class = StableDiffusionXLImg2ImgPipeline
|
23
|
+
_rbln_config_class = RBLNStableDiffusionXLImg2ImgPipelineConfig
|
22
24
|
_submodules = ["text_encoder", "text_encoder_2", "unet", "vae"]
|
optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
CHANGED
@@ -14,9 +14,11 @@
|
|
14
14
|
|
15
15
|
from diffusers import StableDiffusionXLInpaintPipeline
|
16
16
|
|
17
|
+
from ...configurations import RBLNStableDiffusionXLInpaintPipelineConfig
|
17
18
|
from ...modeling_diffusers import RBLNDiffusionMixin
|
18
19
|
|
19
20
|
|
20
21
|
class RBLNStableDiffusionXLInpaintPipeline(RBLNDiffusionMixin, StableDiffusionXLInpaintPipeline):
|
21
22
|
original_class = StableDiffusionXLInpaintPipeline
|
23
|
+
_rbln_config_class = RBLNStableDiffusionXLInpaintPipelineConfig
|
22
24
|
_submodules = ["text_encoder", "text_encoder_2", "unet", "vae"]
|
@@ -12,7 +12,7 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
-
from typing import Any, Dict, Optional
|
15
|
+
from typing import Any, Dict, List, Optional
|
16
16
|
|
17
17
|
import rebel
|
18
18
|
|
@@ -37,6 +37,7 @@ class RBLNDecoderOnlyModelForCausalLMConfig(RBLNModelConfig):
|
|
37
37
|
quantization: Optional[Dict[str, Any]] = None,
|
38
38
|
prefill_chunk_size: Optional[int] = None,
|
39
39
|
kvcache_num_blocks: Optional[int] = None,
|
40
|
+
decoder_batch_sizes: Optional[List[int]] = None,
|
40
41
|
**kwargs,
|
41
42
|
):
|
42
43
|
"""
|
@@ -53,6 +54,13 @@ class RBLNDecoderOnlyModelForCausalLMConfig(RBLNModelConfig):
|
|
53
54
|
prefill_chunk_size (Optional[int]): The chunk size for prefilling the KV cache. Defaults to 128,
|
54
55
|
and must be a positive integer divisible by 64.
|
55
56
|
kvcache_num_blocks (Optional[int]): The number of blocks in the KV cache.
|
57
|
+
decoder_batch_sizes (Optional[List[int]]): A list of batch sizes for which separate decoder models will be compiled.
|
58
|
+
This allows the model to handle varying batch sizes efficiently during generation. If not specified,
|
59
|
+
defaults to a list containing only the model's main batch size. When specifying multiple batch sizes:
|
60
|
+
1) All values must be less than or equal to the main batch size.
|
61
|
+
2) The list will be sorted in descending order (larger batch sizes first).
|
62
|
+
3) If using multiple decoders, at least one batch size should match the main batch size.
|
63
|
+
|
56
64
|
**kwargs: Additional arguments passed to the parent RBLNModelConfig.
|
57
65
|
|
58
66
|
Raises:
|
@@ -88,3 +96,25 @@ class RBLNDecoderOnlyModelForCausalLMConfig(RBLNModelConfig):
|
|
88
96
|
raise ValueError("`prefill_chunk_size` must be a positive integer divisible by 64.")
|
89
97
|
|
90
98
|
self.kvcache_num_blocks = kvcache_num_blocks
|
99
|
+
self.decoder_batch_sizes = decoder_batch_sizes
|
100
|
+
if self.decoder_batch_sizes is None:
|
101
|
+
self.decoder_batch_sizes = [self.batch_size]
|
102
|
+
|
103
|
+
if self.use_multiple_decoder:
|
104
|
+
if max(self.decoder_batch_sizes) > self.batch_size:
|
105
|
+
raise ValueError(
|
106
|
+
f"Decoder batch size ({max(self.decoder_batch_sizes)}) must be less than or equal to the runtime batch size ({self.batch_size})."
|
107
|
+
)
|
108
|
+
if max(self.decoder_batch_sizes) < self.batch_size:
|
109
|
+
logger.warning(
|
110
|
+
f"Maximum decoder batch size ({max(self.decoder_batch_sizes)}) is less than the model's batch size ({self.batch_size}). "
|
111
|
+
"Appending the model's batch size to the decoder batch size."
|
112
|
+
)
|
113
|
+
self.decoder_batch_sizes.append(self.batch_size)
|
114
|
+
|
115
|
+
# Larger batch size should be at the beginning of the list.
|
116
|
+
self.decoder_batch_sizes.sort(reverse=True)
|
117
|
+
|
118
|
+
@property
|
119
|
+
def use_multiple_decoder(self):
|
120
|
+
return isinstance(self.decoder_batch_sizes, list) and len(self.decoder_batch_sizes) > 1
|
@@ -229,6 +229,12 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
|
|
229
229
|
|
230
230
|
attention_mask = self.dec_attn_mask
|
231
231
|
|
232
|
+
if self.batch_size < block_tables.shape[0]:
|
233
|
+
block_tables = block_tables[: self.batch_size]
|
234
|
+
|
235
|
+
if self.batch_size < attention_mask.shape[0]:
|
236
|
+
attention_mask = attention_mask[: self.batch_size]
|
237
|
+
|
232
238
|
logits = super().forward(
|
233
239
|
inputs,
|
234
240
|
cache_position,
|
@@ -417,19 +423,24 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
|
|
417
423
|
use_attention_mask=self.rbln_config.use_attention_mask,
|
418
424
|
attn_impl=self.rbln_config.attn_impl,
|
419
425
|
)
|
420
|
-
self.
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
426
|
+
self.decoders = {}
|
427
|
+
for i, batch_size in enumerate(self.rbln_config.decoder_batch_sizes):
|
428
|
+
self.decoders[batch_size] = RBLNRuntimeModel(
|
429
|
+
runtime=self.model[i + 1],
|
430
|
+
main_input_name=main_input_name,
|
431
|
+
embed_tokens=self.embed_tokens,
|
432
|
+
phase="decode",
|
433
|
+
batch_size=batch_size,
|
434
|
+
dec_attn_mask=dec_attn_mask,
|
435
|
+
block_tables=block_tables,
|
436
|
+
free_block_pool=free_block_pool,
|
437
|
+
kvcache_block_size=self.rbln_config.kvcache_block_size,
|
438
|
+
use_attention_mask=self.rbln_config.use_attention_mask,
|
439
|
+
attn_impl=self.rbln_config.attn_impl,
|
440
|
+
)
|
441
|
+
|
442
|
+
# NOTE(eunji): Use a decoder whose batch size matches the model's main batch size for compatibility.
|
443
|
+
self.decoder = self.decoders[self.rbln_config.batch_size]
|
433
444
|
|
434
445
|
@classmethod
|
435
446
|
def save_torch_artifacts(
|
@@ -547,7 +558,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
|
|
547
558
|
|
548
559
|
rbln_compile_configs = rbln_config.compile_cfgs
|
549
560
|
prefill_compile_config = rbln_compile_configs[0]
|
550
|
-
dec_compile_config = rbln_compile_configs[1]
|
551
561
|
|
552
562
|
context = CompileContext(use_weight_sharing=True)
|
553
563
|
|
@@ -562,33 +572,42 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
|
|
562
572
|
static_tensors[name] = tensor
|
563
573
|
context.mark_static_address(tensor)
|
564
574
|
|
565
|
-
dec_example_inputs = dec_compile_config.get_dummy_inputs(fill=0, static_tensors=static_tensors)
|
566
|
-
|
567
575
|
@QuantizationManager.with_quantization_env
|
568
|
-
def compile_model(
|
576
|
+
def compile_model(wrapped_model, compile_config, example_inputs, compile_context, **kwargs):
|
569
577
|
try:
|
570
578
|
original_linear = torch.nn.functional.linear
|
571
579
|
torch.nn.functional.linear = torch.ops.rbln_custom_ops.linear
|
572
|
-
|
573
|
-
compiled_prefill = RBLNModel.compile(
|
580
|
+
compiled_model = RBLNModel.compile(
|
574
581
|
wrapped_model,
|
575
|
-
|
576
|
-
example_inputs=
|
577
|
-
compile_context=
|
582
|
+
compile_config,
|
583
|
+
example_inputs=example_inputs,
|
584
|
+
compile_context=compile_context,
|
578
585
|
)
|
579
|
-
|
580
|
-
wrapped_model.phase = "decode"
|
581
|
-
compiled_decoder = RBLNModel.compile(
|
582
|
-
wrapped_model,
|
583
|
-
dec_compile_config,
|
584
|
-
example_inputs=dec_example_inputs,
|
585
|
-
compile_context=context,
|
586
|
-
)
|
587
|
-
return {"prefill": compiled_prefill, "decoder": compiled_decoder}
|
586
|
+
return compiled_model
|
588
587
|
finally:
|
589
588
|
torch.nn.functional.linear = original_linear
|
590
589
|
|
591
|
-
|
590
|
+
wrapped_model.phase = "prefill"
|
591
|
+
compiled_prefill = compile_model(
|
592
|
+
wrapped_model,
|
593
|
+
prefill_compile_config,
|
594
|
+
prefill_example_inputs,
|
595
|
+
context,
|
596
|
+
quantize_config=rbln_config.quantization,
|
597
|
+
)
|
598
|
+
|
599
|
+
wrapped_model.phase = "decode"
|
600
|
+
compiled_models = {"prefill": compiled_prefill}
|
601
|
+
for batch_size, dec_compile_config in zip(rbln_config.decoder_batch_sizes, rbln_compile_configs[1:]):
|
602
|
+
dec_example_inputs = dec_compile_config.get_dummy_inputs(fill=0, static_tensors=static_tensors)
|
603
|
+
compiled_decoder = compile_model(
|
604
|
+
wrapped_model,
|
605
|
+
dec_compile_config,
|
606
|
+
dec_example_inputs,
|
607
|
+
context,
|
608
|
+
quantize_config=rbln_config.quantization,
|
609
|
+
)
|
610
|
+
compiled_models[f"decoder_batch_{batch_size}"] = compiled_decoder
|
592
611
|
|
593
612
|
# check if the memory is enough to have additional blocks
|
594
613
|
required_num_blocks = (rbln_config.max_seq_len // rbln_config.kvcache_block_size) * rbln_config.batch_size
|
@@ -613,8 +632,11 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
|
|
613
632
|
alloc_memory_by_key: Dict[str, int] = {
|
614
633
|
key: sum(memory_per_node) for key, memory_per_node in alloc_memory_per_node_by_key.items()
|
615
634
|
}
|
616
|
-
for
|
617
|
-
|
635
|
+
for batch_size in rbln_config.decoder_batch_sizes:
|
636
|
+
for key, memory_per_node in (
|
637
|
+
compiled_models[f"decoder_batch_{batch_size}"].get_alloc_per_node_by_key().items()
|
638
|
+
):
|
639
|
+
alloc_memory_by_key[key] += sum(memory_per_node)
|
618
640
|
alloc_memory_by_key.pop("PortRecur") # kv-cache
|
619
641
|
kernel_size = alloc_memory_by_key.pop("Kernel") # model weight
|
620
642
|
|
@@ -650,6 +672,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
|
|
650
672
|
n_model_params: Optional[int] = None,
|
651
673
|
kernel_size: Optional[int] = None,
|
652
674
|
buffer: Optional[int] = None,
|
675
|
+
num_runtimes: int = 2,
|
653
676
|
) -> int:
|
654
677
|
"""
|
655
678
|
We are finding max_n_blocks(x) that satisfies the following equation:
|
@@ -721,7 +744,8 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
|
|
721
744
|
|
722
745
|
if buffer is None:
|
723
746
|
# TODO: Accurate buffer estimation
|
724
|
-
|
747
|
+
buffer_per_runtime_per_core = 2**28 # 256MB per runtime
|
748
|
+
buffer_per_core = buffer_per_runtime_per_core * num_runtimes # 1 for prefill, 1 for decoder
|
725
749
|
buffer = buffer_per_core * tensor_parallel_size
|
726
750
|
available_dram -= buffer
|
727
751
|
|
@@ -839,6 +863,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
|
|
839
863
|
kvcache_block_size=rbln_config.kvcache_block_size,
|
840
864
|
nbits_per_param=16 if not rbln_config.quantization else 4, # TODO(jongho): FIX Ad-hoc
|
841
865
|
n_model_params=sum(p.numel() for p in model.parameters()),
|
866
|
+
num_runtimes=1 + len(rbln_config.decoder_batch_sizes),
|
842
867
|
)
|
843
868
|
|
844
869
|
max_num_blocks = min(max_num_blocks, estimated_max_num_blocks)
|
@@ -881,24 +906,28 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
|
|
881
906
|
hidden_size=hidden_size,
|
882
907
|
head_dim=head_dim,
|
883
908
|
)
|
884
|
-
dec_input_info = cls.get_input_info(
|
885
|
-
batch_size=rbln_config.batch_size,
|
886
|
-
query_length=1,
|
887
|
-
use_inputs_embeds=rbln_config.use_inputs_embeds,
|
888
|
-
use_attention_mask=rbln_config.use_attention_mask,
|
889
|
-
max_seq_len=rbln_config.max_seq_len,
|
890
|
-
kvcache_block_size=rbln_config.kvcache_block_size,
|
891
|
-
kvcache_num_blocks=rbln_config.kvcache_num_blocks,
|
892
|
-
num_key_value_heads=num_key_value_heads,
|
893
|
-
num_hidden_layers=num_hidden_layers,
|
894
|
-
hidden_size=hidden_size,
|
895
|
-
head_dim=head_dim,
|
896
|
-
)
|
897
909
|
|
898
910
|
prefill_compile_config = RBLNCompileConfig(compiled_model_name="prefill", input_info=prefill_input_info)
|
899
|
-
dec_compile_config = RBLNCompileConfig(compiled_model_name="decoder", input_info=dec_input_info)
|
900
911
|
|
901
|
-
|
912
|
+
dec_compile_configs = []
|
913
|
+
for batch_size in rbln_config.decoder_batch_sizes:
|
914
|
+
dec_input_info = cls.get_input_info(
|
915
|
+
batch_size=batch_size,
|
916
|
+
query_length=1,
|
917
|
+
use_inputs_embeds=rbln_config.use_inputs_embeds,
|
918
|
+
use_attention_mask=rbln_config.use_attention_mask,
|
919
|
+
max_seq_len=rbln_config.max_seq_len,
|
920
|
+
kvcache_block_size=rbln_config.kvcache_block_size,
|
921
|
+
kvcache_num_blocks=rbln_config.kvcache_num_blocks,
|
922
|
+
num_key_value_heads=num_key_value_heads,
|
923
|
+
num_hidden_layers=num_hidden_layers,
|
924
|
+
hidden_size=hidden_size,
|
925
|
+
head_dim=head_dim,
|
926
|
+
)
|
927
|
+
dec_compile_configs.append(
|
928
|
+
RBLNCompileConfig(compiled_model_name=f"decoder_batch_{batch_size}", input_info=dec_input_info)
|
929
|
+
)
|
930
|
+
rbln_config.set_compile_cfgs([prefill_compile_config, *dec_compile_configs])
|
902
931
|
|
903
932
|
return rbln_config
|
904
933
|
|
@@ -908,8 +937,12 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
|
|
908
937
|
compiled_models: List[rebel.RBLNCompiledModel],
|
909
938
|
rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
|
910
939
|
) -> List[rebel.Runtime]:
|
911
|
-
|
912
|
-
|
940
|
+
expected_model_names = [
|
941
|
+
"prefill",
|
942
|
+
*[f"decoder_batch_{batch_size}" for batch_size in rbln_config.decoder_batch_sizes],
|
943
|
+
]
|
944
|
+
if any(model_name not in rbln_config.device_map for model_name in expected_model_names):
|
945
|
+
cls._raise_missing_compiled_file_error(expected_model_names)
|
913
946
|
|
914
947
|
return [
|
915
948
|
rebel.Runtime(
|
@@ -918,12 +951,15 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
|
|
918
951
|
device=rbln_config.device_map["prefill"],
|
919
952
|
activate_profiler=rbln_config.activate_profiler,
|
920
953
|
),
|
921
|
-
|
922
|
-
|
923
|
-
|
924
|
-
|
925
|
-
|
926
|
-
|
954
|
+
*[
|
955
|
+
rebel.Runtime(
|
956
|
+
compiled_models[i + 1],
|
957
|
+
tensor_type="pt",
|
958
|
+
device=rbln_config.device_map[f"decoder_batch_{batch_size}"],
|
959
|
+
activate_profiler=rbln_config.activate_profiler,
|
960
|
+
)
|
961
|
+
for i, batch_size in enumerate(rbln_config.decoder_batch_sizes)
|
962
|
+
],
|
927
963
|
]
|
928
964
|
|
929
965
|
def get_decoder(self):
|
@@ -1024,7 +1060,15 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
|
|
1024
1060
|
logits = torch.cat(logits, dim=0)
|
1025
1061
|
# Decoder
|
1026
1062
|
else:
|
1027
|
-
|
1063
|
+
inputs = inputs_embeds if inputs_embeds is not None else input_ids
|
1064
|
+
batch_size = inputs.shape[0]
|
1065
|
+
if batch_size not in self.decoders:
|
1066
|
+
raise ValueError(
|
1067
|
+
f"No decoder runtime available for batch size {batch_size}. "
|
1068
|
+
f"Available batch sizes are: {list(self.decoders.keys())}. "
|
1069
|
+
f"Please run your model with one of these batch sizes or add support for batch size {batch_size}."
|
1070
|
+
)
|
1071
|
+
logits = self.decoders[batch_size](
|
1028
1072
|
input_ids=input_ids,
|
1029
1073
|
inputs_embeds=inputs_embeds,
|
1030
1074
|
cache_position=cache_position,
|
@@ -144,10 +144,27 @@ def check_version_compats() -> None:
|
|
144
144
|
except importlib.metadata.PackageNotFoundError:
|
145
145
|
warnings.warn(f"optimum-rbln requires {compat.package_name} to be installed.", ImportWarning)
|
146
146
|
continue
|
147
|
+
# For versions 0.7.2 and above, don't show warning for rebel-compiler if base versions match
|
147
148
|
|
148
|
-
if
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
149
|
+
if compat.package_name == "rebel-compiler":
|
150
|
+
# For optimum-rbln versions 0.7.2 and above, suppress the warning if the base versions of
|
151
|
+
# optimum-rbln and rebel-compiler match (e.g., 0.7.x with 0.7.y).
|
152
|
+
if (
|
153
|
+
Version(my_version) >= Version("0.7.2")
|
154
|
+
and Version(my_version).base_version == Version(dep_version).base_version
|
155
|
+
):
|
156
|
+
continue
|
157
|
+
else:
|
158
|
+
warnings.warn(
|
159
|
+
f"Version mismatch detected: optimum-rbln v{my_version} and {compat.package_name} v{dep_version} have different base versions. "
|
160
|
+
f"For optimal performance and compatibility, please ensure both packages share the same major and minor version numbers. "
|
161
|
+
"Please refer to our SDK release notes at https://docs.rbln.ai/about_atom/release_note.html",
|
162
|
+
ImportWarning,
|
163
|
+
)
|
164
|
+
else:
|
165
|
+
if not Version(compat.min_version) <= Version(dep_version) < Version(compat.max_version):
|
166
|
+
warnings.warn(
|
167
|
+
f"optimum-rbln v{my_version} is compatible to {compat.package_name} v{compat.min_version} to v{compat.max_version}. (you are currently using v{dep_version})\n"
|
168
|
+
"Please refer to our SDK release notes at https://docs.rbln.ai/about_atom/release_note.html",
|
169
|
+
ImportWarning,
|
170
|
+
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: optimum-rbln
|
3
|
-
Version: 0.7.4a8
|
3
|
+
Version: 0.7.5a0
|
4
4
|
Summary: Optimum RBLN is the interface between the HuggingFace Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
|
5
5
|
Project-URL: Homepage, https://rebellions.ai
|
6
6
|
Project-URL: Documentation, https://docs.rbln.ai
|