optimum-rbln 0.7.3a4__py3-none-any.whl → 0.7.3a6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
optimum/rbln/__init__.py CHANGED
@@ -78,9 +78,13 @@ _import_structure = {
78
78
  "RBLNAutoencoderKL",
79
79
  "RBLNControlNetModel",
80
80
  "RBLNPriorTransformer",
81
+ "RBLNKandinskyV22CombinedPipeline",
82
+ "RBLNKandinskyV22Img2ImgCombinedPipeline",
81
83
  "RBLNKandinskyV22InpaintCombinedPipeline",
82
84
  "RBLNKandinskyV22InpaintPipeline",
85
+ "RBLNKandinskyV22Img2ImgPipeline",
83
86
  "RBLNKandinskyV22PriorPipeline",
87
+ "RBLNKandinskyV22Pipeline",
84
88
  "RBLNStableDiffusionPipeline",
85
89
  "RBLNStableDiffusionXLPipeline",
86
90
  "RBLNUNet2DConditionModel",
@@ -107,8 +111,12 @@ if TYPE_CHECKING:
107
111
  RBLNAutoencoderKL,
108
112
  RBLNControlNetModel,
109
113
  RBLNDiffusionMixin,
114
+ RBLNKandinskyV22CombinedPipeline,
115
+ RBLNKandinskyV22Img2ImgCombinedPipeline,
116
+ RBLNKandinskyV22Img2ImgPipeline,
110
117
  RBLNKandinskyV22InpaintCombinedPipeline,
111
118
  RBLNKandinskyV22InpaintPipeline,
119
+ RBLNKandinskyV22Pipeline,
112
120
  RBLNKandinskyV22PriorPipeline,
113
121
  RBLNMultiControlNetModel,
114
122
  RBLNPriorTransformer,
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.7.3a4'
21
- __version_tuple__ = version_tuple = (0, 7, 3, 'a4')
20
+ __version__ = version = '0.7.3a6'
21
+ __version_tuple__ = version_tuple = (0, 7, 3, 'a6')
@@ -24,9 +24,13 @@ ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES["optimum.rbln"])
24
24
 
25
25
  _import_structure = {
26
26
  "pipelines": [
27
+ "RBLNKandinskyV22CombinedPipeline",
28
+ "RBLNKandinskyV22Img2ImgCombinedPipeline",
27
29
  "RBLNKandinskyV22InpaintCombinedPipeline",
28
30
  "RBLNKandinskyV22InpaintPipeline",
31
+ "RBLNKandinskyV22Img2ImgPipeline",
29
32
  "RBLNKandinskyV22PriorPipeline",
33
+ "RBLNKandinskyV22Pipeline",
30
34
  "RBLNStableDiffusionPipeline",
31
35
  "RBLNStableDiffusionXLPipeline",
32
36
  "RBLNStableDiffusionImg2ImgPipeline",
@@ -66,8 +70,12 @@ if TYPE_CHECKING:
66
70
  RBLNVQModel,
67
71
  )
68
72
  from .pipelines import (
73
+ RBLNKandinskyV22CombinedPipeline,
74
+ RBLNKandinskyV22Img2ImgCombinedPipeline,
75
+ RBLNKandinskyV22Img2ImgPipeline,
69
76
  RBLNKandinskyV22InpaintCombinedPipeline,
70
77
  RBLNKandinskyV22InpaintPipeline,
78
+ RBLNKandinskyV22Pipeline,
71
79
  RBLNKandinskyV22PriorPipeline,
72
80
  RBLNMultiControlNetModel,
73
81
  RBLNStableDiffusion3Img2ImgPipeline,
@@ -90,9 +90,17 @@ class RBLNVQModel(RBLNModel):
90
90
  model_config: "PretrainedConfig",
91
91
  rbln_kwargs: Dict[str, Any] = {},
92
92
  ) -> RBLNConfig:
93
- batch_size = rbln_kwargs.get("batch_size") or 1
94
- height = rbln_kwargs.get("img_height") or 512
95
- width = rbln_kwargs.get("img_width") or 512
93
+ batch_size = rbln_kwargs.get("batch_size")
94
+ if batch_size is None:
95
+ batch_size = 1
96
+
97
+ height = rbln_kwargs.get("img_height")
98
+ if height is None:
99
+ height = 512
100
+
101
+ width = rbln_kwargs.get("img_width")
102
+ if width is None:
103
+ width = 512
96
104
 
97
105
  if hasattr(model_config, "block_out_channels"):
98
106
  scale_factor = 2 ** (len(model_config.block_out_channels) - 1)
@@ -176,15 +176,22 @@ class RBLNUNet2DConditionModel(RBLNModel):
176
176
  raise ValueError("Both image height and image width must be given or not given")
177
177
  elif image_size[0] is None and image_size[1] is None:
178
178
  if rbln_config["img2img_pipeline"]:
179
- # In case of img2img, sample size of unet is determined by vae encoder.
180
- vae_sample_size = pipe.vae.config.sample_size
181
- if isinstance(vae_sample_size, int):
182
- sample_size = vae_sample_size // scale_factor
183
- else:
184
- sample_size = (
185
- vae_sample_size[0] // scale_factor,
186
- vae_sample_size[1] // scale_factor,
179
+ if hasattr(pipe, "vae"):
180
+ # In case of img2img, sample size of unet is determined by vae encoder.
181
+ vae_sample_size = pipe.vae.config.sample_size
182
+ if isinstance(vae_sample_size, int):
183
+ sample_size = vae_sample_size // scale_factor
184
+ else:
185
+ sample_size = (
186
+ vae_sample_size[0] // scale_factor,
187
+ vae_sample_size[1] // scale_factor,
188
+ )
189
+ elif hasattr(pipe, "movq"):
190
+ logger.warning(
191
+ "RBLN config 'img_height' and 'img_width' should have been provided for this pipeline. "
192
+ "Both variable will be set 512 by default."
187
193
  )
194
+ sample_size = (512 // scale_factor, 512 // scale_factor)
188
195
  else:
189
196
  sample_size = pipe.unet.config.sample_size
190
197
  else:
@@ -26,9 +26,13 @@ _import_structure = {
26
26
  "RBLNStableDiffusionXLControlNetPipeline",
27
27
  ],
28
28
  "kandinsky2_2": [
29
+ "RBLNKandinskyV22CombinedPipeline",
30
+ "RBLNKandinskyV22Img2ImgCombinedPipeline",
29
31
  "RBLNKandinskyV22InpaintCombinedPipeline",
30
32
  "RBLNKandinskyV22InpaintPipeline",
33
+ "RBLNKandinskyV22Img2ImgPipeline",
31
34
  "RBLNKandinskyV22PriorPipeline",
35
+ "RBLNKandinskyV22Pipeline",
32
36
  ],
33
37
  "stable_diffusion": [
34
38
  "RBLNStableDiffusionImg2ImgPipeline",
@@ -55,8 +59,12 @@ if TYPE_CHECKING:
55
59
  RBLNStableDiffusionXLControlNetPipeline,
56
60
  )
57
61
  from .kandinsky2_2 import (
62
+ RBLNKandinskyV22CombinedPipeline,
63
+ RBLNKandinskyV22Img2ImgCombinedPipeline,
64
+ RBLNKandinskyV22Img2ImgPipeline,
58
65
  RBLNKandinskyV22InpaintCombinedPipeline,
59
66
  RBLNKandinskyV22InpaintPipeline,
67
+ RBLNKandinskyV22Pipeline,
60
68
  RBLNKandinskyV22PriorPipeline,
61
69
  )
62
70
  from .stable_diffusion import (
@@ -12,6 +12,12 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from .pipeline_kandinsky2_2_combined import RBLNKandinskyV22InpaintCombinedPipeline
15
+ from .pipeline_kandinsky2_2 import RBLNKandinskyV22Pipeline
16
+ from .pipeline_kandinsky2_2_combined import (
17
+ RBLNKandinskyV22CombinedPipeline,
18
+ RBLNKandinskyV22Img2ImgCombinedPipeline,
19
+ RBLNKandinskyV22InpaintCombinedPipeline,
20
+ )
21
+ from .pipeline_kandinsky2_2_img2img import RBLNKandinskyV22Img2ImgPipeline
16
22
  from .pipeline_kandinsky2_2_inpaint import RBLNKandinskyV22InpaintPipeline
17
23
  from .pipeline_kandinsky2_2_prior import RBLNKandinskyV22PriorPipeline
@@ -0,0 +1,25 @@
1
+ # Copyright 2024 Rebellions Inc.
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at:
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from diffusers import KandinskyV22Pipeline
16
+
17
+ from ...modeling_diffusers import RBLNDiffusionMixin
18
+
19
+
20
+ class RBLNKandinskyV22Pipeline(RBLNDiffusionMixin, KandinskyV22Pipeline):
21
+ original_class = KandinskyV22Pipeline
22
+ _submodules = ["unet", "movq"]
23
+
24
+ def get_compiled_image_size(self):
25
+ return self.movq.image_size
@@ -14,6 +14,8 @@
14
14
 
15
15
  from diffusers import (
16
16
  DDPMScheduler,
17
+ KandinskyV22CombinedPipeline,
18
+ KandinskyV22Img2ImgCombinedPipeline,
17
19
  KandinskyV22InpaintCombinedPipeline,
18
20
  PriorTransformer,
19
21
  UnCLIPScheduler,
@@ -28,10 +30,114 @@ from transformers import (
28
30
  )
29
31
 
30
32
  from ...modeling_diffusers import RBLNDiffusionMixin
33
+ from .pipeline_kandinsky2_2 import RBLNKandinskyV22Pipeline
34
+ from .pipeline_kandinsky2_2_img2img import RBLNKandinskyV22Img2ImgPipeline
31
35
  from .pipeline_kandinsky2_2_inpaint import RBLNKandinskyV22InpaintPipeline
32
36
  from .pipeline_kandinsky2_2_prior import RBLNKandinskyV22PriorPipeline
33
37
 
34
38
 
39
+ class RBLNKandinskyV22CombinedPipeline(RBLNDiffusionMixin, KandinskyV22CombinedPipeline):
40
+ original_class = KandinskyV22CombinedPipeline
41
+ _connected_classes = {"prior_pipe": RBLNKandinskyV22PriorPipeline, "decoder_pipe": RBLNKandinskyV22Pipeline}
42
+ _submodules = ["prior_pipe", "decoder_pipe"]
43
+ _prefix = {"prior_pipe": "prior_"}
44
+
45
+ def __init__(
46
+ self,
47
+ unet: "UNet2DConditionModel",
48
+ scheduler: "DDPMScheduler",
49
+ movq: "VQModel",
50
+ prior_prior: "PriorTransformer",
51
+ prior_image_encoder: "CLIPVisionModelWithProjection",
52
+ prior_text_encoder: "CLIPTextModelWithProjection",
53
+ prior_tokenizer: "CLIPTokenizer",
54
+ prior_scheduler: "UnCLIPScheduler",
55
+ prior_image_processor: "CLIPImageProcessor",
56
+ ):
57
+ RBLNDiffusionMixin.__init__(self)
58
+ super(KandinskyV22CombinedPipeline, self).__init__()
59
+
60
+ self.register_modules(
61
+ unet=unet,
62
+ scheduler=scheduler,
63
+ movq=movq,
64
+ prior_prior=prior_prior,
65
+ prior_image_encoder=prior_image_encoder,
66
+ prior_text_encoder=prior_text_encoder,
67
+ prior_tokenizer=prior_tokenizer,
68
+ prior_scheduler=prior_scheduler,
69
+ prior_image_processor=prior_image_processor,
70
+ )
71
+
72
+ self.prior_pipe = RBLNKandinskyV22PriorPipeline(
73
+ prior=prior_prior,
74
+ image_encoder=prior_image_encoder,
75
+ text_encoder=prior_text_encoder,
76
+ tokenizer=prior_tokenizer,
77
+ scheduler=prior_scheduler,
78
+ image_processor=prior_image_processor,
79
+ )
80
+ self.decoder_pipe = RBLNKandinskyV22Pipeline(
81
+ unet=unet,
82
+ scheduler=scheduler,
83
+ movq=movq,
84
+ )
85
+
86
+ def get_compiled_image_size(self):
87
+ return self.movq.image_size
88
+
89
+
90
+ class RBLNKandinskyV22Img2ImgCombinedPipeline(RBLNDiffusionMixin, KandinskyV22Img2ImgCombinedPipeline):
91
+ original_class = KandinskyV22Img2ImgCombinedPipeline
92
+ _connected_classes = {"prior_pipe": RBLNKandinskyV22PriorPipeline, "decoder_pipe": RBLNKandinskyV22Img2ImgPipeline}
93
+ _submodules = ["prior_pipe", "decoder_pipe"]
94
+ _prefix = {"prior_pipe": "prior_"}
95
+
96
+ def __init__(
97
+ self,
98
+ unet: "UNet2DConditionModel",
99
+ scheduler: "DDPMScheduler",
100
+ movq: "VQModel",
101
+ prior_prior: "PriorTransformer",
102
+ prior_image_encoder: "CLIPVisionModelWithProjection",
103
+ prior_text_encoder: "CLIPTextModelWithProjection",
104
+ prior_tokenizer: "CLIPTokenizer",
105
+ prior_scheduler: "UnCLIPScheduler",
106
+ prior_image_processor: "CLIPImageProcessor",
107
+ ):
108
+ RBLNDiffusionMixin.__init__(self)
109
+ super(KandinskyV22Img2ImgCombinedPipeline, self).__init__()
110
+
111
+ self.register_modules(
112
+ unet=unet,
113
+ scheduler=scheduler,
114
+ movq=movq,
115
+ prior_prior=prior_prior,
116
+ prior_image_encoder=prior_image_encoder,
117
+ prior_text_encoder=prior_text_encoder,
118
+ prior_tokenizer=prior_tokenizer,
119
+ prior_scheduler=prior_scheduler,
120
+ prior_image_processor=prior_image_processor,
121
+ )
122
+
123
+ self.prior_pipe = RBLNKandinskyV22PriorPipeline(
124
+ prior=prior_prior,
125
+ image_encoder=prior_image_encoder,
126
+ text_encoder=prior_text_encoder,
127
+ tokenizer=prior_tokenizer,
128
+ scheduler=prior_scheduler,
129
+ image_processor=prior_image_processor,
130
+ )
131
+ self.decoder_pipe = RBLNKandinskyV22Img2ImgPipeline(
132
+ unet=unet,
133
+ scheduler=scheduler,
134
+ movq=movq,
135
+ )
136
+
137
+ def get_compiled_image_size(self):
138
+ return self.movq.image_size
139
+
140
+
35
141
  class RBLNKandinskyV22InpaintCombinedPipeline(RBLNDiffusionMixin, KandinskyV22InpaintCombinedPipeline):
36
142
  original_class = KandinskyV22InpaintCombinedPipeline
37
143
  _connected_classes = {"prior_pipe": RBLNKandinskyV22PriorPipeline, "decoder_pipe": RBLNKandinskyV22InpaintPipeline}
@@ -0,0 +1,25 @@
1
+ # Copyright 2024 Rebellions Inc.
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at:
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from diffusers import KandinskyV22Img2ImgPipeline
16
+
17
+ from ...modeling_diffusers import RBLNDiffusionMixin
18
+
19
+
20
+ class RBLNKandinskyV22Img2ImgPipeline(RBLNDiffusionMixin, KandinskyV22Img2ImgPipeline):
21
+ original_class = KandinskyV22Img2ImgPipeline
22
+ _submodules = ["unet", "movq"]
23
+
24
+ def get_compiled_image_size(self):
25
+ return self.movq.image_size
@@ -20,3 +20,6 @@ from ...modeling_diffusers import RBLNDiffusionMixin
20
20
  class RBLNKandinskyV22InpaintPipeline(RBLNDiffusionMixin, KandinskyV22InpaintPipeline):
21
21
  original_class = KandinskyV22InpaintPipeline
22
22
  _submodules = ["unet", "movq"]
23
+
24
+ def get_compiled_image_size(self):
25
+ return self.movq.image_size
@@ -282,15 +282,6 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
282
282
  **kwargs,
283
283
  )
284
284
 
285
- @classmethod
286
- def _check_compiled_models(
287
- cls, compiled_models: Dict[str, rebel.RBLNCompiledModel], rbln_config: RBLNConfig, config: "PretrainedConfig"
288
- ):
289
- # check compiled model can create runtimes.
290
- # this logic currently only works in LLM
291
- # fail when LLM model using Paged Attention can't guarantee max sequence length
292
- pass
293
-
294
285
  @classmethod
295
286
  def _from_compiled_models(
296
287
  cls,
@@ -305,8 +296,6 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
305
296
  if isinstance(model_save_dir, str):
306
297
  model_save_dir = Path(model_save_dir)
307
298
 
308
- cls._check_compiled_models(compiled_models=rbln_compiled_models, rbln_config=rbln_config, config=config)
309
-
310
299
  # FIXME:: Should we convert it?
311
300
  compiled_model_names = [cfg.compiled_model_name for cfg in rbln_config.compile_cfgs]
312
301
  rbln_compiled_models = [rbln_compiled_models[cm_name] for cm_name in compiled_model_names]
@@ -98,9 +98,9 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
98
98
  """
99
99
 
100
100
  NO_BLOCKS_ERROR = (
101
- "No memory blocks are available for allocation."
102
- "The generate() API cannot complete this inference task because Paged Attention is not fully supported by optimum-rbln."
103
- "This is supported by vllm-rbln (see: https://docs.rbln.ai/software/model_serving/vllm_support/vllm-rbln.html)."
101
+ "No memory blocks are available for allocation. "
102
+ "The generate() API cannot complete this inference task because Paged Attention is not fully supported by optimum-rbln. "
103
+ "This is supported by vllm-rbln (see: https://docs.rbln.ai/software/model_serving/vllm_support/vllm-rbln.html). "
104
104
  "Using vllm-rbln should fix this issue and enhance inference performance."
105
105
  )
106
106
 
@@ -575,59 +575,58 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
575
575
  nbits_per_param: int,
576
576
  n_model_params: int,
577
577
  ) -> int:
578
- num_attention_heads = getattr(config, "n_head", None) or getattr(config, "num_attention_heads")
579
- num_layers = getattr(config, "n_layer", None) or getattr(config, "num_hidden_layers")
580
- head_dim = getattr(config, "head_dim", None) or config.hidden_size // num_attention_heads
581
- vocab_size = config.vocab_size
582
- hidden_size = getattr(config, "n_embd", None) or getattr(config, "hidden_size")
583
- num_key_value_heads = getattr(config, "num_key_value_heads", None) or num_attention_heads
584
-
585
- TARGET_DRAM_LIMIT = int(tensor_parallel_size * 15.7 * 2**30) # 16GB # TODO(jongho): 더 정확한 값
586
-
587
578
  def align(x: int, nbytes: int) -> int:
588
579
  return int(math.ceil(x / nbytes) * nbytes)
589
580
 
590
581
  def align_2MB(x: int) -> int:
591
582
  return align(x, 2 * 1024 * 1024)
592
583
 
593
- def get_kernel_size() -> int:
594
- # TODO: Implement
595
- lm_heads_params = align(vocab_size, 64) * hidden_size
596
- lm_heads_nbytes = (
597
- align_2MB(lm_heads_params * nbits_per_param // 8 / tensor_parallel_size) * tensor_parallel_size
598
- )
584
+ num_attention_heads = getattr(config, "n_head", None) or getattr(config, "num_attention_heads")
585
+ num_layers = getattr(config, "n_layer", None) or getattr(config, "num_hidden_layers")
586
+ head_dim = getattr(config, "head_dim", None) or config.hidden_size // num_attention_heads
587
+ vocab_size = config.vocab_size
588
+ hidden_size = getattr(config, "n_embd", None) or getattr(config, "hidden_size")
589
+ num_key_value_heads = getattr(config, "num_key_value_heads", None) or num_attention_heads
599
590
 
600
- params = n_model_params - lm_heads_params
601
- layer_nbytes = (
602
- align_2MB(params * nbits_per_param // 8 / num_layers / tensor_parallel_size)
603
- * num_layers
604
- * tensor_parallel_size
605
- )
591
+ # TODO(jongho): Update if target npu is REBEL.
592
+ ATOM_DRAM_NBYTES = 16 * 2**30
593
+ ATOM_SYS_DRAM_NBYTES = 288 * 2**20
594
+ available_dram = tensor_parallel_size * (ATOM_DRAM_NBYTES - ATOM_SYS_DRAM_NBYTES)
606
595
 
607
- return layer_nbytes + lm_heads_nbytes
596
+ # Get estimated kernel size (approximated)
597
+ lm_heads_params = align(vocab_size, 64) * hidden_size
598
+ lm_heads_nbytes = (
599
+ align_2MB(lm_heads_params * nbits_per_param // 8 / tensor_parallel_size) * tensor_parallel_size
600
+ )
601
+ params = n_model_params - lm_heads_params
602
+ layer_nbytes = (
603
+ align_2MB(params * nbits_per_param // 8 / num_layers / tensor_parallel_size)
604
+ * num_layers
605
+ * tensor_parallel_size
606
+ )
607
+ kernel_size = layer_nbytes + lm_heads_nbytes
608
608
 
609
- available_dram = TARGET_DRAM_LIMIT - get_kernel_size()
609
+ available_dram -= kernel_size
610
610
 
611
- buffer = 2**30 # 1GB
612
- if tensor_parallel_size <= 2:
611
+ # TODO: Accurate buffer estimation
612
+ buffer = 2**30 # 1GB Buffer
613
+ if tensor_parallel_size <= 4:
613
614
  buffer /= 4
614
615
 
615
616
  available_dram -= buffer
616
617
 
617
- def get_nbytes_per_block() -> int:
618
- return (
619
- align_2MB(
620
- kvcache_block_size
621
- * head_dim
622
- * math.ceil(num_key_value_heads / tensor_parallel_size) # Shard
623
- * 2 # (fp16)
624
- )
625
- * num_layers
626
- * 2 # (k, v)
627
- * tensor_parallel_size
618
+ # Estimate nbytes per a single kvcache block
619
+ nbytes_per_block = (
620
+ align_2MB(
621
+ kvcache_block_size
622
+ * head_dim
623
+ * math.ceil(num_key_value_heads / tensor_parallel_size) # Shard
624
+ * 2 # (fp16)
628
625
  )
629
-
630
- nbytes_per_block = get_nbytes_per_block()
626
+ * num_layers
627
+ * 2 # (k, v)
628
+ * tensor_parallel_size
629
+ )
631
630
  n_blocks = available_dram // nbytes_per_block
632
631
 
633
632
  return n_blocks, nbytes_per_block
@@ -685,27 +684,28 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
685
684
  else:
686
685
  rbln_kvcache_block_size = rbln_kvcache_partition_len
687
686
 
688
- max_num_blocks, nbytes_per_block = cls.get_maximum_num_blocks(
689
- config=model_config,
690
- tensor_parallel_size=rbln_kwargs.get("tensor_parallel_size", 1),
691
- kvcache_block_size=rbln_kvcache_block_size,
692
- nbits_per_param=16 if rbln_quantization is None else 4, # TODO(jongho): FIX Ad-hoc
693
- n_model_params=rbln_kwargs["n_model_params"],
694
- )
695
- model_num_blocks = (rbln_max_seq_len // rbln_kvcache_block_size) * rbln_batch_size
696
- rbln_kvcache_num_blocks = min(model_num_blocks, max_num_blocks)
687
+ rbln_kvcache_num_blocks = (rbln_max_seq_len // rbln_kvcache_block_size) * rbln_batch_size
688
+ if rbln_attn_impl == "flash_attn":
689
+ max_num_blocks, _ = cls.get_maximum_num_blocks(
690
+ config=model_config,
691
+ tensor_parallel_size=rbln_kwargs.get("tensor_parallel_size", 1),
692
+ kvcache_block_size=rbln_kvcache_block_size,
693
+ nbits_per_param=16 if rbln_quantization is None else 4, # TODO(jongho): FIX Ad-hoc
694
+ n_model_params=rbln_kwargs["n_model_params"],
695
+ )
696
+ rbln_kvcache_num_blocks = min(rbln_kvcache_num_blocks, max_num_blocks)
697
697
 
698
- required_blocks = rbln_max_seq_len // rbln_kvcache_block_size + 1
699
- if rbln_kvcache_num_blocks < required_blocks:
700
- rbln_kvcache_num_blocks = required_blocks
698
+ required_blocks = rbln_max_seq_len // rbln_kvcache_block_size + 1
699
+ if rbln_kvcache_num_blocks < required_blocks:
700
+ rbln_kvcache_num_blocks = required_blocks
701
701
 
702
- logger.info(f"[KVCache] Compiling with num_blocks: {rbln_kvcache_num_blocks}")
702
+ logger.info(f"[KVCache] Compiling with num_blocks: {rbln_kvcache_num_blocks}")
703
703
 
704
- if rbln_kvcache_num_blocks < rbln_batch_size:
705
- raise RuntimeError(
706
- f"Batch size ({rbln_batch_size}) exceeds available KV cache blocks ({rbln_kvcache_num_blocks}). "
707
- "Ensure the number of blocks is at least equal to the batch size."
708
- )
704
+ if rbln_kvcache_num_blocks < rbln_batch_size:
705
+ raise RuntimeError(
706
+ f"Batch size ({rbln_batch_size}) exceeds available KV cache blocks ({rbln_kvcache_num_blocks}). "
707
+ "Ensure the number of blocks is at least equal to the batch size."
708
+ )
709
709
 
710
710
  num_attention_heads = getattr(model_config, "n_head", None) or getattr(model_config, "num_attention_heads")
711
711
  num_key_value_heads = getattr(model_config, "num_key_value_heads", None) or num_attention_heads
@@ -805,9 +805,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
805
805
  "kvcache_block_size": rbln_kvcache_block_size,
806
806
  "attn_impl": rbln_attn_impl,
807
807
  "kvcache_num_blocks": rbln_kvcache_num_blocks,
808
- "model_num_blocks": model_num_blocks,
809
- "max_num_blocks": max_num_blocks,
810
- "nbytes_per_block": nbytes_per_block,
811
808
  }
812
809
  )
813
810
 
@@ -92,7 +92,7 @@ class PhiLayer(DecoderOnlyLayer):
92
92
 
93
93
  hidden_states = self.get_pre_attention_layernorm()(hidden_states)
94
94
 
95
- attn_outputs, present_key_values = self.self_attn(
95
+ attn_output = self.self_attn(
96
96
  hidden_states=hidden_states,
97
97
  attention_mask=attention_mask,
98
98
  seq_positions=seq_positions,
@@ -104,9 +104,9 @@ class PhiLayer(DecoderOnlyLayer):
104
104
 
105
105
  feed_forward_hidden_states = self._original_mod.mlp(hidden_states)
106
106
 
107
- hidden_states = attn_outputs + feed_forward_hidden_states + residual
107
+ hidden_states = attn_output + feed_forward_hidden_states + residual
108
108
 
109
- return hidden_states, present_key_values
109
+ return hidden_states
110
110
 
111
111
 
112
112
  class PhiModel(DecoderOnlyModel):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: optimum-rbln
3
- Version: 0.7.3a4
3
+ Version: 0.7.3a6
4
4
  Summary: Optimum RBLN is the interface between the Hugging Face Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
5
5
  Project-URL: Homepage, https://rebellions.ai
6
6
  Project-URL: Documentation, https://docs.rbln.ai
@@ -1,31 +1,33 @@
1
- optimum/rbln/__init__.py,sha256=eHi15YM3989AcX52jka9rUmgAtlp1PHqMNwBEdOfuu8,6554
2
- optimum/rbln/__version__.py,sha256=MLlg_138GxyhciEP0ZB5dPN8vriXkicRnaZiwqygxOY,519
1
+ optimum/rbln/__init__.py,sha256=ZDzXcl-oAcYJhKjJMpotjbTih9awo7HzUb6T3MUEP6Q,6894
2
+ optimum/rbln/__version__.py,sha256=9voT1MrnPHKvqTeiZK8bNEZcPseZOq7N_U5etptnmTE,519
3
3
  optimum/rbln/modeling.py,sha256=nJsAs5zs--VVOYGFjYNpqfxYIemJIK4Lr0WEzlDLdP0,8390
4
- optimum/rbln/modeling_base.py,sha256=Ow73GVJF1N5cDFO8_rgirtGj1wC-cXBDyqXHW5PCybA,22270
4
+ optimum/rbln/modeling_base.py,sha256=dNCL-BhrWCpuOVkZaj8-MW567Tf4lLo3p3Z3ldjWJfU,21779
5
5
  optimum/rbln/modeling_config.py,sha256=7104bxmrvKW4Q6XTruQayiIGl8GHDFmPkJ3cknMIInE,11335
6
- optimum/rbln/diffusers/__init__.py,sha256=pOyoXv3-JRzTBSwPKbgLS9H6F2K9dJdReEmpGhcLQYU,3283
6
+ optimum/rbln/diffusers/__init__.py,sha256=Hq87CbtiCy85YmK2SB-OmUyfv77oe3j4bsTenTRnu6w,3623
7
7
  optimum/rbln/diffusers/modeling_diffusers.py,sha256=zqVNgH9oeOx2iNE7VsW_FinVf4s6G5Idyh4TKz7XJJg,21116
8
8
  optimum/rbln/diffusers/models/__init__.py,sha256=mkCvJyH1KcwrsUvYSq_bVC79oOfyqtBSFDyPS1_48wA,1478
9
9
  optimum/rbln/diffusers/models/controlnet.py,sha256=EM_HlzCdaZdnnK0oGpY2fQeigPqHhlwh4NHCzlmoumI,10512
10
10
  optimum/rbln/diffusers/models/autoencoders/__init__.py,sha256=dg17ZTUsiqTcbIaEE4fqew9uRbao0diQ21PXvRKIqKg,679
11
11
  optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py,sha256=rCbC32bJnfXtsLdVvNVVHpRAkCYy6jeCSwIZ-JSReWk,9220
12
12
  optimum/rbln/diffusers/models/autoencoders/vae.py,sha256=gB9HR7Bf7wpIXLv-Js4Pc3oyWRlqEe4cms4sI2AJicY,4380
13
- optimum/rbln/diffusers/models/autoencoders/vq_model.py,sha256=GunIau02_-lodYZBzd0ktJSNRT5axEFIZxSAfj2Mlyo,5974
13
+ optimum/rbln/diffusers/models/autoencoders/vq_model.py,sha256=b36QqPbayjApKivceQVVyQxHyR1ZOZ1ffuGgdALEPTQ,6117
14
14
  optimum/rbln/diffusers/models/transformers/__init__.py,sha256=V8rSR7WzHs-i8Cwb_MNxhY2NFbwPgxu24vGtkwl-6tk,706
15
15
  optimum/rbln/diffusers/models/transformers/prior_transformer.py,sha256=VG9cQo-_eppDvQSW1q1euAGBt1socUHetN_fIN2u1iU,6169
16
16
  optimum/rbln/diffusers/models/transformers/transformer_sd3.py,sha256=n_krmMgiRxWrG--567PNpk58EG_X7x7H4gidIkRvwjo,7308
17
17
  optimum/rbln/diffusers/models/unets/__init__.py,sha256=MaICuK9CWjgzejXy8y2NDrphuEq1rkzanF8u45k6O5I,655
18
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py,sha256=xHnBzFrm7aNaolxrsotbjo9GkbNiNdTleXQoeqGLlhg,15540
19
- optimum/rbln/diffusers/pipelines/__init__.py,sha256=DAsM4eNks3hEY-bsUKSxRKmgwUWDGDlw82gfplSOdO8,2800
18
+ optimum/rbln/diffusers/models/unets/unet_2d_condition.py,sha256=QIjVWQQf8KBn5rU7lvipdm3gNBxZl7l6HCAj7p5FjLU,15977
19
+ optimum/rbln/diffusers/pipelines/__init__.py,sha256=5KLZ5LrpMzBya2e_3_PvEoPwG24U8JMexfw_ygZREKc,3140
20
20
  optimum/rbln/diffusers/pipelines/controlnet/__init__.py,sha256=n1Ef22TSeax-kENi_d8K6wGGHSNEo9QkUeygELHgcao,983
21
21
  optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py,sha256=JWKtnZYBIfgmbAo0SLFIvHBQCv2BPSFNvpcdjG4GUOY,4113
22
22
  optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py,sha256=dGdw5cwJLS4CLv6IHskk5ZCcPgS7UDuHKbfOZ8ojNUs,35187
23
23
  optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py,sha256=7xCiXrH4ToCTHohVGFXqO7_f9G8HShYaHgZxoMZARkQ,33664
24
24
  optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py,sha256=Gzt2wg4dgFg0TV3Bu0cs8Xru3wVrxWUxxgciwZ-QKLE,44755
25
25
  optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py,sha256=RfwxNX_zQWFtvvFQJ5bt3qtHbdYdQV_3XLHm9WYCKOs,46084
26
- optimum/rbln/diffusers/pipelines/kandinsky2_2/__init__.py,sha256=YFqA76_XiMNxPwqotbHug2kd7jCbOXOu5NlxG2hbaVs,808
27
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py,sha256=9szfe1NvOr1mgDnSPZvBGq1b65RElUrqLVhuErY3Dmw,2962
28
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py,sha256=WxBbHAZSAKDSWhFerPvUlIhhWEsejW4NmhwmWX-_b54,856
26
+ optimum/rbln/diffusers/pipelines/kandinsky2_2/__init__.py,sha256=I4YQq2HfA3xONbWsdJ870IEJPyLWeCDDG-UCJsu9YO8,1035
27
+ optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py,sha256=aNFGOjth8tDvPrjYLbRWrkHr6p-8AFgcQx1Qay1fw70,904
28
+ optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py,sha256=unqFDviA7dnx0yuo8L8tXVj2mjFYCPm7C9dcpdWBICc,6882
29
+ optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py,sha256=fEs-WgJqWs5zvuCkKb7MuZokH9Mi6q-0DOEKxzfWxzo,932
30
+ optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py,sha256=Ad2ZYCXaMiYpB0mz-8X1CGhILxrVbt7rRIXt6IPwYBM,932
29
31
  optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py,sha256=Mf7tzrXetwCgt7LuXfkX-CX1hltLgNZdwF9bHxAbDJM,874
30
32
  optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py,sha256=gz6CbP4T6w8XH3PIGRIJXTmKFsChJIkwcAEAsiR5Ydg,830
31
33
  optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py,sha256=DgRLzO9HxtgE1jICmHoHaqeVXM4Ih-5uo2JqNMAPMcc,876
@@ -60,7 +62,7 @@ optimum/rbln/transformers/models/clip/__init__.py,sha256=H9vuBwrmFO0-CqZhXUrKF-u
60
62
  optimum/rbln/transformers/models/clip/modeling_clip.py,sha256=NiSm7bHs4SReHDUr53BBWSX0Y8bkKOeUSpsBDrp8YDw,6628
61
63
  optimum/rbln/transformers/models/decoderonly/__init__.py,sha256=pDogsdpJKKB5rqnVFrRjwfhUvOSV-jZ3oARMsqSvOOQ,665
62
64
  optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py,sha256=7OIKteJLKNxOLOg0w3lLOM7TxZovQn4jkglI9wRkrtQ,40609
63
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py,sha256=W9HnxJoTz78Wc4X5Q3sMSHhMTSa7-9uQCFlnqNVozvA,38932
65
+ optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py,sha256=uGdPGcFrWm2gAwFLjfBiALwFsl49VGCReVi4NUfOPxM,38898
64
66
  optimum/rbln/transformers/models/dpt/__init__.py,sha256=gP1tkR3XMNlHq1GT87ugIVvb2o_1eAUg1JaniXjy1Lw,651
65
67
  optimum/rbln/transformers/models/dpt/modeling_dpt.py,sha256=ZsS2SOiqcA4azULB-WFEMQZbgIoOyVUKqVKqrw_tWzA,3430
66
68
  optimum/rbln/transformers/models/exaone/__init__.py,sha256=zYH_5tVa8-juEdsOIky7I33WSC3Zuhoq1upI0OHYeVw,859
@@ -85,7 +87,7 @@ optimum/rbln/transformers/models/mistral/mistral_architecture.py,sha256=_aU8TE_t
85
87
  optimum/rbln/transformers/models/mistral/modeling_mistral.py,sha256=7nrddoBIHf8S12LZWBUpotnvG3gND11vMQda9yYXJ-s,1560
86
88
  optimum/rbln/transformers/models/phi/__init__.py,sha256=mZLt1M7BbYEvSon5UlkniMUPa15SfjZFdw0kMSAF3VA,644
87
89
  optimum/rbln/transformers/models/phi/modeling_phi.py,sha256=j-6Pqd5rR2JE8I1pnKFlCi4nW5Dv3wZjoPWxohissoo,1516
88
- optimum/rbln/transformers/models/phi/phi_architecture.py,sha256=rBQjr6MOYBo1i5yLekMSR81TzYlHrHAA30kyKDdR7ww,4132
90
+ optimum/rbln/transformers/models/phi/phi_architecture.py,sha256=TueyqmjPXWmOPOxBm4dIFyd0X3iV1jgw0U6c26iCAPk,4090
89
91
  optimum/rbln/transformers/models/qwen2/__init__.py,sha256=RAMWc21W_2I6DH9xBjeNxPECmAcTrbKhSIefq3Lass0,648
90
92
  optimum/rbln/transformers/models/qwen2/modeling_qwen2.py,sha256=9-aFDvjMzPNUyGOz0qo33RE18bUFGYZ3Wt_68zb5uJY,1530
91
93
  optimum/rbln/transformers/models/qwen2/qwen2_architecture.py,sha256=XlNAMYAcDLohnSAhIFGKOPuCB5XLgzYs5ABWdeQSaZs,720
@@ -114,7 +116,7 @@ optimum/rbln/utils/model_utils.py,sha256=DfD_Z2qvZHqcddXqnzTM1AN8khanj3-DXK2lJvV
114
116
  optimum/rbln/utils/runtime_utils.py,sha256=5-DYniyP59nx-mrrbi7AqA77L85b4Cm5oLpaxidSyss,3699
115
117
  optimum/rbln/utils/save_utils.py,sha256=hG5uOtYmecSXZuGTvCXsTM-SiyZpr5q3InUGCCq_jzQ,3619
116
118
  optimum/rbln/utils/submodule.py,sha256=oZoGrItB8WqY4i-K9WJPlLlcLohc1YGB9OHB8_XZw3A,4071
117
- optimum_rbln-0.7.3a4.dist-info/METADATA,sha256=8VNTOVgsgFtcFUuZ9VEeRQfC2LEB60OFmW92hlJo8V8,5300
118
- optimum_rbln-0.7.3a4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
119
- optimum_rbln-0.7.3a4.dist-info/licenses/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
120
- optimum_rbln-0.7.3a4.dist-info/RECORD,,
119
+ optimum_rbln-0.7.3a6.dist-info/METADATA,sha256=TGw8TCIfBQ9RWlzxf5JI16Zoy-xoEodnBO8m6SKXBsk,5300
120
+ optimum_rbln-0.7.3a6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
121
+ optimum_rbln-0.7.3a6.dist-info/licenses/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
122
+ optimum_rbln-0.7.3a6.dist-info/RECORD,,