optimum-rbln 0.7.4a1__py3-none-any.whl → 0.7.4a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.7.4a1'
20
+ __version__ = version = '0.7.4a2'
21
21
  __version_tuple__ = version_tuple = (0, 7, 4)
@@ -578,11 +578,41 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
578
578
  nbits_per_param: int,
579
579
  n_model_params: int,
580
580
  ) -> int:
581
+ """
582
+ We are finding the largest x (max_n_blocks) that satisfies the following inequality:
583
+
584
+ available_dram - kernel_size - buffer
585
+ - num_layers * 2 * tensor_parallel_size
586
+ * align_2MB(
587
+ x
588
+ * block_size
589
+ * align_64(head_dim)
590
+ * math.ceil(num_key_value_heads / tensor_parallel_size)
591
+ * 2
592
+ ) > 0
593
+
594
+ This inequality can be rewritten as follows:
595
+
596
+ a - c * align_2MB(b * x) > 0
597
+ where
598
+ a = available_dram - kernel_size - buffer
599
+ b = block_size * align_64(head_dim) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
600
+ c = num_layers * 2 * tensor_parallel_size
601
+
602
+ We can rewrite the inequality as follows:
603
+ k > align_2MB(b*x)
604
+ where
605
+ k = a / c
606
+
607
+ After that, we can derive the following equation:
608
+ x = floor(2**21 / b * floor((k - 1) / 2**21))
609
+ """
610
+
581
611
  def align(x: int, nbytes: int) -> int:
582
612
  return int(math.ceil(x / nbytes) * nbytes)
583
613
 
584
614
  def align_2MB(x: int) -> int:
585
- return align(x, 2 * 1024 * 1024)
615
+ return align(x, 2**21)
586
616
 
587
617
  num_attention_heads = getattr(config, "n_head", None) or getattr(config, "num_attention_heads")
588
618
  num_layers = getattr(config, "n_layer", None) or getattr(config, "num_hidden_layers")
@@ -612,27 +642,16 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
612
642
  available_dram -= kernel_size
613
643
 
614
644
  # TODO: Accurate buffer estimation
615
- buffer = 2**30 # 1GB Buffer
616
- if tensor_parallel_size <= 4:
617
- buffer /= 4
618
-
645
+ buffer_per_core = 2**29 # 500MB per npu
646
+ buffer = buffer_per_core * tensor_parallel_size
619
647
  available_dram -= buffer
620
648
 
621
- # Estimate nbytes per a single kvcache block
622
- nbytes_per_block = (
623
- align_2MB(
624
- kvcache_block_size
625
- * head_dim
626
- * math.ceil(num_key_value_heads / tensor_parallel_size) # Shard
627
- * 2 # (fp16)
628
- )
629
- * num_layers
630
- * 2 # (k, v)
631
- * tensor_parallel_size
632
- )
633
- n_blocks = available_dram // nbytes_per_block
649
+ b = kvcache_block_size * align(head_dim, 64) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
650
+ c = num_layers * 2 * tensor_parallel_size
651
+ k = available_dram / c
652
+ max_n_blocks = math.floor(2**21 / b * math.floor((k - 1) / 2**21))
634
653
 
635
- return n_blocks, nbytes_per_block
654
+ return max_n_blocks
636
655
 
637
656
  @classmethod
638
657
  def _get_rbln_config(
@@ -689,7 +708,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
689
708
 
690
709
  rbln_kvcache_num_blocks = (rbln_max_seq_len // rbln_kvcache_block_size) * rbln_batch_size
691
710
  if rbln_attn_impl == "flash_attn":
692
- max_num_blocks, _ = cls.get_maximum_num_blocks(
711
+ max_num_blocks = cls.get_maximum_num_blocks(
693
712
  config=model_config,
694
713
  tensor_parallel_size=rbln_kwargs.get("tensor_parallel_size", 1),
695
714
  kvcache_block_size=rbln_kvcache_block_size,
@@ -36,19 +36,50 @@ from .t5_architecture import T5Wrapper
36
36
  logger = get_logger()
37
37
 
38
38
  if TYPE_CHECKING:
39
+ from rebel import Runtime
39
40
  from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PreTrainedModel
40
41
 
41
42
 
42
43
  class RBLNRuntimeModel(RBLNPytorchRuntime):
44
+ def __init__(
45
+ self,
46
+ runtime: "Runtime",
47
+ max_seq_len: int,
48
+ **kwargs: Any,
49
+ ) -> None:
50
+ super().__init__(runtime, **kwargs)
51
+ self.max_seq_len = max_seq_len
52
+
53
+ def _prepare_inputs(
54
+ self,
55
+ input_ids: torch.LongTensor,
56
+ attention_mask: torch.LongTensor,
57
+ ):
58
+ input_len = input_ids.shape[-1]
59
+ pad_len = None
60
+ if input_len > self.max_seq_len:
61
+ raise ValueError(f"Error input_len({input_len}) exceed max_seq_len({self.max_seq_len}).")
62
+ elif input_len < self.max_seq_len and input_len > 0:
63
+ pad_len = self.max_seq_len - input_len
64
+ logger.warning(
65
+ f"Warning: The input was padded with {pad_len} tokens to meet the compiled model's requirements. "
66
+ "For optimal performance, consider recompiling with a shorter 'rbln_max_seq_len'."
67
+ )
68
+ input_ids = torch.nn.functional.pad(input_ids, (0, pad_len))
69
+ attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_len), value=0)
70
+
71
+ return input_ids, attention_mask, pad_len
72
+
43
73
  def forward(
44
74
  self,
45
75
  input_ids: torch.LongTensor,
46
- attention_mask: torch.FloatTensor,
76
+ attention_mask: torch.LongTensor,
47
77
  head_mask: torch.FloatTensor,
48
78
  inputs_embeds: torch.FloatTensor,
49
79
  **kwargs,
50
80
  ):
51
- return super().forward(
81
+ input_ids, attention_mask, pad_len = self._prepare_inputs(input_ids, attention_mask)
82
+ logits = super().forward(
52
83
  input_ids,
53
84
  attention_mask,
54
85
  head_mask,
@@ -56,6 +87,8 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
56
87
  **kwargs,
57
88
  )
58
89
 
90
+ return logits[:, :-pad_len, :] if pad_len is not None else logits
91
+
59
92
 
60
93
  class T5EncoderWrapper(torch.nn.Module):
61
94
  def __init__(self, model: "T5EncoderModel") -> None:
@@ -72,7 +105,8 @@ class RBLNT5EncoderModel(RBLNModel):
72
105
  rbln_model_input_names = ["input_ids", "attention_mask"]
73
106
 
74
107
  def __post_init__(self, **kwargs):
75
- self.model = RBLNRuntimeModel(runtime=self.model[0])
108
+ max_seq_len = self.rbln_config.model_cfg["max_seq_len"]
109
+ self.model = RBLNRuntimeModel(runtime=self.model[0], max_seq_len=max_seq_len)
76
110
 
77
111
  @classmethod
78
112
  def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
@@ -190,11 +190,11 @@ class WhisperDecoder(nn.Module):
190
190
  all_hiddens = []
191
191
  for i in range(inputs_embeds.shape[0]):
192
192
  position_id = cache_position[i]
193
- position = self.embed_positions(input_ids, position_ids=position_id)
193
+ position = self.embed_positions.weight[position_id]
194
194
  batch_hidden = position + inputs_embeds[i]
195
195
  all_hiddens.append(batch_hidden)
196
196
 
197
- hidden_states = torch.stack(all_hiddens, dim=0)
197
+ hidden_states = torch.cat(all_hiddens, dim=0).unsqueeze(1)
198
198
 
199
199
  # prepare attn mask (normal attention - masked)
200
200
  if attention_mask is not None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: optimum-rbln
3
- Version: 0.7.4a1
3
+ Version: 0.7.4a2
4
4
  Summary: Optimum RBLN is the interface between the Hugging Face Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
5
5
  Project-URL: Homepage, https://rebellions.ai
6
6
  Project-URL: Documentation, https://docs.rbln.ai
@@ -1,5 +1,5 @@
1
1
  optimum/rbln/__init__.py,sha256=qW45z47BiNLTDtRFEhVEzr4THNFX0ygqCbdNKqI0biI,6992
2
- optimum/rbln/__version__.py,sha256=KifVR95YmJmHh5f74wGiEAzd-c6ElHQ3XFHbY8VRp14,513
2
+ optimum/rbln/__version__.py,sha256=7nOn__ePBcQxiENj-KnOIjfCYYWSP4QlkMW59HLjtRs,513
3
3
  optimum/rbln/modeling.py,sha256=nJsAs5zs--VVOYGFjYNpqfxYIemJIK4Lr0WEzlDLdP0,8390
4
4
  optimum/rbln/modeling_base.py,sha256=dNCL-BhrWCpuOVkZaj8-MW567Tf4lLo3p3Z3ldjWJfU,21779
5
5
  optimum/rbln/modeling_config.py,sha256=7104bxmrvKW4Q6XTruQayiIGl8GHDFmPkJ3cknMIInE,11335
@@ -63,7 +63,7 @@ optimum/rbln/transformers/models/clip/__init__.py,sha256=H9vuBwrmFO0-CqZhXUrKF-u
63
63
  optimum/rbln/transformers/models/clip/modeling_clip.py,sha256=NiSm7bHs4SReHDUr53BBWSX0Y8bkKOeUSpsBDrp8YDw,6628
64
64
  optimum/rbln/transformers/models/decoderonly/__init__.py,sha256=pDogsdpJKKB5rqnVFrRjwfhUvOSV-jZ3oARMsqSvOOQ,665
65
65
  optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py,sha256=m93-qKN7NMw3i0XDmFmttmRIRK4np_fWtLFlBb2RFgU,41351
66
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py,sha256=qeZWdfLU0gCssxBODJsjQWMjfQWxK9vgC2Xt9eA5j4I,39147
66
+ optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py,sha256=qNnWSe3p2LDwNJ6utrilsqid-rQ8YLloqYkSOZamvhs,39918
67
67
  optimum/rbln/transformers/models/dpt/__init__.py,sha256=gP1tkR3XMNlHq1GT87ugIVvb2o_1eAUg1JaniXjy1Lw,651
68
68
  optimum/rbln/transformers/models/dpt/modeling_dpt.py,sha256=ZsS2SOiqcA4azULB-WFEMQZbgIoOyVUKqVKqrw_tWzA,3430
69
69
  optimum/rbln/transformers/models/exaone/__init__.py,sha256=zYH_5tVa8-juEdsOIky7I33WSC3Zuhoq1upI0OHYeVw,859
@@ -96,7 +96,7 @@ optimum/rbln/transformers/models/seq2seq/__init__.py,sha256=EmEMV4rOYqKyruX85d0f
96
96
  optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py,sha256=XcZb57v42wju1qOJ1AKqmtJXcmz6MEWaJZ8jyzaEiTw,17701
97
97
  optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py,sha256=tvzacIZam1sIr_1BvvZ_fDr8u5dXAiYiynFdX9tArtY,18877
98
98
  optimum/rbln/transformers/models/t5/__init__.py,sha256=1skR1RmnG62WTAP3-F5P1x-V_ReFhMyirH3u56vWwvc,675
99
- optimum/rbln/transformers/models/t5/modeling_t5.py,sha256=-fG-h0wwsfjZ3par0QHbXKA7hbvw_lPJOIf8iXQDOfM,8082
99
+ optimum/rbln/transformers/models/t5/modeling_t5.py,sha256=Gyq5aAfkl4hBbLiR0114nDxLBs5P6YTw7hCnyuDyRrM,9494
100
100
  optimum/rbln/transformers/models/t5/t5_architecture.py,sha256=Ups6drBbYe4wEAiBLcBIyO9wqrIQbvOPFR_ybbAgR8c,9722
101
101
  optimum/rbln/transformers/models/time_series_transformers/__init__.py,sha256=RL4SO8tKEd4wQrzyU4Nv4-hhITKPhblUsBd3anXNkA8,1079
102
102
  optimum/rbln/transformers/models/time_series_transformers/modeling_time_series_transformers.py,sha256=1Ippt0Rmt2TxJ5X4-4tlALQOkKmOfMaTrbOLWIUIKWw,16614
@@ -106,7 +106,7 @@ optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py,sha256=JYJmV52j6c
106
106
  optimum/rbln/transformers/models/whisper/__init__.py,sha256=ktnNe5ri3ycCWZ_W_voFB9y9-vgGgxS1X9s8LBRZmWc,665
107
107
  optimum/rbln/transformers/models/whisper/generation_whisper.py,sha256=GIHTca3b1VtW81kp7BzKQ7f77c2t9OsEsbZetripgDo,4582
108
108
  optimum/rbln/transformers/models/whisper/modeling_whisper.py,sha256=GegyAi3a8fF0psdYsffTQ1pC4KAUqE7WYLj4ZqObWXI,18184
109
- optimum/rbln/transformers/models/whisper/whisper_architecture.py,sha256=k_aDk2B58IxQimf6yW36Wgc0uw5PqB85Or8ie_6ZZ70,14205
109
+ optimum/rbln/transformers/models/whisper/whisper_architecture.py,sha256=DS9AQYhNkaR7sUz_loee-fFtCCYy1BUsx7_dX_o1Le8,14199
110
110
  optimum/rbln/transformers/models/xlm_roberta/__init__.py,sha256=fC7iNcdxBZ_6eOF2snStmf8r2M3c8O_-XcXnQEaHQCE,653
111
111
  optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py,sha256=8YNLz0bc5ze-QuU8rN-QhUfGzlSUs3iMJiWTxO3o6AM,4366
112
112
  optimum/rbln/transformers/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -120,7 +120,7 @@ optimum/rbln/utils/model_utils.py,sha256=DfD_Z2qvZHqcddXqnzTM1AN8khanj3-DXK2lJvV
120
120
  optimum/rbln/utils/runtime_utils.py,sha256=5-DYniyP59nx-mrrbi7AqA77L85b4Cm5oLpaxidSyss,3699
121
121
  optimum/rbln/utils/save_utils.py,sha256=hG5uOtYmecSXZuGTvCXsTM-SiyZpr5q3InUGCCq_jzQ,3619
122
122
  optimum/rbln/utils/submodule.py,sha256=oZoGrItB8WqY4i-K9WJPlLlcLohc1YGB9OHB8_XZw3A,4071
123
- optimum_rbln-0.7.4a1.dist-info/METADATA,sha256=dMl4yloIz6iqjC2SN8CE1rVP9Kftw50Z01zocntnguE,5300
124
- optimum_rbln-0.7.4a1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
125
- optimum_rbln-0.7.4a1.dist-info/licenses/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
126
- optimum_rbln-0.7.4a1.dist-info/RECORD,,
123
+ optimum_rbln-0.7.4a2.dist-info/METADATA,sha256=Pl6SOVN73gxS7Po-R5hMdTI_2izOrjWqyb9FhaDnr-A,5300
124
+ optimum_rbln-0.7.4a2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
125
+ optimum_rbln-0.7.4a2.dist-info/licenses/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
126
+ optimum_rbln-0.7.4a2.dist-info/RECORD,,