optimum-rbln 0.7.4a1__py3-none-any.whl → 0.7.4a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__version__.py +1 -1
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +39 -20
- optimum/rbln/transformers/models/t5/modeling_t5.py +37 -3
- optimum/rbln/transformers/models/whisper/whisper_architecture.py +2 -2
- {optimum_rbln-0.7.4a1.dist-info → optimum_rbln-0.7.4a2.dist-info}/METADATA +1 -1
- {optimum_rbln-0.7.4a1.dist-info → optimum_rbln-0.7.4a2.dist-info}/RECORD +8 -8
- {optimum_rbln-0.7.4a1.dist-info → optimum_rbln-0.7.4a2.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.7.4a1.dist-info → optimum_rbln-0.7.4a2.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/__version__.py
CHANGED
@@ -578,11 +578,41 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
|
|
578
578
|
nbits_per_param: int,
|
579
579
|
n_model_params: int,
|
580
580
|
) -> int:
|
581
|
+
"""
|
582
|
+
We are finding max_n_blocks(x) that satisfies the following equation:
|
583
|
+
|
584
|
+
available_dram - kernel_size - buffer
|
585
|
+
- num_layers * 2 * tensor_parallel_size
|
586
|
+
* align_2MB(
|
587
|
+
x
|
588
|
+
* block_size
|
589
|
+
* align_64(head_dim)
|
590
|
+
* math.ceil(num_key_value_heads / tensor_parallel_size)
|
591
|
+
* 2
|
592
|
+
) > 0
|
593
|
+
|
594
|
+
This inequality can be rewritten as follows:
|
595
|
+
|
596
|
+
a - c * align_2MB(b * x) > 0
|
597
|
+
where
|
598
|
+
a = available_dram - kernel_size - buffer
|
599
|
+
b = block_size * align_64(head_dim) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
|
600
|
+
c = num_layers * 2 * tensor_parallel_size
|
601
|
+
|
602
|
+
We can rewrite the inequality as follows:
|
603
|
+
k > align_2MB(b*x)
|
604
|
+
where
|
605
|
+
k = a / c
|
606
|
+
|
607
|
+
After that, we can derive the following equation:
|
608
|
+
x = floor(2**21 / b * floor((k - 1) / 2**21))
|
609
|
+
"""
|
610
|
+
|
581
611
|
def align(x: int, nbytes: int) -> int:
|
582
612
|
return int(math.ceil(x / nbytes) * nbytes)
|
583
613
|
|
584
614
|
def align_2MB(x: int) -> int:
|
585
|
-
return align(x, 2
|
615
|
+
return align(x, 2**21)
|
586
616
|
|
587
617
|
num_attention_heads = getattr(config, "n_head", None) or getattr(config, "num_attention_heads")
|
588
618
|
num_layers = getattr(config, "n_layer", None) or getattr(config, "num_hidden_layers")
|
@@ -612,27 +642,16 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
|
|
612
642
|
available_dram -= kernel_size
|
613
643
|
|
614
644
|
# TODO: Accurate buffer estimation
|
615
|
-
|
616
|
-
|
617
|
-
buffer /= 4
|
618
|
-
|
645
|
+
buffer_per_core = 2**29 # 500MB per npu
|
646
|
+
buffer = buffer_per_core * tensor_parallel_size
|
619
647
|
available_dram -= buffer
|
620
648
|
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
* head_dim
|
626
|
-
* math.ceil(num_key_value_heads / tensor_parallel_size) # Shard
|
627
|
-
* 2 # (fp16)
|
628
|
-
)
|
629
|
-
* num_layers
|
630
|
-
* 2 # (k, v)
|
631
|
-
* tensor_parallel_size
|
632
|
-
)
|
633
|
-
n_blocks = available_dram // nbytes_per_block
|
649
|
+
b = kvcache_block_size * align(head_dim, 64) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
|
650
|
+
c = num_layers * 2 * tensor_parallel_size
|
651
|
+
k = available_dram / c
|
652
|
+
max_n_blocks = math.floor(2**21 / b * math.floor((k - 1) / 2**21))
|
634
653
|
|
635
|
-
return
|
654
|
+
return max_n_blocks
|
636
655
|
|
637
656
|
@classmethod
|
638
657
|
def _get_rbln_config(
|
@@ -689,7 +708,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
|
|
689
708
|
|
690
709
|
rbln_kvcache_num_blocks = (rbln_max_seq_len // rbln_kvcache_block_size) * rbln_batch_size
|
691
710
|
if rbln_attn_impl == "flash_attn":
|
692
|
-
max_num_blocks
|
711
|
+
max_num_blocks = cls.get_maximum_num_blocks(
|
693
712
|
config=model_config,
|
694
713
|
tensor_parallel_size=rbln_kwargs.get("tensor_parallel_size", 1),
|
695
714
|
kvcache_block_size=rbln_kvcache_block_size,
|
@@ -36,19 +36,50 @@ from .t5_architecture import T5Wrapper
|
|
36
36
|
logger = get_logger()
|
37
37
|
|
38
38
|
if TYPE_CHECKING:
|
39
|
+
from rebel import Runtime
|
39
40
|
from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PreTrainedModel
|
40
41
|
|
41
42
|
|
42
43
|
class RBLNRuntimeModel(RBLNPytorchRuntime):
|
44
|
+
def __init__(
|
45
|
+
self,
|
46
|
+
runtime: "Runtime",
|
47
|
+
max_seq_len: int,
|
48
|
+
**kwargs: Any,
|
49
|
+
) -> None:
|
50
|
+
super().__init__(runtime, **kwargs)
|
51
|
+
self.max_seq_len = max_seq_len
|
52
|
+
|
53
|
+
def _prepare_inputs(
|
54
|
+
self,
|
55
|
+
input_ids: torch.LongTensor,
|
56
|
+
attention_mask: torch.LongTensor,
|
57
|
+
):
|
58
|
+
input_len = input_ids.shape[-1]
|
59
|
+
pad_len = None
|
60
|
+
if input_len > self.max_seq_len:
|
61
|
+
raise ValueError(f"Error input_len({input_len}) exceed max_seq_len({self.max_seq_len}).")
|
62
|
+
elif input_len < self.max_seq_len and input_len > 0:
|
63
|
+
pad_len = self.max_seq_len - input_len
|
64
|
+
logger.warning(
|
65
|
+
f"Warning: The input was padded with {pad_len} tokens to meet the compiled model's requirements. "
|
66
|
+
"For optimal performance, consider recompiling with a shorter 'rbln_max_seq_len'."
|
67
|
+
)
|
68
|
+
input_ids = torch.nn.functional.pad(input_ids, (0, pad_len))
|
69
|
+
attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_len), value=0)
|
70
|
+
|
71
|
+
return input_ids, attention_mask, pad_len
|
72
|
+
|
43
73
|
def forward(
|
44
74
|
self,
|
45
75
|
input_ids: torch.LongTensor,
|
46
|
-
attention_mask: torch.
|
76
|
+
attention_mask: torch.LongTensor,
|
47
77
|
head_mask: torch.FloatTensor,
|
48
78
|
inputs_embeds: torch.FloatTensor,
|
49
79
|
**kwargs,
|
50
80
|
):
|
51
|
-
|
81
|
+
input_ids, attention_mask, pad_len = self._prepare_inputs(input_ids, attention_mask)
|
82
|
+
logits = super().forward(
|
52
83
|
input_ids,
|
53
84
|
attention_mask,
|
54
85
|
head_mask,
|
@@ -56,6 +87,8 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
|
|
56
87
|
**kwargs,
|
57
88
|
)
|
58
89
|
|
90
|
+
return logits[:, :-pad_len, :] if pad_len is not None else logits
|
91
|
+
|
59
92
|
|
60
93
|
class T5EncoderWrapper(torch.nn.Module):
|
61
94
|
def __init__(self, model: "T5EncoderModel") -> None:
|
@@ -72,7 +105,8 @@ class RBLNT5EncoderModel(RBLNModel):
|
|
72
105
|
rbln_model_input_names = ["input_ids", "attention_mask"]
|
73
106
|
|
74
107
|
def __post_init__(self, **kwargs):
|
75
|
-
|
108
|
+
max_seq_len = self.rbln_config.model_cfg["max_seq_len"]
|
109
|
+
self.model = RBLNRuntimeModel(runtime=self.model[0], max_seq_len=max_seq_len)
|
76
110
|
|
77
111
|
@classmethod
|
78
112
|
def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
|
@@ -190,11 +190,11 @@ class WhisperDecoder(nn.Module):
|
|
190
190
|
all_hiddens = []
|
191
191
|
for i in range(inputs_embeds.shape[0]):
|
192
192
|
position_id = cache_position[i]
|
193
|
-
position = self.embed_positions
|
193
|
+
position = self.embed_positions.weight[position_id]
|
194
194
|
batch_hidden = position + inputs_embeds[i]
|
195
195
|
all_hiddens.append(batch_hidden)
|
196
196
|
|
197
|
-
hidden_states = torch.
|
197
|
+
hidden_states = torch.cat(all_hiddens, dim=0).unsqueeze(1)
|
198
198
|
|
199
199
|
# prepare attn mask (normal attention - masked)
|
200
200
|
if attention_mask is not None:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: optimum-rbln
|
3
|
-
Version: 0.7.
|
3
|
+
Version: 0.7.4a2
|
4
4
|
Summary: Optimum RBLN is the interface between the Hugging Face Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
|
5
5
|
Project-URL: Homepage, https://rebellions.ai
|
6
6
|
Project-URL: Documentation, https://docs.rbln.ai
|
@@ -1,5 +1,5 @@
|
|
1
1
|
optimum/rbln/__init__.py,sha256=qW45z47BiNLTDtRFEhVEzr4THNFX0ygqCbdNKqI0biI,6992
|
2
|
-
optimum/rbln/__version__.py,sha256=
|
2
|
+
optimum/rbln/__version__.py,sha256=7nOn__ePBcQxiENj-KnOIjfCYYWSP4QlkMW59HLjtRs,513
|
3
3
|
optimum/rbln/modeling.py,sha256=nJsAs5zs--VVOYGFjYNpqfxYIemJIK4Lr0WEzlDLdP0,8390
|
4
4
|
optimum/rbln/modeling_base.py,sha256=dNCL-BhrWCpuOVkZaj8-MW567Tf4lLo3p3Z3ldjWJfU,21779
|
5
5
|
optimum/rbln/modeling_config.py,sha256=7104bxmrvKW4Q6XTruQayiIGl8GHDFmPkJ3cknMIInE,11335
|
@@ -63,7 +63,7 @@ optimum/rbln/transformers/models/clip/__init__.py,sha256=H9vuBwrmFO0-CqZhXUrKF-u
|
|
63
63
|
optimum/rbln/transformers/models/clip/modeling_clip.py,sha256=NiSm7bHs4SReHDUr53BBWSX0Y8bkKOeUSpsBDrp8YDw,6628
|
64
64
|
optimum/rbln/transformers/models/decoderonly/__init__.py,sha256=pDogsdpJKKB5rqnVFrRjwfhUvOSV-jZ3oARMsqSvOOQ,665
|
65
65
|
optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py,sha256=m93-qKN7NMw3i0XDmFmttmRIRK4np_fWtLFlBb2RFgU,41351
|
66
|
-
optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py,sha256=
|
66
|
+
optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py,sha256=qNnWSe3p2LDwNJ6utrilsqid-rQ8YLloqYkSOZamvhs,39918
|
67
67
|
optimum/rbln/transformers/models/dpt/__init__.py,sha256=gP1tkR3XMNlHq1GT87ugIVvb2o_1eAUg1JaniXjy1Lw,651
|
68
68
|
optimum/rbln/transformers/models/dpt/modeling_dpt.py,sha256=ZsS2SOiqcA4azULB-WFEMQZbgIoOyVUKqVKqrw_tWzA,3430
|
69
69
|
optimum/rbln/transformers/models/exaone/__init__.py,sha256=zYH_5tVa8-juEdsOIky7I33WSC3Zuhoq1upI0OHYeVw,859
|
@@ -96,7 +96,7 @@ optimum/rbln/transformers/models/seq2seq/__init__.py,sha256=EmEMV4rOYqKyruX85d0f
|
|
96
96
|
optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py,sha256=XcZb57v42wju1qOJ1AKqmtJXcmz6MEWaJZ8jyzaEiTw,17701
|
97
97
|
optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py,sha256=tvzacIZam1sIr_1BvvZ_fDr8u5dXAiYiynFdX9tArtY,18877
|
98
98
|
optimum/rbln/transformers/models/t5/__init__.py,sha256=1skR1RmnG62WTAP3-F5P1x-V_ReFhMyirH3u56vWwvc,675
|
99
|
-
optimum/rbln/transformers/models/t5/modeling_t5.py,sha256
|
99
|
+
optimum/rbln/transformers/models/t5/modeling_t5.py,sha256=Gyq5aAfkl4hBbLiR0114nDxLBs5P6YTw7hCnyuDyRrM,9494
|
100
100
|
optimum/rbln/transformers/models/t5/t5_architecture.py,sha256=Ups6drBbYe4wEAiBLcBIyO9wqrIQbvOPFR_ybbAgR8c,9722
|
101
101
|
optimum/rbln/transformers/models/time_series_transformers/__init__.py,sha256=RL4SO8tKEd4wQrzyU4Nv4-hhITKPhblUsBd3anXNkA8,1079
|
102
102
|
optimum/rbln/transformers/models/time_series_transformers/modeling_time_series_transformers.py,sha256=1Ippt0Rmt2TxJ5X4-4tlALQOkKmOfMaTrbOLWIUIKWw,16614
|
@@ -106,7 +106,7 @@ optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py,sha256=JYJmV52j6c
|
|
106
106
|
optimum/rbln/transformers/models/whisper/__init__.py,sha256=ktnNe5ri3ycCWZ_W_voFB9y9-vgGgxS1X9s8LBRZmWc,665
|
107
107
|
optimum/rbln/transformers/models/whisper/generation_whisper.py,sha256=GIHTca3b1VtW81kp7BzKQ7f77c2t9OsEsbZetripgDo,4582
|
108
108
|
optimum/rbln/transformers/models/whisper/modeling_whisper.py,sha256=GegyAi3a8fF0psdYsffTQ1pC4KAUqE7WYLj4ZqObWXI,18184
|
109
|
-
optimum/rbln/transformers/models/whisper/whisper_architecture.py,sha256=
|
109
|
+
optimum/rbln/transformers/models/whisper/whisper_architecture.py,sha256=DS9AQYhNkaR7sUz_loee-fFtCCYy1BUsx7_dX_o1Le8,14199
|
110
110
|
optimum/rbln/transformers/models/xlm_roberta/__init__.py,sha256=fC7iNcdxBZ_6eOF2snStmf8r2M3c8O_-XcXnQEaHQCE,653
|
111
111
|
optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py,sha256=8YNLz0bc5ze-QuU8rN-QhUfGzlSUs3iMJiWTxO3o6AM,4366
|
112
112
|
optimum/rbln/transformers/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -120,7 +120,7 @@ optimum/rbln/utils/model_utils.py,sha256=DfD_Z2qvZHqcddXqnzTM1AN8khanj3-DXK2lJvV
|
|
120
120
|
optimum/rbln/utils/runtime_utils.py,sha256=5-DYniyP59nx-mrrbi7AqA77L85b4Cm5oLpaxidSyss,3699
|
121
121
|
optimum/rbln/utils/save_utils.py,sha256=hG5uOtYmecSXZuGTvCXsTM-SiyZpr5q3InUGCCq_jzQ,3619
|
122
122
|
optimum/rbln/utils/submodule.py,sha256=oZoGrItB8WqY4i-K9WJPlLlcLohc1YGB9OHB8_XZw3A,4071
|
123
|
-
optimum_rbln-0.7.
|
124
|
-
optimum_rbln-0.7.
|
125
|
-
optimum_rbln-0.7.
|
126
|
-
optimum_rbln-0.7.
|
123
|
+
optimum_rbln-0.7.4a2.dist-info/METADATA,sha256=Pl6SOVN73gxS7Po-R5hMdTI_2izOrjWqyb9FhaDnr-A,5300
|
124
|
+
optimum_rbln-0.7.4a2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
125
|
+
optimum_rbln-0.7.4a2.dist-info/licenses/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
|
126
|
+
optimum_rbln-0.7.4a2.dist-info/RECORD,,
|
File without changes
|
File without changes
|