optimum-rbln 0.7.4a0__py3-none-any.whl → 0.7.4a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
optimum/rbln/__init__.py CHANGED
@@ -73,6 +73,7 @@ _import_structure = {
         "RBLNRobertaForMaskedLM",
         "RBLNViTForImageClassification",
         "RBLNBertForMaskedLM",
+        "RBLNTimeSeriesTransformerForPrediction",
     ],
     "diffusers": [
         "RBLNAutoencoderKL",
@@ -184,6 +185,7 @@ if TYPE_CHECKING:
         RBLNRobertaForSequenceClassification,
         RBLNT5EncoderModel,
         RBLNT5ForConditionalGeneration,
+        RBLNTimeSeriesTransformerForPrediction,
         RBLNViTForImageClassification,
         RBLNWav2Vec2ForCTC,
         RBLNWhisperForConditionalGeneration,
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.7.4a0'
+__version__ = version = '0.7.4a2'
 __version_tuple__ = version_tuple = (0, 7, 4)
@@ -19,3 +19,4 @@ from .attn import (
 )
 from .flash_attn import register_rbln_custom_paged_flash_attention, register_rbln_custom_paged_flash_causal_attention
 from .kv_cache_update import register_rbln_custom_cache_update
+from .linear import linear
@@ -0,0 +1,25 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+from torch import Tensor
+
+
+@torch.library.custom_op("rbln_custom_ops::linear", mutates_args=())
+def linear(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor:
+    output_shape = list(input.shape[:-1])
+    output_shape += [weight.shape[0]]
+    return torch.empty(size=output_shape, dtype=input.dtype, device=input.device, requires_grad=input.requires_grad)
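Note: the op body above is shape propagation only. It returns an uninitialized tensor with the shape torch.nn.functional.linear would produce, which is all the compiler needs while tracing. A minimal sketch of exercising it (assumes PyTorch >= 2.4, which provides torch.library.custom_op; the import that triggers registration is a guess at the package layout, not confirmed by this diff):

    import torch

    import optimum.rbln.ops  # hypothetical import path; it runs `from .linear import linear`

    x = torch.randn(2, 8, 16)  # (..., in_features)
    w = torch.randn(32, 16)    # (out_features, in_features)

    out = torch.ops.rbln_custom_ops.linear(x, w)
    assert out.shape == (2, 8, 32)  # same shape F.linear(x, w) would return
    # The values are torch.empty garbage by design; only shape/dtype/device are meaningful.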
@@ -52,6 +52,7 @@ _import_structure = {
         "RBLNPhiForCausalLM",
         "RBLNT5EncoderModel",
         "RBLNT5ForConditionalGeneration",
+        "RBLNTimeSeriesTransformerForPrediction",
         "RBLNLlavaNextForConditionalGeneration",
         "RBLNMidmLMHeadModel",
         "RBLNXLMRobertaModel",
@@ -113,6 +114,7 @@ if TYPE_CHECKING:
         RBLNQwen2ForCausalLM,
         RBLNT5EncoderModel,
         RBLNT5ForConditionalGeneration,
+        RBLNTimeSeriesTransformerForPrediction,
         RBLNWav2Vec2ForCTC,
         RBLNWhisperForConditionalGeneration,
         RBLNXLMRobertaModel,
@@ -50,6 +50,7 @@ _import_structure = {
     "mistral": ["RBLNMistralForCausalLM"],
     "phi": ["RBLNPhiForCausalLM"],
     "qwen2": ["RBLNQwen2ForCausalLM"],
+    "time_series_transformers": ["RBLNTimeSeriesTransformerForPrediction"],
     "t5": ["RBLNT5EncoderModel", "RBLNT5ForConditionalGeneration"],
     "wav2vec2": ["RBLNWav2Vec2ForCTC"],
     "whisper": ["RBLNWhisperForConditionalGeneration"],
@@ -90,6 +91,7 @@ if TYPE_CHECKING:
     from .phi import RBLNPhiForCausalLM
     from .qwen2 import RBLNQwen2ForCausalLM
     from .t5 import RBLNT5EncoderModel, RBLNT5ForConditionalGeneration
+    from .time_series_transformers import RBLNTimeSeriesTransformerForPrediction
     from .wav2vec2 import RBLNWav2Vec2ForCTC
     from .whisper import RBLNWhisperForConditionalGeneration
     from .xlm_roberta import RBLNXLMRobertaModel
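Note: these registry edits wire RBLNTimeSeriesTransformerForPrediction into the lazy-import tables so it is importable from the package root. A hypothetical usage sketch; the export=True compile-on-load flag follows the pattern of the other RBLN model classes, and the checkpoint name is only an example, neither is confirmed by this diff:

    from optimum.rbln import RBLNTimeSeriesTransformerForPrediction

    model = RBLNTimeSeriesTransformerForPrediction.from_pretrained(
        "huggingface/time-series-transformer-tourism-monthly",  # example checkpoint
        export=True,  # compile the PyTorch weights for RBLN NPUs on first load
    )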
@@ -94,12 +94,11 @@ class RBLNBartModel(RBLNModel):
             for model_input_name in rbln_model_input_names
         ]
 
-        enc_compile_config = RBLNCompileConfig(input_info=input_info, compiled_model_name="encoder")
-        dec_compile_config = RBLNCompileConfig(input_info=input_info, compiled_model_name="decoder")
+        rbln_compile_config = RBLNCompileConfig(input_info=input_info)
 
         rbln_config = RBLNConfig(
             rbln_cls=cls.__name__,
-            compile_cfgs=[enc_compile_config, dec_compile_config],
+            compile_cfgs=[rbln_compile_config],
             rbln_kwargs=rbln_kwargs,
         )
 
@@ -222,8 +222,6 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
 
         attention_mask = self.dec_attn_mask
 
-        attention_mask = self.dec_attn_mask
-
         logits = super().forward(
             inputs,
             cache_position,
@@ -547,22 +545,27 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
 
         @QuantizationManager.with_quantization_env
         def compile_model(*args, **kwargs):
-            wrapped_model.phase = "prefill"
-            compiled_prefill = RBLNModel.compile(
-                wrapped_model,
-                prefill_compile_config,
-                example_inputs=prefill_example_inputs,
-                compile_context=context,
-            )
+            try:
+                original_linear = torch.nn.functional.linear
+                torch.nn.functional.linear = torch.ops.rbln_custom_ops.linear
+                wrapped_model.phase = "prefill"
+                compiled_prefill = RBLNModel.compile(
+                    wrapped_model,
+                    prefill_compile_config,
+                    example_inputs=prefill_example_inputs,
+                    compile_context=context,
+                )
 
-            wrapped_model.phase = "decode"
-            compiled_decoder = RBLNModel.compile(
-                wrapped_model,
-                dec_compile_config,
-                example_inputs=dec_example_inputs,
-                compile_context=context,
-            )
-            return {"prefill": compiled_prefill, "decoder": compiled_decoder}
+                wrapped_model.phase = "decode"
+                compiled_decoder = RBLNModel.compile(
+                    wrapped_model,
+                    dec_compile_config,
+                    example_inputs=dec_example_inputs,
+                    compile_context=context,
+                )
+                return {"prefill": compiled_prefill, "decoder": compiled_decoder}
+            finally:
+                torch.nn.functional.linear = original_linear
 
         return compile_model(quantize_config=quantize_config)
 
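Note the shape of this change: torch.nn.functional.linear is rebound to the custom op only for the duration of prefill/decode compilation, and the finally block restores it even if compilation raises. The same guard can be expressed as a context manager; a sketch, not part of the package:

    import contextlib

    import torch

    @contextlib.contextmanager
    def rbln_linear_patched():
        # Swap F.linear for the traceable custom op; always restore afterwards.
        original = torch.nn.functional.linear
        torch.nn.functional.linear = torch.ops.rbln_custom_ops.linear
        try:
            yield
        finally:
            torch.nn.functional.linear = original

One nit in the diff as written: original_linear is assigned inside the try, so the finally relies on that first statement never raising; hoisting the assignment above the try, as in the sketch, makes the restore unconditional.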
@@ -575,11 +578,41 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         nbits_per_param: int,
         n_model_params: int,
     ) -> int:
+        """
+        We are finding max_n_blocks(x) that satisfies the following equation:
+
+        available_dram - kernel_size - buffer
+        - num_layers * 2 * tensor_parallel_size
+        * align_2MB(
+            x
+            * block_size
+            * align_64(head_dim)
+            * math.ceil(num_key_value_heads / tensor_parallel_size)
+            * 2
+        ) > 0
+
+        This inequality can be rewritten as follows:
+
+        a - c * align_2MB(b * x) > 0
+        where
+            a = available_dram - kernel_size - buffer
+            b = block_size * align_64(head_dim) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
+            c = num_layers * 2 * tensor_parallel_size
+
+        We can rewrite the inequality as follows:
+        k > align_2MB(b*x)
+        where
+            k = a / c
+
+        After that, we can derive the following equation:
+        x = floor(2**21 / b * floor((k - 1) / 2**21))
+        """
+
         def align(x: int, nbytes: int) -> int:
             return int(math.ceil(x / nbytes) * nbytes)
 
         def align_2MB(x: int) -> int:
-            return align(x, 2 * 1024 * 1024)
+            return align(x, 2**21)
 
         num_attention_heads = getattr(config, "n_head", None) or getattr(config, "num_attention_heads")
         num_layers = getattr(config, "n_layer", None) or getattr(config, "num_hidden_layers")
@@ -609,27 +642,16 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         available_dram -= kernel_size
 
         # TODO: Accurate buffer estimation
-        buffer = 2**30  # 1GB Buffer
-        if tensor_parallel_size <= 4:
-            buffer /= 4
-
+        buffer_per_core = 2**29  # 500MB per npu
+        buffer = buffer_per_core * tensor_parallel_size
         available_dram -= buffer
 
-        # Estimate nbytes per a single kvcache block
-        nbytes_per_block = (
-            align_2MB(
-                kvcache_block_size
-                * head_dim
-                * math.ceil(num_key_value_heads / tensor_parallel_size)  # Shard
-                * 2  # (fp16)
-            )
-            * num_layers
-            * 2  # (k, v)
-            * tensor_parallel_size
-        )
-        n_blocks = available_dram // nbytes_per_block
+        b = kvcache_block_size * align(head_dim, 64) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
+        c = num_layers * 2 * tensor_parallel_size
+        k = available_dram / c
+        max_n_blocks = math.floor(2**21 / b * math.floor((k - 1) / 2**21))
 
-        return n_blocks, nbytes_per_block
+        return max_n_blocks
 
     @classmethod
     def _get_rbln_config(
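Note: the new closed form replaces the old available_dram // nbytes_per_block estimate, which rounded the 2 MiB alignment once per block rather than solving for the largest block count that still fits. The formula from the docstring can be sanity-checked against a brute-force search; a standalone sketch with hypothetical model numbers, not taken from the package:

    import math

    def align(x: int, nbytes: int) -> int:
        return math.ceil(x / nbytes) * nbytes

    def align_2mb(x: int) -> int:
        return align(x, 2**21)

    # Hypothetical Llama-7B-ish shapes, for illustration only.
    kvcache_block_size, head_dim, num_kv_heads = 16, 128, 8
    num_layers, tp = 32, 4
    available_dram = 12 * 2**30  # DRAM left after kernel size and buffer

    b = kvcache_block_size * align(head_dim, 64) * math.ceil(num_kv_heads / tp) * 2
    c = num_layers * 2 * tp
    k = available_dram / c
    closed_form = math.floor(2**21 / b * math.floor((k - 1) / 2**21))

    # Brute force: largest x with available_dram - c * align_2mb(b * x) > 0
    x = 0
    while available_dram - c * align_2mb(b * (x + 1)) > 0:
        x += 1
    assert x == closed_form == 5888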
@@ -686,7 +708,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
 
         rbln_kvcache_num_blocks = (rbln_max_seq_len // rbln_kvcache_block_size) * rbln_batch_size
         if rbln_attn_impl == "flash_attn":
-            max_num_blocks, _ = cls.get_maximum_num_blocks(
+            max_num_blocks = cls.get_maximum_num_blocks(
                 config=model_config,
                 tensor_parallel_size=rbln_kwargs.get("tensor_parallel_size", 1),
                 kvcache_block_size=rbln_kvcache_block_size,
@@ -38,8 +38,8 @@ class RBLNRuntimeEncoder(RBLNPytorchRuntime):
     mandatory_members = ["main_input_name"]
 
     def forward(self, *args: List[torch.Tensor], **kwargs: Dict[str, torch.Tensor]):
-        _ = super().forward(*args, **kwargs)
-        return BaseModelOutput(last_hidden_state=torch.tensor([1.0]))
+        output = super().forward(*args, **kwargs)
+        return BaseModelOutput(last_hidden_state=output)
 
 
 class RBLNRuntimeDecoder(RBLNPytorchRuntime):
@@ -36,19 +36,50 @@ from .t5_architecture import T5Wrapper
 logger = get_logger()
 
 if TYPE_CHECKING:
+    from rebel import Runtime
     from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PreTrainedModel
 
 
 class RBLNRuntimeModel(RBLNPytorchRuntime):
+    def __init__(
+        self,
+        runtime: "Runtime",
+        max_seq_len: int,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(runtime, **kwargs)
+        self.max_seq_len = max_seq_len
+
+    def _prepare_inputs(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: torch.LongTensor,
+    ):
+        input_len = input_ids.shape[-1]
+        pad_len = None
+        if input_len > self.max_seq_len:
+            raise ValueError(f"Error input_len({input_len}) exceed max_seq_len({self.max_seq_len}).")
+        elif input_len < self.max_seq_len and input_len > 0:
+            pad_len = self.max_seq_len - input_len
+            logger.warning(
+                f"Warning: The input was padded with {pad_len} tokens to meet the compiled model's requirements. "
+                "For optimal performance, consider recompiling with a shorter 'rbln_max_seq_len'."
+            )
+            input_ids = torch.nn.functional.pad(input_ids, (0, pad_len))
+            attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_len), value=0)
+
+        return input_ids, attention_mask, pad_len
+
     def forward(
         self,
         input_ids: torch.LongTensor,
-        attention_mask: torch.FloatTensor,
+        attention_mask: torch.LongTensor,
         head_mask: torch.FloatTensor,
         inputs_embeds: torch.FloatTensor,
         **kwargs,
     ):
-        return super().forward(
+        input_ids, attention_mask, pad_len = self._prepare_inputs(input_ids, attention_mask)
+        logits = super().forward(
             input_ids,
             attention_mask,
             head_mask,
@@ -56,6 +87,8 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
             **kwargs,
         )
 
+        return logits[:, :-pad_len, :] if pad_len is not None else logits
+
 
 class T5EncoderWrapper(torch.nn.Module):
     def __init__(self, model: "T5EncoderModel") -> None:
@@ -72,7 +105,8 @@ class RBLNT5EncoderModel(RBLNModel):
     rbln_model_input_names = ["input_ids", "attention_mask"]
 
     def __post_init__(self, **kwargs):
-        self.model = RBLNRuntimeModel(runtime=self.model[0])
+        max_seq_len = self.rbln_config.model_cfg["max_seq_len"]
+        self.model = RBLNRuntimeModel(runtime=self.model[0], max_seq_len=max_seq_len)
 
     @classmethod
     def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
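Note: taken together, the T5 changes make the encoder runtime tolerant of variable-length inputs: _prepare_inputs right-pads input_ids with zeros and attention_mask with 0 (so pad positions are ignored) up to the compiled max_seq_len, and forward slices the pad positions back off the output. A self-contained sketch of that round trip, with a random tensor standing in for the compiled model:

    import torch
    import torch.nn.functional as F

    max_seq_len, hidden = 8, 32
    input_ids = torch.tensor([[101, 7, 9, 11, 102]])  # length 5 < max_seq_len
    attention_mask = torch.ones_like(input_ids)

    pad_len = max_seq_len - input_ids.shape[-1]
    input_ids = F.pad(input_ids, (0, pad_len))                     # pad ids with 0
    attention_mask = F.pad(attention_mask, (0, pad_len), value=0)  # mask out padding

    logits = torch.randn(1, max_seq_len, hidden)  # stand-in for the fixed-shape runtime
    logits = logits[:, :-pad_len, :] if pad_len else logits  # drop pad positions (0 -> no-op)
    assert logits.shape == (1, 5, hidden)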
@@ -0,0 +1,24 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Portions of this software are licensed under the Apache License,
+# Version 2.0. See the NOTICE file distributed with this work for
+# additional information regarding copyright ownership.
+
+# All other portions of this software, including proprietary code,
+# are the intellectual property of Rebellions Inc. and may not be
+# copied, modified, or distributed without prior written permission
+# from Rebellions Inc.
+
+from .modeling_time_series_transformers import RBLNTimeSeriesTransformerForPrediction