optimum-rbln 0.8.1rc0__py3-none-any.whl → 0.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of optimum-rbln might be problematic (the link to further details was not preserved in this text export).

Files changed (120)
  1. optimum/rbln/__init__.py +58 -9
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +24 -5
  4. optimum/rbln/diffusers/configurations/models/__init__.py +1 -1
  5. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +2 -2
  6. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +5 -3
  7. optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +2 -2
  8. optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +2 -2
  9. optimum/rbln/diffusers/configurations/models/{configuration_cosmos_transformer.py → configuration_transformer_cosmos.py} +7 -2
  10. optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +2 -2
  11. optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +2 -2
  12. optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +2 -2
  13. optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +3 -3
  14. optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +10 -6
  15. optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +4 -4
  16. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +2 -2
  17. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +2 -2
  18. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +2 -2
  19. optimum/rbln/diffusers/modeling_diffusers.py +4 -5
  20. optimum/rbln/diffusers/models/__init__.py +3 -13
  21. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +1 -0
  22. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1 -0
  23. optimum/rbln/diffusers/models/autoencoders/vq_model.py +1 -0
  24. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +1 -1
  25. optimum/rbln/diffusers/pipelines/__init__.py +1 -5
  26. optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +12 -4
  27. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +4 -26
  28. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +2 -2
  29. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +2 -2
  30. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -6
  31. optimum/rbln/modeling.py +4 -5
  32. optimum/rbln/modeling_base.py +18 -14
  33. optimum/rbln/ops/kv_cache_update.py +5 -0
  34. optimum/rbln/ops/linear.py +7 -0
  35. optimum/rbln/transformers/__init__.py +60 -0
  36. optimum/rbln/transformers/configuration_generic.py +4 -4
  37. optimum/rbln/transformers/modeling_attention_utils.py +252 -0
  38. optimum/rbln/transformers/modeling_generic.py +1 -4
  39. optimum/rbln/transformers/models/__init__.py +45 -30
  40. optimum/rbln/transformers/models/bart/bart_architecture.py +2 -7
  41. optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +2 -2
  42. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +1 -5
  43. optimum/rbln/transformers/models/clip/configuration_clip.py +14 -3
  44. optimum/rbln/transformers/models/clip/modeling_clip.py +123 -28
  45. optimum/rbln/transformers/models/colpali/colpali_architecture.py +1 -4
  46. optimum/rbln/transformers/models/colpali/configuration_colpali.py +2 -2
  47. optimum/rbln/transformers/models/colpali/modeling_colpali.py +2 -10
  48. optimum/rbln/transformers/models/decoderonly/__init__.py +2 -2
  49. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +214 -45
  50. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +323 -454
  51. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +579 -362
  52. optimum/rbln/transformers/models/exaone/exaone_architecture.py +17 -42
  53. optimum/rbln/transformers/models/gemma/__init__.py +2 -2
  54. optimum/rbln/transformers/models/gemma/configuration_gemma.py +9 -1
  55. optimum/rbln/transformers/models/gemma/gemma_architecture.py +3 -44
  56. optimum/rbln/transformers/models/gemma/modeling_gemma.py +22 -1
  57. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +21 -9
  58. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +9 -63
  59. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +200 -292
  60. optimum/rbln/transformers/models/gpt2/__init__.py +2 -2
  61. optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +31 -3
  62. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +19 -24
  63. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +18 -1
  64. optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +2 -2
  65. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -9
  66. optimum/rbln/transformers/models/llama/__init__.py +2 -2
  67. optimum/rbln/transformers/models/llama/configuration_llama.py +9 -1
  68. optimum/rbln/transformers/models/llama/modeling_llama.py +22 -1
  69. optimum/rbln/transformers/models/llava/__init__.py +16 -0
  70. optimum/rbln/transformers/models/llava/configuration_llava.py +54 -0
  71. optimum/rbln/transformers/models/llava/modeling_llava.py +419 -0
  72. optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +20 -3
  73. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +6 -16
  74. optimum/rbln/transformers/models/midm/midm_architecture.py +14 -22
  75. optimum/rbln/transformers/models/mistral/__init__.py +2 -2
  76. optimum/rbln/transformers/models/mistral/configuration_mistral.py +9 -1
  77. optimum/rbln/transformers/models/mistral/mistral_architecture.py +1 -1
  78. optimum/rbln/transformers/models/mistral/modeling_mistral.py +26 -3
  79. optimum/rbln/transformers/models/opt/__init__.py +2 -2
  80. optimum/rbln/transformers/models/opt/configuration_opt.py +8 -1
  81. optimum/rbln/transformers/models/opt/modeling_opt.py +41 -1
  82. optimum/rbln/transformers/models/opt/opt_architecture.py +16 -25
  83. optimum/rbln/transformers/models/pegasus/__init__.py +17 -0
  84. optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +34 -0
  85. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +69 -0
  86. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +161 -0
  87. optimum/rbln/transformers/models/phi/__init__.py +2 -2
  88. optimum/rbln/transformers/models/phi/configuration_phi.py +9 -1
  89. optimum/rbln/transformers/models/phi/modeling_phi.py +10 -1
  90. optimum/rbln/transformers/models/phi/phi_architecture.py +16 -22
  91. optimum/rbln/transformers/models/pixtral/__init__.py +16 -0
  92. optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +43 -0
  93. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +315 -0
  94. optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +73 -0
  95. optimum/rbln/transformers/models/qwen2/__init__.py +2 -2
  96. optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +9 -1
  97. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +27 -1
  98. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +3 -3
  99. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -15
  100. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +1 -4
  101. optimum/rbln/transformers/models/qwen3/__init__.py +16 -0
  102. optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +71 -0
  103. optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +133 -0
  104. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +31 -0
  105. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -12
  106. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +3 -1
  107. optimum/rbln/transformers/models/siglip/__init__.py +2 -6
  108. optimum/rbln/transformers/models/siglip/modeling_siglip.py +2 -2
  109. optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +2 -2
  110. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -5
  111. optimum/rbln/transformers/models/whisper/configuration_whisper.py +3 -12
  112. optimum/rbln/transformers/models/whisper/modeling_whisper.py +8 -2
  113. optimum/rbln/transformers/models/xlm_roberta/__init__.py +2 -8
  114. optimum/rbln/utils/depreacate_utils.py +16 -0
  115. optimum/rbln/utils/hub.py +8 -47
  116. optimum/rbln/utils/runtime_utils.py +31 -5
  117. {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/METADATA +1 -1
  118. {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/RECORD +120 -103
  119. {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/WHEEL +0 -0
  120. {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,133 @@
1
+ # Copyright 2025 Rebellions Inc. All rights reserved.
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at:
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import TYPE_CHECKING
16
+
17
+ from transformers import PretrainedConfig
18
+
19
+ from ....utils import logging
20
+ from ...models.decoderonly import (
21
+ RBLNDecoderOnlyModel,
22
+ RBLNDecoderOnlyModelForCausalLM,
23
+ RBLNDecoderOnlyModelForCausalLMConfig,
24
+ )
25
+ from .qwen3_architecture import Qwen3Wrapper
26
+
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+ if TYPE_CHECKING:
31
+ from transformers import PretrainedConfig
32
+
33
+
34
+ class RBLNQwen3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
35
+ """
36
+ The Qwen3 Model transformer with a language modeling head (linear layer) on top.
37
+ This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
38
+ A class to convert and run pre-trained transformers based Qwen3ForCausalLM model on RBLN devices.
39
+ It implements the methods to convert a pre-trained transformers Qwen3ForCausalLM model into a RBLN transformer model by:
40
+ - transferring the checkpoint weights of the original into an optimized RBLN graph,
41
+ - compiling the resulting graph using the RBLN compiler.
42
+ **Configuration:**
43
+ This model uses [`RBLNQwen3ForCausalLMConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
44
+ the `rbln_config` parameter should be an instance of [`RBLNQwen3ForCausalLMConfig`] or a dictionary conforming to its structure.
45
+ See the [`RBLNQwen3ForCausalLMConfig`] class for all available configuration options.
46
+ Examples:
47
+ ```python
48
+ from optimum.rbln import RBLNQwen3ForCausalLM
49
+ # Simple usage using rbln_* arguments
50
+ # `max_seq_len` is automatically inferred from the model config
51
+ model = RBLNQwen3ForCausalLM.from_pretrained(
52
+ "Qwen/Qwen3-4B",
53
+ export=True,
54
+ rbln_batch_size=1,
55
+ rbln_tensor_parallel_size=4,
56
+ )
57
+ # Using a config dictionary
58
+ rbln_config = {
59
+ "batch_size": 1,
60
+ "max_seq_len": 40_960,
61
+ "tensor_parallel_size": 4,
62
+ "kvcache_partition_len": 8192,
63
+ }
64
+ model = RBLNQwen3ForCausalLM.from_pretrained(
65
+ "Qwen/Qwen3-4B",
66
+ export=True,
67
+ rbln_config=rbln_config
68
+ )
69
+ # Using a RBLNQwen3ForCausalLMConfig instance (recommended for type checking)
70
+ from optimum.rbln import RBLNQwen3ForCausalLMConfig
71
+ config = RBLNQwen3ForCausalLMConfig(
72
+ batch_size=1,
73
+ max_seq_len=40_960,
74
+ tensor_parallel_size=4,
75
+ kvcache_partition_len=8192,
76
+ )
77
+ model = RBLNQwen3ForCausalLM.from_pretrained(
78
+ "Qwen/Qwen3-4B",
79
+ export=True,
80
+ rbln_config=config
81
+ )
82
+ ```
83
+ """
84
+
85
+ _decoder_wrapper_cls = Qwen3Wrapper
86
+
87
+ @classmethod
88
+ def _update_sliding_window_config(
89
+ cls, model_config: PretrainedConfig, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig
90
+ ):
91
+ # https://github.com/huggingface/transformers/issues/35896
92
+ # There seems to be a bug in transformers(v4.52.4). Therefore, similar to when attn_implementation is eager,
93
+ # we set all layers to use sliding window in this version. This should be updated once the bug is fixed.
94
+
95
+ rbln_config.cache_impl = "sliding_window"
96
+ rbln_config.sliding_window = model_config.sliding_window
97
+ rbln_config.sliding_window_layers = list(range(model_config.num_hidden_layers))
98
+ return rbln_config
99
+
100
+ def forward(self, *args, **kwargs):
101
+ kwargs["return_dict"] = True
102
+ return super().forward(*args, **kwargs)
103
+
104
+
105
+ class RBLNQwen3Model(RBLNDecoderOnlyModel):
106
+ """
107
+ The bare Qwen3 Model outputting raw hidden-states without any specific head on top.
108
+ This model inherits from [`RBLNDecoderOnlyModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
109
+ A class to convert and run pre-trained transformers based Qwen3Model on RBLN devices.
110
+ It implements the methods to convert a pre-trained transformers Qwen3Model into a RBLN transformer model by:
111
+ - transferring the checkpoint weights of the original into an optimized RBLN graph,
112
+ - compiling the resulting graph using the RBLN compiler.
113
+ **Configuration:**
114
+ This model uses [`RBLNQwen3ModelConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
115
+ the `rbln_config` parameter should be an instance of [`RBLNQwen3ModelConfig`] or a dictionary conforming to its structure.
116
+ See the [`RBLNQwen3ModelConfig`] class for all available configuration options.
117
+ Examples:
118
+ ```python
119
+ from optimum.rbln import RBLNQwen3Model
120
+ # Simple usage using rbln_* arguments
121
+ # `max_seq_len` is automatically inferred from the model config
122
+ model = RBLNQwen3Model.from_pretrained(
123
+ "Qwen/Qwen3-Embedding-4B",
124
+ export=True,
125
+ rbln_batch_size=1,
126
+ rbln_max_seq_len=40_960,
127
+ rbln_tensor_parallel_size=4,
128
+ rbln_kvcache_partition_len=8192,
129
+ )
130
+ """
131
+
132
+ _decoder_wrapper_cls = Qwen3Wrapper
133
+ _use_rotary_emb = True
@@ -0,0 +1,31 @@
1
+ # Copyright 2025 Rebellions Inc. All rights reserved.
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at:
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from ..decoderonly.decoderonly_architecture import DecoderOnlyAttention, DecoderOnlyWrapper
17
+
18
+
19
+ class Qwen3Wrapper(DecoderOnlyWrapper):
20
+ def get_rbln_attn_class(self):
21
+ return Qwen3Attention
22
+
23
+
24
+ class Qwen3Attention(DecoderOnlyAttention):
25
+ def __post_init__(self):
26
+ self.k_proj = self._original_mod.k_proj
27
+ self.v_proj = self._original_mod.v_proj
28
+ self.q_proj = self._original_mod.q_proj
29
+ self.o_proj = self._original_mod.o_proj
30
+ self.q_norm = self._original_mod.q_norm
31
+ self.k_norm = self._original_mod.k_norm
@@ -12,9 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from typing import Any, Dict, Optional
16
-
17
- import rebel
15
+ from typing import Any, Optional
18
16
 
19
17
  from ....configuration_utils import RBLNModelConfig
20
18
  from ....utils.logging import get_logger
@@ -31,7 +29,7 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
31
29
  dec_max_seq_len: Optional[int] = None,
32
30
  use_attention_mask: Optional[bool] = None,
33
31
  pad_token_id: Optional[int] = None,
34
- **kwargs: Dict[str, Any],
32
+ **kwargs: Any,
35
33
  ):
36
34
  """
37
35
  Args:
@@ -39,7 +37,6 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
39
37
  enc_max_seq_len (Optional[int]): Maximum sequence length for the encoder.
40
38
  dec_max_seq_len (Optional[int]): Maximum sequence length for the decoder.
41
39
  use_attention_mask (Optional[bool]): Whether to use attention masks during inference.
42
- This is automatically set to True for RBLN-CA02 devices.
43
40
  pad_token_id (Optional[int]): The ID of the padding token in the vocabulary.
44
41
  **kwargs: Additional arguments passed to the parent RBLNModelConfig.
45
42
 
@@ -55,12 +52,5 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
55
52
  self.dec_max_seq_len = dec_max_seq_len
56
53
 
57
54
  self.use_attention_mask = use_attention_mask
58
- npu = self.npu or rebel.get_npu_name()
59
- if npu == "RBLN-CA02":
60
- if self.use_attention_mask is False:
61
- logger.warning("Attention mask should be used with RBLN-CA02. Setting use_attention_mask to True.")
62
- self.use_attention_mask = True
63
- else:
64
- self.use_attention_mask = self.use_attention_mask or False
65
55
 
66
56
  self.pad_token_id = pad_token_id
@@ -38,7 +38,7 @@ if TYPE_CHECKING:
38
38
  class RBLNRuntimeEncoder(RBLNPytorchRuntime):
39
39
  mandatory_members = ["main_input_name"]
40
40
 
41
- def forward(self, *args: List[torch.Tensor], **kwargs: Dict[str, torch.Tensor]):
41
+ def forward(self, *args: List[torch.Tensor], **kwargs: torch.Tensor):
42
42
  output = super().forward(*args, **kwargs)
43
43
  return BaseModelOutput(last_hidden_state=output)
44
44
 
@@ -327,12 +327,14 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
327
327
  tensor_type="pt",
328
328
  device=rbln_config.device_map["encoder"],
329
329
  activate_profiler=rbln_config.activate_profiler,
330
+ timeout=rbln_config.timeout,
330
331
  ),
331
332
  rebel.Runtime(
332
333
  compiled_models[1],
333
334
  tensor_type="pt",
334
335
  device=rbln_config.device_map["decoder"],
335
336
  activate_profiler=rbln_config.activate_profiler,
337
+ timeout=rbln_config.timeout,
336
338
  ),
337
339
  ]
338
340
 
@@ -12,9 +12,5 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from .configuration_siglip import (
16
- RBLNSiglipVisionModelConfig,
17
- )
18
- from .modeling_siglip import (
19
- RBLNSiglipVisionModel,
20
- )
15
+ from .configuration_siglip import RBLNSiglipVisionModelConfig
16
+ from .modeling_siglip import RBLNSiglipVisionModel
@@ -12,7 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
15
+ from typing import TYPE_CHECKING, Any, Optional, Tuple, Union
16
16
 
17
17
  import torch
18
18
  from transformers import SiglipVisionConfig, SiglipVisionModel
@@ -126,7 +126,7 @@ class RBLNSiglipVisionModel(RBLNModel):
126
126
  output_attentions: bool = None,
127
127
  output_hidden_states: bool = None,
128
128
  interpolate_pos_encoding: bool = False,
129
- **kwargs: Dict[str, Any],
129
+ **kwargs: Any,
130
130
  ) -> Union[Tuple, BaseModelOutputWithPooling]:
131
131
  if len(kwargs) > 0 and any(value is not None for value in kwargs.values()):
132
132
  logger.warning(
@@ -1,4 +1,4 @@
1
- from typing import Any, Dict, Optional
1
+ from typing import Any, Optional
2
2
 
3
3
  from ....configuration_utils import RBLNModelConfig
4
4
 
@@ -17,7 +17,7 @@ class RBLNTimeSeriesTransformerForPredictionConfig(RBLNModelConfig):
17
17
  enc_max_seq_len: Optional[int] = None,
18
18
  dec_max_seq_len: Optional[int] = None,
19
19
  num_parallel_samples: Optional[int] = None,
20
- **kwargs: Dict[str, Any],
20
+ **kwargs: Any,
21
21
  ):
22
22
  """
23
23
  Args:
@@ -30,11 +30,7 @@ from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, Union
30
30
  import rebel
31
31
  import torch
32
32
  from rebel.compile_context import CompileContext
33
- from transformers import (
34
- PretrainedConfig,
35
- TimeSeriesTransformerForPrediction,
36
- TimeSeriesTransformerModel,
37
- )
33
+ from transformers import PretrainedConfig, TimeSeriesTransformerForPrediction, TimeSeriesTransformerModel
38
34
  from transformers.modeling_outputs import ModelOutput, SampleTSPredictionOutput, Seq2SeqTSModelOutput
39
35
  from transformers.modeling_utils import no_init_weights
40
36
 
@@ -331,12 +327,14 @@ class RBLNTimeSeriesTransformerForPrediction(RBLNModel):
331
327
  tensor_type="pt",
332
328
  device=rbln_config.device_map["encoder"],
333
329
  activate_profiler=rbln_config.activate_profiler,
330
+ timeout=rbln_config.timeout,
334
331
  ),
335
332
  rebel.Runtime(
336
333
  compiled_models[1],
337
334
  tensor_type="pt",
338
335
  device=rbln_config.device_map["decoder"],
339
336
  activate_profiler=rbln_config.activate_profiler,
337
+ timeout=rbln_config.timeout,
340
338
  ),
341
339
  ]
342
340
 
@@ -12,9 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from typing import Any, Dict
16
-
17
- import rebel
15
+ from typing import Any
18
16
 
19
17
  from ....configuration_utils import RBLNModelConfig
20
18
  from ....utils.logging import get_logger
@@ -38,14 +36,13 @@ class RBLNWhisperForConditionalGenerationConfig(RBLNModelConfig):
38
36
  use_attention_mask: bool = None,
39
37
  enc_max_seq_len: int = None,
40
38
  dec_max_seq_len: int = None,
41
- **kwargs: Dict[str, Any],
39
+ **kwargs: Any,
42
40
  ):
43
41
  """
44
42
  Args:
45
43
  batch_size (int, optional): The batch size for inference. Defaults to 1.
46
44
  token_timestamps (bool, optional): Whether to output token timestamps during generation. Defaults to False.
47
45
  use_attention_mask (bool, optional): Whether to use attention masks during inference. This is automatically
48
- set to True for RBLN-CA02 devices.
49
46
  enc_max_seq_len (int, optional): Maximum sequence length for the encoder.
50
47
  dec_max_seq_len (int, optional): Maximum sequence length for the decoder.
51
48
  **kwargs: Additional arguments passed to the parent RBLNModelConfig.
@@ -64,10 +61,4 @@ class RBLNWhisperForConditionalGenerationConfig(RBLNModelConfig):
64
61
  self.dec_max_seq_len = dec_max_seq_len
65
62
 
66
63
  self.use_attention_mask = use_attention_mask
67
- npu = self.npu or rebel.get_npu_name()
68
- if npu == "RBLN-CA02":
69
- if self.use_attention_mask is False:
70
- logger.warning("Attention mask should be used with RBLN-CA02. Setting use_attention_mask to True.")
71
- self.use_attention_mask = True
72
- else:
73
- self.use_attention_mask = self.use_attention_mask or False
64
+ self.use_attention_mask = self.use_attention_mask or False
@@ -46,7 +46,7 @@ if TYPE_CHECKING:
46
46
  class RBLNRuntimeEncoder(RBLNPytorchRuntime):
47
47
  mandatory_members = ["main_input_name"]
48
48
 
49
- def forward(self, *args: List[torch.Tensor], **kwargs: Dict[str, torch.Tensor]):
49
+ def forward(self, *args: List[torch.Tensor], **kwargs: torch.Tensor):
50
50
  output = super().forward(*args, **kwargs)
51
51
  return BaseModelOutput(last_hidden_state=output)
52
52
 
@@ -73,6 +73,7 @@ class RBLNRuntimeDecoder(RBLNPytorchRuntime):
73
73
  decoder_input_ids: torch.Tensor = None,
74
74
  decoder_attention_mask: torch.Tensor = None,
75
75
  cache_position: torch.Tensor = None,
76
+ block_tables: torch.Tensor = None,
76
77
  ):
77
78
  inputs_bsz = decoder_input_ids.shape[0]
78
79
  padded_bsz = self.batch_size - inputs_bsz
@@ -89,11 +90,14 @@ class RBLNRuntimeDecoder(RBLNPytorchRuntime):
89
90
  )
90
91
  decoder_attention_mask[b_idx, : decoding_step + 1] = 1
91
92
 
93
+ if block_tables is None:
94
+ block_tables = self.default_block_tables
95
+
92
96
  outputs = super().forward(
93
97
  decoder_input_ids,
94
98
  decoder_attention_mask if self.use_attention_mask else None,
95
99
  cache_position,
96
- block_tables=self.default_block_tables,
100
+ block_tables=block_tables,
97
101
  )
98
102
 
99
103
  if isinstance(outputs, torch.Tensor):
@@ -345,12 +349,14 @@ class RBLNWhisperForConditionalGeneration(RBLNModel, RBLNWhisperGenerationMixin)
345
349
  tensor_type="pt",
346
350
  device=rbln_config.device_map["encoder"],
347
351
  activate_profiler=rbln_config.activate_profiler,
352
+ timeout=rbln_config.timeout,
348
353
  ),
349
354
  rebel.Runtime(
350
355
  compiled_models[1],
351
356
  tensor_type="pt",
352
357
  device=rbln_config.device_map["decoder"],
353
358
  activate_profiler=rbln_config.activate_profiler,
359
+ timeout=rbln_config.timeout,
354
360
  ),
355
361
  ]
356
362
 
@@ -12,14 +12,8 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from .configuration_xlm_roberta import (
16
- RBLNXLMRobertaForSequenceClassificationConfig,
17
- RBLNXLMRobertaModelConfig,
18
- )
19
- from .modeling_xlm_roberta import (
20
- RBLNXLMRobertaForSequenceClassification,
21
- RBLNXLMRobertaModel,
22
- )
15
+ from .configuration_xlm_roberta import RBLNXLMRobertaForSequenceClassificationConfig, RBLNXLMRobertaModelConfig
16
+ from .modeling_xlm_roberta import RBLNXLMRobertaForSequenceClassification, RBLNXLMRobertaModel
23
17
 
24
18
 
25
19
  __all__ = [
@@ -0,0 +1,16 @@
1
+ from typing import Optional
2
+
3
+ import rebel
4
+
5
+ from .logging import get_logger
6
+
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
+ def warn_deprecated_npu(npu: Optional[str] = None):
12
+ npu = npu or rebel.get_npu_name()
13
+ if npu == "RBLN-CA02":
14
+ logger.warning_once(
15
+ "Support for the RBLN-CA02 device is provided only up to optimum-rbln v0.8.0 and has reached end of life.",
16
+ )
optimum/rbln/utils/hub.py CHANGED
@@ -12,59 +12,23 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- import os
16
15
  from pathlib import Path
17
16
  from typing import List, Optional, Union
18
17
 
19
- from huggingface_hub import HfApi, HfFolder, hf_hub_download
20
-
21
-
22
- class PushToHubMixin:
23
- def push_to_hub(
24
- self,
25
- save_directory: str,
26
- repository_id: str,
27
- private: Optional[bool] = None,
28
- use_auth_token: Union[bool, str] = True,
29
- ) -> str:
30
- huggingface_token = _get_huggingface_token(use_auth_token)
31
- api = HfApi()
32
-
33
- api.create_repo(
34
- token=huggingface_token,
35
- repo_id=repository_id,
36
- exist_ok=True,
37
- private=private,
38
- )
39
- for path, subdirs, files in os.walk(save_directory):
40
- for name in files:
41
- local_file_path = os.path.join(path, name)
42
- _, hub_file_path = os.path.split(local_file_path)
43
- # FIXME: when huggingface_hub fixes the return of upload_file
44
- try:
45
- api.upload_file(
46
- token=huggingface_token,
47
- repo_id=f"{repository_id}",
48
- path_or_fileobj=os.path.join(os.getcwd(), local_file_path),
49
- path_in_repo=hub_file_path,
50
- )
51
- except KeyError:
52
- pass
53
- except NameError:
54
- pass
18
+ from huggingface_hub import HfApi, get_token, hf_hub_download
55
19
 
56
20
 
57
21
  def pull_compiled_model_from_hub(
58
22
  model_id: Union[str, Path],
59
23
  subfolder: str,
60
- use_auth_token: Optional[Union[bool, str]],
24
+ token: Union[bool, str],
61
25
  revision: Optional[str],
62
26
  cache_dir: Optional[str],
63
27
  force_download: bool,
64
28
  local_files_only: bool,
65
29
  ) -> Path:
66
30
  """Pull model files from the HuggingFace Hub."""
67
- huggingface_token = _get_huggingface_token(use_auth_token)
31
+ huggingface_token = _get_huggingface_token(token)
68
32
  repo_files = list(
69
33
  map(
70
34
  Path,
@@ -87,7 +51,7 @@ def pull_compiled_model_from_hub(
87
51
  repo_id=model_id,
88
52
  filename=filename,
89
53
  subfolder=subfolder,
90
- use_auth_token=use_auth_token,
54
+ token=token,
91
55
  revision=revision,
92
56
  cache_dir=cache_dir,
93
57
  force_download=force_download,
@@ -113,10 +77,7 @@ def validate_files(
113
77
  raise FileExistsError(f"Multiple rbln_config.json files found in {location}. This is not expected.")
114
78
 
115
79
 
116
- def _get_huggingface_token(use_auth_token: Union[bool, str]) -> str:
117
- if isinstance(use_auth_token, str):
118
- return use_auth_token
119
- elif use_auth_token:
120
- return HfFolder.get_token()
121
- else:
122
- raise ValueError("`use_auth_token` must be provided to interact with the HuggingFace Hub.")
80
+ def _get_huggingface_token(token: Union[bool, str]) -> str:
81
+ if isinstance(token, str):
82
+ return token
83
+ return get_token()
@@ -12,13 +12,29 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ import re
15
16
  import threading
16
- from typing import Any, Dict, List, Optional, Union
17
+ from typing import Any, List, Optional, Union
17
18
 
18
19
  import rebel
19
20
  import torch
20
21
 
21
22
 
23
+ def normalize_npu(npu: str) -> str:
24
+ """Normalize the NPU string by removing the form factor."""
25
+ match = re.match(r"(RBLN-CA|RBLN-CR)(\d+)", npu)
26
+ if match:
27
+ prefix, num = match.groups()
28
+ if len(num) == 1:
29
+ # Convert "RBLN-CAx" → "RBLN-CA0"
30
+ # (e.g., "RBLN-CA2" -> "RBLN-CA0")
31
+ npu = f"{prefix}0"
32
+ elif len(num) == 2:
33
+ # Strip form factor (e.g., "RBLN-CA15" → "RBLN-CA1")
34
+ npu = f"{prefix}{num[:-1]}"
35
+ return npu
36
+
37
+
22
38
  def tp_and_devices_are_ok(
23
39
  tensor_parallel_size: Optional[int] = None,
24
40
  device: Optional[Union[int, List[int]]] = None,
@@ -58,7 +74,7 @@ def tp_and_devices_are_ok(
58
74
  if npu is not None:
59
75
  for device_id in device:
60
76
  npu_name = rebel.get_npu_name(device_id)
61
- if npu_name != npu:
77
+ if normalize_npu(npu_name) != normalize_npu(npu):
62
78
  return f"Device {device_id} ({npu_name}) is not on the same NPU as {npu}."
63
79
 
64
80
  return None
@@ -78,7 +94,7 @@ class RBLNPytorchRuntime:
78
94
  def __call__(self, *args: Any, **kwds: Any) -> Any:
79
95
  return self.forward(*args, **kwds)
80
96
 
81
- def forward(self, *args: List["torch.Tensor"], **kwargs: Dict[str, "torch.Tensor"]):
97
+ def forward(self, *args: List["torch.Tensor"], **kwargs: "torch.Tensor"):
82
98
  # filtering useless args or kwarg such as None.
83
99
  args = list(filter(lambda arg: isinstance(arg, torch.Tensor), args))
84
100
  kwargs = dict(filter(lambda kwarg: isinstance(kwarg[1], torch.Tensor) or kwarg[0] == "out", kwargs.items()))
@@ -126,7 +142,7 @@ class UnavailableRuntime:
126
142
  """Returns an iterator with self as the only item."""
127
143
  return iter([self])
128
144
 
129
- def forward(self, *args: List["torch.Tensor"], **kwargs: Dict[str, "torch.Tensor"]):
145
+ def forward(self, *args: List["torch.Tensor"], **kwargs: "torch.Tensor"):
130
146
  """Raises a detailed RuntimeError explaining why inference cannot be performed."""
131
147
  raise RuntimeError(
132
148
  "Cannot perform inference: RBLN runtime is not available.\n\n"
@@ -147,13 +163,20 @@ class ContextRblnConfig:
147
163
  _local = threading.local()
148
164
 
149
165
  def __init__(
150
- self, device=None, device_map=None, create_runtimes=None, optimize_host_mem=None, activate_profiler=None
166
+ self,
167
+ device=None,
168
+ device_map=None,
169
+ create_runtimes=None,
170
+ optimize_host_mem=None,
171
+ activate_profiler=None,
172
+ timeout=None,
151
173
  ):
152
174
  self.device = device
153
175
  self.device_map = device_map
154
176
  self.create_runtimes = create_runtimes
155
177
  self.optimize_host_mem = optimize_host_mem
156
178
  self.activate_profiler = activate_profiler
179
+ self.timeout = timeout
157
180
 
158
181
  def __enter__(self):
159
182
  self._local.device = self.device
@@ -161,6 +184,7 @@ class ContextRblnConfig:
161
184
  self._local.create_runtimes = self.create_runtimes
162
185
  self._local.optimize_host_memory = self.optimize_host_mem
163
186
  self._local.activate_profiler = self.activate_profiler
187
+ self._local.timeout = self.timeout
164
188
  return self
165
189
 
166
190
  def __exit__(self, exc_type, exc_val, exc_tb):
@@ -169,6 +193,7 @@ class ContextRblnConfig:
169
193
  self._local.create_runtimes = None
170
194
  self._local.optimize_host_memory = None
171
195
  self._local.activate_profiler = None
196
+ self._local.timeout = None
172
197
 
173
198
  @classmethod
174
199
  def get_current_context(cls):
@@ -178,4 +203,5 @@ class ContextRblnConfig:
178
203
  "create_runtimes": getattr(cls._local, "create_runtimes", None),
179
204
  "optimize_host_memory": getattr(cls._local, "optimize_host_memory", None),
180
205
  "activate_profiler": getattr(cls._local, "activate_profiler", None),
206
+ "timeout": getattr(cls._local, "timeout", None),
181
207
  }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: optimum-rbln
3
- Version: 0.8.1rc0
3
+ Version: 0.8.2
4
4
  Summary: Optimum RBLN is the interface between the HuggingFace Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
5
5
  Project-URL: Homepage, https://rebellions.ai
6
6
  Project-URL: Documentation, https://docs.rbln.ai